From 67b4fad682954e874ce6755738c0aa1c6c727f54 Mon Sep 17 00:00:00 2001 From: Vraj Date: Tue, 3 Dec 2024 18:38:54 -0800 Subject: [PATCH 1/4] add completeness judge --- examples/evaluate_completeness_judge.py | 67 +++++++++++++++++++ .../llm_as_judge/conversation_completeness.py | 44 ++++++++++++ .../judges/completeness/v5.py | 18 +++++ 3 files changed, 129 insertions(+) create mode 100644 examples/evaluate_completeness_judge.py create mode 100644 prepare/metrics/llm_as_judge/conversation_completeness.py create mode 100644 prepare/templates/response_assessment/judges/completeness/v5.py diff --git a/examples/evaluate_completeness_judge.py b/examples/evaluate_completeness_judge.py new file mode 100644 index 0000000000..3bfdd43145 --- /dev/null +++ b/examples/evaluate_completeness_judge.py @@ -0,0 +1,67 @@ +from unitxt import get_logger +from unitxt.api import evaluate, load_dataset +from unitxt.blocks import Task, TaskCard +from unitxt.loaders import LoadFromDictionary +from unitxt.templates import InputOutputTemplate, TemplatesDict +from unitxt.text_utils import print_dict + +logger = get_logger() + +data = { + "test": [ + { + "conversation": "user: is eugene from walking dead really a scientist\nagent: No, Eugene from The Walking Dead is not a real scientist. He lied about being a scientist to manipulate the other survivors into taking him to Washington D.C., believing it to be the best chance for survival. In reality, he is a high school science teacher who does not know how to cure the virus. However, he is shown to be highly intelligent and resourceful, able to do things such as repairing the solar power system in Alexandria and making bullets for the Saviors.\nuser: Why did he lied about himself being a scientist?\nagent: Eugene lied about being a scientist to manipulate the other survivors into taking him to Washington D.C., believing it to be the best chance for survival. He initially used this lie to gain the protection of Abraham Ford. 
He assists the group with technical tasks even after the truth is revealed, which also improves his own confidence. His lie caused tension within the group, particularly between him and Abraham, but eventually Abraham forgave him and they resumed their friendship.\nuser: Is The Walking Dead currently shown on TV? Which station?", + "document": "The Walking Dead (season 6)\nOn October 7 , 2014 , AMC renewed The Walking Dead for a sixth season . Scott M. Gimple said that the sixth season would continue to remix material from the comic and explained that there would be a flashback backstory to some of the characters :\n\nThe Walking Dead (season 6)\nThe sixth season of The Walking Dead , an American post-apocalyptic horror television series on AMC , premiered on October 11 , 2015 , and concluded on April 3 , 2016 , consisting of 16 episodes . Developed for television by Frank Darabont , the series is based on the eponymous series of comic books by Robert Kirkman , Tony Moore , and Charlie Adlard . The executive producers are Kirkman , David Alpert , Scott M. Gimple , Greg Nicotero , Tom Luse , and Gale Anne Hurd , with Gimple as showrunner for the third consecutive season . The sixth season received positive reviews by critics . It was nominated for multiple awards and won three , including Best Horror Television Series , at the 42nd Saturn Awards .", + }, + { + "conversation": "user: is eugene from walking dead really a scientist\nagent: No, Eugene from The Walking Dead is not a real scientist. He lied about being a scientist to manipulate the other survivors into taking him to Washington D.C., believing it to be the best chance for survival. In reality, he is a high school science teacher who does not know how to cure the virus. 
However, he is shown to be highly intelligent and resourceful, able to do things such as repairing the solar power system in Alexandria and making bullets for the Saviors.\nuser: Why did he lied about himself being a scientist?\nagent: Eugene lied about being a scientist to manipulate the other survivors into taking him to Washington D.C., believing it to be the best chance for survival. He initially used this lie to gain the protection of Abraham Ford. He assists the group with technical tasks even after the truth is revealed, which also improves his own confidence. His lie caused tension within the group, particularly between him and Abraham, but eventually Abraham forgave him and they resumed their friendship.\nuser: Is The Walking Dead currently shown on TV? Which station?", + "document": "The Walking Dead (season 6)\nOn October 7 , 2014 , AMC renewed The Walking Dead for a sixth season . Scott M. Gimple said that the sixth season would continue to remix material from the comic and explained that there would be a flashback backstory to some of the characters :\n\nThe Walking Dead (season 6)\nThe sixth season of The Walking Dead , an American post-apocalyptic horror television series on AMC , premiered on October 11 , 2015 , and concluded on April 3 , 2016 , consisting of 16 episodes . Developed for television by Frank Darabont , the series is based on the eponymous series of comic books by Robert Kirkman , Tony Moore , and Charlie Adlard . The executive producers are Kirkman , David Alpert , Scott M. Gimple , Greg Nicotero , Tom Luse , and Gale Anne Hurd , with Gimple as showrunner for the third consecutive season . The sixth season received positive reviews by critics . It was nominated for multiple awards and won three , including Best Horror Television Series , at the 42nd Saturn Awards .", + }, + { + "conversation": "user: is eugene from walking dead really a scientist\nagent: No, Eugene from The Walking Dead is not a real scientist. 
He lied about being a scientist to manipulate the other survivors into taking him to Washington D.C., believing it to be the best chance for survival. In reality, he is a high school science teacher who does not know how to cure the virus. However, he is shown to be highly intelligent and resourceful, able to do things such as repairing the solar power system in Alexandria and making bullets for the Saviors.\nuser: Why did he lied about himself being a scientist?", + "document": "Eugene Porter\nDr. Eugene Porter is a fictional character from the comic book series The Walking Dead and is portrayed by Josh McDermitt beginning in the fourth season of the American television series of the same name . In both mediums he claims to be a scientist who knows the cure to the zombie plague and is being escorted to Washington D.C. by Sgt . Abraham Ford and Rosita Espinosa , and encounter Rick Grimes and his group and recruit them to assist their mission . Eugene is overweight , possesses virtually no survival skills of his own and is extremely dependent on the group , but is highly intelligent and resourceful in using technology to ensure the group 's survival . Eventually it is revealed that Eugene is not a scientist , but a high school science teacher , and that he does n't know how to cure the virus but lied to manipulate the other survivors into taking him to Washington D.C. believing it to be the best chance for survival . This proves true as the group eventually finds the Alexandria Safe - Zone where Eugene becomes its primary engineer . Though his lie puts a strain on their friendship , Abraham eventually forgives him and they resume being friends .\n\nList of The Walking Dead (TV series) characters\nDr. Eugene Porter , portrayed by Josh McDermitt , is a survivor of the apocalypse and a member of Abraham Ford 's survivor group and later a minor antagonist after he unwillingly begins working for the Saviors . He comes across as timid and socially awkward . 
Initially when encountered by Rick 's group , he appears to be a scientist that knows of a cure to the outbreak located in Washington , D.C. , with Abraham and Rosita assisting him . Later , Eugene is forced to reveal that he lied to Abraham to gain his protection back when they first met , leading Abraham to punch him unconscious and creating a rift in their friendship . When the group makes it to Alexandria , Eugene assists the community with his technical knowledge to repair their solar power system . This helps to boost his confidence to learn how to defend himself , reconcile his friendship with Abraham , and help the community find a way to make more bullets . When Negan discovers that Eugene makes bullets , Eugene is taken to work for the Saviors . Although Eugene accepts the offer to become one of the Saviors , he still helps Sasha kill herself but lies about it to Negan . He is an important player for the Saviors in the All Out War arc , but is tormented with conflicting loyalties , especially after he discovers that Dwight is playing both sides .", + }, + ] +} + +predictions = [ + "No, The Walking Dead is not currently shown on TV. The final episode of the series aired on April 17, 2022, on AMC. The show has completed its 11-season run. However, if you're interested in watching previous seasons, they are available to stream on various platforms such as AMC+, Hulu, and Peacock. Additionally, you can find The Walking Dead merchandise and spin-off series on these platforms and other marketplaces.", + "Yes, The Walking Dead is currently being shown on TV. It airs on AMC, an American basic cable network. AMV airs new episodes of The Walking Dead as well as reruns of previous seasons.", + "Eugene lied about being a scientist to manipulate the other survivors into taking him to Washington D.C., believing it to be the best chance for survival. He initially used this lie to gain the protection of Abraham Ford. 
He assists the group with technical tasks even after the truth is revealed, which also improves his own confidence. His lie caused tension within the group, particularly between him and Abraham, but eventually Abraham forgave him and they resumed their friendship.", +] + + +card = TaskCard( + loader=LoadFromDictionary(data=data), + task=Task( + input_fields={"conversation": "str", "document": "str"}, + reference_fields={}, + prediction_type="str", + metrics=[ + "metrics.llm_as_judge.conversation_answer_completeness.llama3_1_v1_ibmgenai_judges" + ], + ), + templates=TemplatesDict( + { + "simple": InputOutputTemplate( + input_format="{conversation}\n\n\nDocument:\n{document}", + output_format="", + ) + } + ), +) + +test_dataset = load_dataset(card=card, template_card_index="simple")["test"] +evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + +for instance in evaluated_dataset: + print_dict( + instance, + keys_to_print=[ + "source", + "prediction", + "processed_prediction", + "references", + "score", + ], + ) diff --git a/prepare/metrics/llm_as_judge/conversation_completeness.py b/prepare/metrics/llm_as_judge/conversation_completeness.py new file mode 100644 index 0000000000..7362342d0b --- /dev/null +++ b/prepare/metrics/llm_as_judge/conversation_completeness.py @@ -0,0 +1,44 @@ +from unitxt import add_to_catalog +from unitxt.inference import ( + GenericInferenceEngine, + IbmGenAiInferenceEngine, + IbmGenAiInferenceEngineParams, +) +from unitxt.llm_as_judge import LLMAsJudge + +template_name = "templates.response_assessment.judges.completeness.v5" + +inference_models = { + "llama3_1_v1_ibmgenai": { + "model_name": "llama3-1-70binstruct", + "inference_model": IbmGenAiInferenceEngine( + model_name="meta-llama/llama-3-1-70b-instruct", + parameters=IbmGenAiInferenceEngineParams(max_new_tokens=256), + ), + }, + "generic_inference_engine": { + "model_name": "generic", + "inference_model": (GenericInferenceEngine()), + }, +} + +for label, 
inference_model in inference_models.items(): + model_label = inference_model["model_name"] + template_label = template_name.split(".")[-1] + metric_label = ( + "metrics.llm_as_judge.rating." + model_label + "_template_" + template_label + ) + + cur_metric = LLMAsJudge( + inference_model=inference_model["inference_model"], + template=template_name, + task="rating.single_turn", + main_score=metric_label, + prediction_type="str", + ) + + add_to_catalog( + cur_metric, + f"metrics.llm_as_judge.conversation_answer_completeness.{label}_judges", + overwrite=True, + ) diff --git a/prepare/templates/response_assessment/judges/completeness/v5.py b/prepare/templates/response_assessment/judges/completeness/v5.py new file mode 100644 index 0000000000..bec8b94b35 --- /dev/null +++ b/prepare/templates/response_assessment/judges/completeness/v5.py @@ -0,0 +1,18 @@ +from unitxt import add_to_catalog +from unitxt.templates import InputOutputTemplate + +add_to_catalog( + InputOutputTemplate( + instruction="""The following tasks each contains a document, a conversation and a response to the last turn in the conversation. The conversation is written either as a single inquiry, or as a sequence of utterances in the form of speaker: text. \nExamine them carefully to determine whether the response is complete with respect to the information in the given document, by carefully adhere to the following steps:\n\n1. Identify the last turn (utterance) of the conversation and the inquiry made in this utterance.\n - Use the earlier turns as context to understand the last turn.\n - If the entire conversation is a single turn, consider inquiry made within that turn.\n\n2. Consider what should be included in a proper response to the inquiry, based on information provided in the document only.\n - Consider all the substantial points in the document that should be an essential part of a proper response. Do they all appear in the response? 
\n - If not, review carefully whether it must be an essential part of a proper response.\n\n3. Only consider information for a proper response if the information comes from the given document.\n - If the document does not provide information to address the inquiry, as long as the response indicates so, it must be considered complete. DO NOT penalize the response due to lack of information in the document.\n4. Generate your answer as a label with one of these values: yes, no, unsure, followed by an explanation of the label.\n - Choose label "yes" if all substantive information in document necessary to address the inquiry has been included in the response.\n - Choose label "no" if you have identified an essential piece of information from the document missing from the response. Make sure that if you choose "no", you must mention in the explanation at least one piece of missing information quoted from the document.\n - Choose label "unsure" if ambiguity in the conversation, response, or the document makes it unclear whether an essential part of the the response is omitted. \n\n\n[Document]\nThe company offers the Dependent Care Spending Account program. It allows you to use pre-tax dollar for qualified dependent care expenses.\n\n\n[Conversation]\nIs there a benefit for childcare?\n\n\n[Response]\nYes, the Dependent Care Spending Account program provides childcare benefits.\n\n\n[Output]\nyes.\nThe conversation is a single inquiry. The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nWith this app you can create and maintain incoterms which are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\nProvide the Incoterms Version. 
An incoterms version is an edition containing a list of international terms for transportation that is defined by the International Chamber of Commerce (ICC).\n\n\n[Conversation]\nWhat are Incoterms?\n\n\n[Response]\nIncoterms are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\n\n[Output]\nyes. \nThe conversation is a single inquiry. The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nWith this app you can create and maintain incoterms which are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\nProvide the Incoterms Version. An incoterms version is an edition containing a list of international terms for transportation that is defined by the International Chamber of Commerce (ICC).\n\n\n[Conversation]\nWhat are Incoterms?\n\n\n[Response]\nIncoterms are the official rules of the International Chamber of Commerce (ICC).\n\n\n[Output]\nno.\nThe conversation is a single inquiry. The response missed the information in document necessary to answer the inquiry: Incoterms are rules regarding how to define trade terms.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nHow many vacation days do I get per year.\n\n\n[Response]\nYou can have up to 20 days of vacation per year.\n\n\n[Output]\nno. \nThe conversation is a single inquiry. The response missed the crucial information from document about how the number of vacation days varies with the number of years served, which is necessary to fully address the inquiry.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nI have been with the company of 21 years. 
How many vacation days do I get per year.\n\n\n[Response]\nYou can have up to 25 days of vacation per year.\n\n\n[Output]\nyes. \nThe conversation is a single inquiry. The response contains all information in document necessary to respond to the inquiry, which is predicated on specific number of years served.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nuser: How many vacation days do I get per year.\nagent: The number of vacations per year you get varies with how many years you have been with the company.\nuser: I have been with the company of 21 years.\n\n\n[Response]\nYou can have up to 25 days of vacation per year.\n\n\n[Output]\nyes. \nThe conversation has multiple turns. The inquiry of the last turn is essentially "how many vacation days does a person with the company for 21 years get?" The response contains all information in document necessary to respond to this inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. 
Mercury is smaller than earth.\nuser: How about mars?\n\n\n[Response]\nMars is also smaller than earth.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. The last user inquiry is essentially "Is mars smaller than earth?" The response contains all information in document necessary to respond to the inquiry..\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about mars?\nagent: Mars is also smaller than earth.\nuser: No I meant is Is mercury larger than mars?\n\n\n[Response]\nSorry for misunderstanding your previous question. No, \nmercury is also smaller than mars.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. The last user inquiry is essentially "Is mercury smaller than mars?" 
The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about mars?\nagent: Mars is also smaller than earth.\nuser: No I meant comparing mercury and mars.\n\n\n[Response]\nSorry for misunderstanding your previous question. Compared with mars, mercury is much hotter.\n\n\n[Output]\nno.\nThe conversation has multiple turns. The last user inquiry is essentially "Is mercury smaller than mars?" 
The response missed the information in the document about the sizes of the planets necessary to address this inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: Compare mercury and mars.\n\n\n[Response]\nMercury is smaller than mars.\n\n\n[Output]\nno.\nThe conversation has multiple turns. The last user inquiry is essentially "Provide comparison between mercury and mars". The include any comparable aspects. The information in the document enables comparison in term of sizes and in terms of temperatures. 
The response missed the comparison of temperatures.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about their temperatures\n\n\n[Response]\nagent: I am sorry the provided document does not have this information.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. The last inquiry is essentially "compare the temperatures of mercury and earth". The document does not provide such information. The response indicating the lack of such information for this inquiry. 
Therefore the response is complete.""", + input_format="\n\nConversation:\n{question}\n\nResponse:\n{answer}\n\n\nOutput:", + output_format="[[{rating}]]", + postprocessors=[ + "processors.take_first_word", + "processors.lower_case", + "processors.yes_no_to_int", + "processors.cast_to_float_return_0_5_if_failed", + ], + ), + "templates.response_assessment.judges.completeness.v5", + overwrite=True, +) From 4bd3c036dd09191a373241be7875566f5cb41ead Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Wed, 4 Dec 2024 12:55:18 +0200 Subject: [PATCH 2/4] Run prepare files and added missing JSON files to catalog Signed-off-by: Yoav Katz --- .../generic_inference_engine_judges.json | 10 ++++++++++ .../llama3_1_v1_ibmgenai_judges.json | 15 +++++++++++++++ .../judges/completeness/v5.json | 12 ++++++++++++ 3 files changed, 37 insertions(+) create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_completeness/generic_inference_engine_judges.json create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_completeness/llama3_1_v1_ibmgenai_judges.json create mode 100644 src/unitxt/catalog/templates/response_assessment/judges/completeness/v5.json diff --git a/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_completeness/generic_inference_engine_judges.json b/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_completeness/generic_inference_engine_judges.json new file mode 100644 index 0000000000..0c7d0edcf3 --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_completeness/generic_inference_engine_judges.json @@ -0,0 +1,10 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.response_assessment.judges.completeness.v5", + "task": "rating.single_turn", + "main_score": "metrics.llm_as_judge.rating.generic_template_v5", + "prediction_type": "str" +} diff --git 
a/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_completeness/llama3_1_v1_ibmgenai_judges.json b/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_completeness/llama3_1_v1_ibmgenai_judges.json new file mode 100644 index 0000000000..0d4279db02 --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_completeness/llama3_1_v1_ibmgenai_judges.json @@ -0,0 +1,15 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "ibm_gen_ai_inference_engine", + "model_name": "meta-llama/llama-3-1-70b-instruct", + "parameters": { + "__type__": "ibm_gen_ai_inference_engine_params", + "max_new_tokens": 256 + } + }, + "template": "templates.response_assessment.judges.completeness.v5", + "task": "rating.single_turn", + "main_score": "metrics.llm_as_judge.rating.llama3-1-70binstruct_template_v5", + "prediction_type": "str" +} diff --git a/src/unitxt/catalog/templates/response_assessment/judges/completeness/v5.json b/src/unitxt/catalog/templates/response_assessment/judges/completeness/v5.json new file mode 100644 index 0000000000..901a51a02e --- /dev/null +++ b/src/unitxt/catalog/templates/response_assessment/judges/completeness/v5.json @@ -0,0 +1,12 @@ +{ + "__type__": "input_output_template", + "instruction": "The following tasks each contains a document, a conversation and a response to the last turn in the conversation. The conversation is written either as a single inquiry, or as a sequence of utterances in the form of speaker: text. \nExamine them carefully to determine whether the response is complete with respect to the information in the given document, by carefully adhere to the following steps:\n\n1. Identify the last turn (utterance) of the conversation and the inquiry made in this utterance.\n - Use the earlier turns as context to understand the last turn.\n - If the entire conversation is a single turn, consider inquiry made within that turn.\n\n2. 
Consider what should be included in a proper response to the inquiry, based on information provided in the document only.\n - Consider all the substantial points in the document that should be an essential part of a proper response. Do they all appear in the response? \n - If not, review carefully whether it must be an essential part of a proper response.\n\n3. Only consider information for a proper response if the information comes from the given document.\n - If the document does not provide information to address the inquiry, as long as the response indicates so, it must be considered complete. DO NOT penalize the response due to lack of information in the document.\n4. Generate your answer as a label with one of these values: yes, no, unsure, followed by an explanation of the label.\n - Choose label \"yes\" if all substantive information in document necessary to address the inquiry has been included in the response.\n - Choose label \"no\" if you have identified an essential piece of information from the document missing from the response. Make sure that if you choose \"no\", you must mention in the explanation at least one piece of missing information quoted from the document.\n - Choose label \"unsure\" if ambiguity in the conversation, response, or the document makes it unclear whether an essential part of the the response is omitted. \n\n\n[Document]\nThe company offers the Dependent Care Spending Account program. It allows you to use pre-tax dollar for qualified dependent care expenses.\n\n\n[Conversation]\nIs there a benefit for childcare?\n\n\n[Response]\nYes, the Dependent Care Spending Account program provides childcare benefits.\n\n\n[Output]\nyes.\nThe conversation is a single inquiry. 
The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nWith this app you can create and maintain incoterms which are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\nProvide the Incoterms Version. An incoterms version is an edition containing a list of international terms for transportation that is defined by the International Chamber of Commerce (ICC).\n\n\n[Conversation]\nWhat are Incoterms?\n\n\n[Response]\nIncoterms are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\n\n[Output]\nyes. \nThe conversation is a single inquiry. The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nWith this app you can create and maintain incoterms which are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\nProvide the Incoterms Version. An incoterms version is an edition containing a list of international terms for transportation that is defined by the International Chamber of Commerce (ICC).\n\n\n[Conversation]\nWhat are Incoterms?\n\n\n[Response]\nIncoterms are the official rules of the International Chamber of Commerce (ICC).\n\n\n[Output]\nno.\nThe conversation is a single inquiry. The response missed the information in document necessary to answer the inquiry: Incoterms are rules regarding how to define trade terms.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nHow many vacation days do I get per year.\n\n\n[Response]\nYou can have up to 20 days of vacation per year.\n\n\n[Output]\nno. \nThe conversation is a single inquiry. 
The response missed the crucial information from document about how the number of vacation days varies with the number of years served, which is necessary to fully address the inquiry.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nI have been with the company of 21 years. How many vacation days do I get per year.\n\n\n[Response]\nYou can have up to 25 days of vacation per year.\n\n\n[Output]\nyes. \nThe conversation is a single inquiry. The response contains all information in document necessary to respond to the inquiry, which is predicated on specific number of years served.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nuser: How many vacation days do I get per year.\nagent: The number of vacations per year you get varies with how many years you have been with the company.\nuser: I have been with the company of 21 years.\n\n\n[Response]\nYou can have up to 25 days of vacation per year.\n\n\n[Output]\nyes. \nThe conversation has multiple turns. 
The inquiry of the last turn is essentially \"how many vacation days does a person with the company for 21 years get?\" The response contains all information in document necessary to respond to this inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about mars?\n\n\n[Response]\nMars is also smaller than earth.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. 
The last user inquiry is essentially \"Is mars smaller than earth?\" The response contains all information in document necessary to respond to the inquiry..\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about mars?\nagent: Mars is also smaller than earth.\nuser: No I meant is Is mercury larger than mars?\n\n\n[Response]\nSorry for misunderstanding your previous question. No, \nmercury is also smaller than mars.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. 
The last user inquiry is essentially \"Is mercury smaller than mars?\" The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about mars?\nagent: Mars is also smaller than earth.\nuser: No I meant comparing mercury and mars.\n\n\n[Response]\nSorry for misunderstanding your previous question. Compared with mars, mercury is much hotter.\n\n\n[Output]\nno.\nThe conversation has multiple turns. 
The last user inquiry is essentially \"Is mercury smaller than mars?\" The response missed the information in the document about the sizes of the planets necessary to address this inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: Compare mercury and mars.\n\n\n[Response]\nMercury is smaller than mars.\n\n\n[Output]\nno.\nThe conversation has multiple turns. The last user inquiry is essentially \"Provide comparison between mercury and mars\". The include any comparable aspects. The information in the document enables comparison in term of sizes and in terms of temperatures. 
The response missed the comparison of temperatures.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about their temperatures\n\n\n[Response]\nagent: I am sorry the provided document does not have this information.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. The last inquiry is essentially \"compare the temperatures of mercury and earth\". The document does not provide such information. The response indicating the lack of such information for this inquiry. 
Therefore the response is complete.", + "input_format": "\n\nConversation:\n{question}\n\nResponse:\n{answer}\n\n\nOutput:", + "output_format": "[[{rating}]]", + "postprocessors": [ + "processors.take_first_word", + "processors.lower_case", + "processors.yes_no_to_int", + "processors.cast_to_float_return_0_5_if_failed" + ] +} From 4a32ad3d87cdd9dfcfe86e4d1aff9fa03e3d7e7a Mon Sep 17 00:00:00 2001 From: Vraj Date: Tue, 10 Dec 2024 10:20:54 -0800 Subject: [PATCH 3/4] fix formatting issue --- prepare/templates/response_assessment/judges/completeness/v5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare/templates/response_assessment/judges/completeness/v5.py b/prepare/templates/response_assessment/judges/completeness/v5.py index bec8b94b35..567d996eb8 100644 --- a/prepare/templates/response_assessment/judges/completeness/v5.py +++ b/prepare/templates/response_assessment/judges/completeness/v5.py @@ -3,7 +3,7 @@ add_to_catalog( InputOutputTemplate( - instruction="""The following tasks each contains a document, a conversation and a response to the last turn in the conversation. The conversation is written either as a single inquiry, or as a sequence of utterances in the form of speaker: text. \nExamine them carefully to determine whether the response is complete with respect to the information in the given document, by carefully adhere to the following steps:\n\n1. Identify the last turn (utterance) of the conversation and the inquiry made in this utterance.\n - Use the earlier turns as context to understand the last turn.\n - If the entire conversation is a single turn, consider inquiry made within that turn.\n\n2. Consider what should be included in a proper response to the inquiry, based on information provided in the document only.\n - Consider all the substantial points in the document that should be an essential part of a proper response. Do they all appear in the response? 
\n - If not, review carefully whether it must be an essential part of a proper response.\n\n3. Only consider information for a proper response if the information comes from the given document.\n - If the document does not provide information to address the inquiry, as long as the response indicates so, it must be considered complete. DO NOT penalize the response due to lack of information in the document.\n4. Generate your answer as a label with one of these values: yes, no, unsure, followed by an explanation of the label.\n - Choose label "yes" if all substantive information in document necessary to address the inquiry has been included in the response.\n - Choose label "no" if you have identified an essential piece of information from the document missing from the response. Make sure that if you choose "no", you must mention in the explanation at least one piece of missing information quoted from the document.\n - Choose label "unsure" if ambiguity in the conversation, response, or the document makes it unclear whether an essential part of the the response is omitted. \n\n\n[Document]\nThe company offers the Dependent Care Spending Account program. It allows you to use pre-tax dollar for qualified dependent care expenses.\n\n\n[Conversation]\nIs there a benefit for childcare?\n\n\n[Response]\nYes, the Dependent Care Spending Account program provides childcare benefits.\n\n\n[Output]\nyes.\nThe conversation is a single inquiry. The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nWith this app you can create and maintain incoterms which are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\nProvide the Incoterms Version. 
An incoterms version is an edition containing a list of international terms for transportation that is defined by the International Chamber of Commerce (ICC).\n\n\n[Conversation]\nWhat are Incoterms?\n\n\n[Response]\nIncoterms are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\n\n[Output]\nyes. \nThe conversation is a single inquiry. The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nWith this app you can create and maintain incoterms which are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\nProvide the Incoterms Version. An incoterms version is an edition containing a list of international terms for transportation that is defined by the International Chamber of Commerce (ICC).\n\n\n[Conversation]\nWhat are Incoterms?\n\n\n[Response]\nIncoterms are the official rules of the International Chamber of Commerce (ICC).\n\n\n[Output]\nno.\nThe conversation is a single inquiry. The response missed the information in document necessary to answer the inquiry: Incoterms are rules regarding how to define trade terms.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nHow many vacation days do I get per year.\n\n\n[Response]\nYou can have up to 20 days of vacation per year.\n\n\n[Output]\nno. \nThe conversation is a single inquiry. The response missed the crucial information from document about how the number of vacation days varies with the number of years served, which is necessary to fully address the inquiry.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nI have been with the company of 21 years. 
How many vacation days do I get per year.\n\n\n[Response]\nYou can have up to 25 days of vacation per year.\n\n\n[Output]\nyes. \nThe conversation is a single inquiry. The response contains all information in document necessary to respond to the inquiry, which is predicated on specific number of years served.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nuser: How many vacation days do I get per year.\nagent: The number of vacations per year you get varies with how many years you have been with the company.\nuser: I have been with the company of 21 years.\n\n\n[Response]\nYou can have up to 25 days of vacation per year.\n\n\n[Output]\nyes. \nThe conversation has multiple turns. The inquiry of the last turn is essentially "how many vacation days does a person with the company for 21 years get?" The response contains all information in document necessary to respond to this inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. 
Mercury is smaller than earth.\nuser: How about mars?\n\n\n[Response]\nMars is also smaller than earth.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. The last user inquiry is essentially "Is mars smaller than earth?" The response contains all information in document necessary to respond to the inquiry..\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about mars?\nagent: Mars is also smaller than earth.\nuser: No I meant is Is mercury larger than mars?\n\n\n[Response]\nSorry for misunderstanding your previous question. No, \nmercury is also smaller than mars.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. The last user inquiry is essentially "Is mercury smaller than mars?" 
The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about mars?\nagent: Mars is also smaller than earth.\nuser: No I meant comparing mercury and mars.\n\n\n[Response]\nSorry for misunderstanding your previous question. Compared with mars, mercury is much hotter.\n\n\n[Output]\nno.\nThe conversation has multiple turns. The last user inquiry is essentially "Is mercury smaller than mars?" 
The response missed the information in the document about the sizes of the planets necessary to address this inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: Compare mercury and mars.\n\n\n[Response]\nMercury is smaller than mars.\n\n\n[Output]\nno.\nThe conversation has multiple turns. The last user inquiry is essentially "Provide comparison between mercury and mars". The include any comparable aspects. The information in the document enables comparison in term of sizes and in terms of temperatures. 
The response missed the comparison of temperatures.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth’s size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth’s size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about their temperatures\n\n\n[Response]\nagent: I am sorry the provided document does not have this information.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. The last inquiry is essentially "compare the temperatures of mercury and earth". The document does not provide such information. The response indicating the lack of such information for this inquiry. Therefore the response is complete.""", + instruction="""The following tasks each contains a document, a conversation and a response to the last turn in the conversation. The conversation is written either as a single inquiry, or as a sequence of utterances in the form of speaker: text. \nExamine them carefully to determine whether the response is complete with respect to the information in the given document, by carefully adhere to the following steps:\n\n1. Identify the last turn (utterance) of the conversation and the inquiry made in this utterance.\n - Use the earlier turns as context to understand the last turn.\n - If the entire conversation is a single turn, consider inquiry made within that turn.\n\n2. 
Consider what should be included in a proper response to the inquiry, based on information provided in the document only.\n - Consider all the substantial points in the document that should be an essential part of a proper response. Do they all appear in the response? \n - If not, review carefully whether it must be an essential part of a proper response.\n\n3. Only consider information for a proper response if the information comes from the given document.\n - If the document does not provide information to address the inquiry, as long as the response indicates so, it must be considered complete. DO NOT penalize the response due to lack of information in the document.\n4. Generate your answer as a label with one of these values: yes, no, unsure, followed by an explanation of the label.\n - Choose label "yes" if all substantive information in document necessary to address the inquiry has been included in the response.\n - Choose label "no" if you have identified an essential piece of information from the document missing from the response. Make sure that if you choose "no", you must mention in the explanation at least one piece of missing information quoted from the document.\n - Choose label "unsure" if ambiguity in the conversation, response, or the document makes it unclear whether an essential part of the the response is omitted. \n\n\n[Document]\nThe company offers the Dependent Care Spending Account program. It allows you to use pre-tax dollar for qualified dependent care expenses.\n\n\n[Conversation]\nIs there a benefit for childcare?\n\n\n[Response]\nYes, the Dependent Care Spending Account program provides childcare benefits.\n\n\n[Output]\nyes.\nThe conversation is a single inquiry. 
The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nWith this app you can create and maintain incoterms which are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\nProvide the Incoterms Version. An incoterms version is an edition containing a list of international terms for transportation that is defined by the International Chamber of Commerce (ICC).\n\n\n[Conversation]\nWhat are Incoterms?\n\n\n[Response]\nIncoterms are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\n\n[Output]\nyes. \nThe conversation is a single inquiry. The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nWith this app you can create and maintain incoterms which are the official rules of the International Chamber of Commerce (ICC) regarding how to define trade terms.\n\nProvide the Incoterms Version. An incoterms version is an edition containing a list of international terms for transportation that is defined by the International Chamber of Commerce (ICC).\n\n\n[Conversation]\nWhat are Incoterms?\n\n\n[Response]\nIncoterms are the official rules of the International Chamber of Commerce (ICC).\n\n\n[Output]\nno.\nThe conversation is a single inquiry. The response missed the information in document necessary to answer the inquiry: Incoterms are rules regarding how to define trade terms.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nHow many vacation days do I get per year.\n\n\n[Response]\nYou can have up to 20 days of vacation per year.\n\n\n[Output]\nno. \nThe conversation is a single inquiry. 
The response missed the crucial information from document about how the number of vacation days varies with the number of years served, which is necessary to fully address the inquiry.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nI have been with the company of 21 years. How many vacation days do I get per year.\n\n\n[Response]\nYou can have up to 25 days of vacation per year.\n\n\n[Output]\nyes. \nThe conversation is a single inquiry. The response contains all information in document necessary to respond to the inquiry, which is predicated on specific number of years served.\n\n\n\n[Document]\nVacation days per year depends on years served: \n0-9 years of service: 15 days\n10-19 years of service: 20 days\n20+ years of service: 25 days\n\n\n[Conversation]\nuser: How many vacation days do I get per year.\nagent: The number of vacations per year you get varies with how many years you have been with the company.\nuser: I have been with the company of 21 years.\n\n\n[Response]\nYou can have up to 25 days of vacation per year.\n\n\n[Output]\nyes. \nThe conversation has multiple turns. The inquiry of the last turn is essentially "how many vacation days does a person with the company for 21 years get?" 
The response contains all information in document necessary to respond to this inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth's size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth's size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about mars?\n\n\n[Response]\nMars is also smaller than earth.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. The last user inquiry is essentially "Is mars smaller than earth?" 
The response contains all information in document necessary to respond to the inquiry..\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth's size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth's size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about mars?\nagent: Mars is also smaller than earth.\nuser: No I meant is Is mercury larger than mars?\n\n\n[Response]\nSorry for misunderstanding your previous question. No, \nmercury is also smaller than mars.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. The last user inquiry is essentially "Is mercury smaller than mars?" 
The response contains all information in document necessary to respond to the inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth's size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth's size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about mars?\nagent: Mars is also smaller than earth.\nuser: No I meant comparing mercury and mars.\n\n\n[Response]\nSorry for misunderstanding your previous question. Compared with mars, mercury is much hotter.\n\n\n[Output]\nno.\nThe conversation has multiple turns. The last user inquiry is essentially "Is mercury smaller than mars?" 
The response missed the information in the document about the sizes of the planets necessary to address this inquiry.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth's size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth's size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\nSolar System Temperatures: Mean Temperatures on Each Planet:\nMercury: 333°F (167°C)\nVenus: 867°F (464°C)\nEarth: 59°F (15°C)\nMars: Minus 85°F (-65°C)\nJupiter: Minus 166°F (-110°C)\nSaturn: Minus 220°F (-140°C)\nUranus: Minus 320°F (-195°C)\nNeptune: Minus 330°F (-200°C)\nDwarf Planet Pluto: Minus 375°F (-225°C)\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: Compare mercury and mars.\n\n\n[Response]\nMercury is smaller than mars.\n\n\n[Output]\nno.\nThe conversation has multiple turns. The last user inquiry is essentially "Provide comparison between mercury and mars". The include any comparable aspects. The information in the document enables comparison in term of sizes and in terms of temperatures. 
The response missed the comparison of temperatures.\n\n\n\n[Document]\nThe Solar System: Planet Sizes:\nMercury - 1,516mi (2,440km) radius; about 1/3 the size of Earth\nVenus - 3,760mi (6,052km) radius; only slightly smaller than Earth\nEarth - 3,959mi (6,371km) radius\nMars - 2,106mi (3,390km) radius; about half the size of Earth\nJupiter - 43,441mi (69,911km) radius; 11x Earth's size\nSaturn - 36,184mi (58,232km) radius; 9x larger than Earth\nUranus - 15,759mi (25,362km) radius; 4x Earth's size\nNeptune - 15,299mi (24,622km) radius; only slightly smaller than Uranus\n\n\n[Conversation]\nuser: Is Mercury larger than earth?\nagent: No. Mercury is smaller than earth.\nuser: How about their temperatures\n\n\n[Response]\nagent: I am sorry the provided document does not have this information.\n\n\n[Output]\nyes.\nThe conversation has multiple turns. The last inquiry is essentially "compare the temperatures of mercury and earth". The document does not provide such information. The response indicating the lack of such information for this inquiry. 
Therefore the response is complete.""", input_format="\n\nConversation:\n{question}\n\nResponse:\n{answer}\n\n\nOutput:", output_format="[[{rating}]]", postprocessors=[ From 76e7502421dd673902a75bea448ff099bd705371 Mon Sep 17 00:00:00 2001 From: Vraj Date: Tue, 10 Dec 2024 10:34:33 -0800 Subject: [PATCH 4/4] update inference engine to cross provider --- .../llm_as_judge/conversation_completeness.py | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/prepare/metrics/llm_as_judge/conversation_completeness.py b/prepare/metrics/llm_as_judge/conversation_completeness.py index 7362342d0b..51270f7051 100644 --- a/prepare/metrics/llm_as_judge/conversation_completeness.py +++ b/prepare/metrics/llm_as_judge/conversation_completeness.py @@ -5,21 +5,33 @@ IbmGenAiInferenceEngineParams, ) from unitxt.llm_as_judge import LLMAsJudge +from unitxt.inference import ( + CrossProviderInferenceEngine, +) template_name = "templates.response_assessment.judges.completeness.v5" +# inference_models = { +# "llama3_1_v1_ibmgenai": { +# "model_name": "llama3-1-70binstruct", +# "inference_model": IbmGenAiInferenceEngine( +# model_name="meta-llama/llama-3-1-70b-instruct", +# parameters=IbmGenAiInferenceEngineParams(max_new_tokens=256), +# ), +# }, +# "generic_inference_engine": { +# "model_name": "generic", +# "inference_model": (GenericInferenceEngine()), +# }, +# } + inference_models = { "llama3_1_v1_ibmgenai": { - "model_name": "llama3-1-70binstruct", - "inference_model": IbmGenAiInferenceEngine( - model_name="meta-llama/llama-3-1-70b-instruct", - parameters=IbmGenAiInferenceEngineParams(max_new_tokens=256), - ), - }, - "generic_inference_engine": { - "model_name": "generic", - "inference_model": (GenericInferenceEngine()), - }, + "model_name": "llama3-1-70b-instruct", + "inference_model": CrossProviderInferenceEngine( + model="llama-3-1-70b-instruct", provider="rits" + ) + } } for label, inference_model in inference_models.items():