
Merge remote-tracking branch 'origin/main' into demos_experimental
Signed-off-by: elronbandel <[email protected]>
elronbandel committed Dec 24, 2024
2 parents 3eb8eed + 3d0e23c commit 140fddd
Showing 81 changed files with 3,189 additions and 472 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/docs.yml
@@ -9,7 +9,7 @@ on:
concurrency:
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
cancel-in-progress: true

jobs:
docs:

@@ -23,10 +23,10 @@ jobs:

- uses: actions/setup-python@v5
with:
python-version: '3.9'
python-version: '3.8'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests,docs]"
- run: uv pip install --system ".[docs]"

- name: Compile Docs
run: make docs
5 changes: 0 additions & 5 deletions .pre-commit-config.yaml
@@ -10,11 +10,6 @@ repos:
args: [--fix]
exclude: src/unitxt/metrics.py|examples/evaluate_existing_dataset_no_install.py
# Run the linter on the specific file with the ignore flag
- id: ruff
name: ruff (src/unitxt/metrics.py)
files: src/unitxt/metrics.py
args: [--fix, --ignore, C901]
# Run the linter on the specific file with the ignore flag
- id: ruff
name: ruff (examples/evaluate_existing_dataset_no_install.py)
files: examples/evaluate_existing_dataset_no_install.py
118 changes: 118 additions & 0 deletions examples/evaluate_existing_dataset_by_llm_as_judge_direct.py
@@ -0,0 +1,118 @@
import statistics

from unitxt import get_logger, get_settings, load_dataset
from unitxt.api import evaluate
from unitxt.inference import (
CrossProviderInferenceEngine,
)
from unitxt.text_utils import print_dict

logger = get_logger()
settings = get_settings()

# Use the unitxt load_dataset API to load the SQuAD QA dataset with the standard template from the catalog.
# We set loader_limit to 10 to reduce download time.
criterias = ["answer_relevance", "coherence", "conciseness"]
metrics = [
"metrics.llm_as_judge.direct.rits.llama3_1_70b"
"[context_fields=[context,question],"
f"criteria=metrics.llm_as_judge.direct.criterias.{criteria},"
f"score_prefix={criteria}_]"
for criteria in criterias
]
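# For example, with the criteria "answer_relevance" the adjacent string literals above
# concatenate into a single catalog reference:
# "metrics.llm_as_judge.direct.rits.llama3_1_70b[context_fields=[context,question],criteria=metrics.llm_as_judge.direct.criterias.answer_relevance,score_prefix=answer_relevance_]"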
dataset = load_dataset(
card="cards.squad",
metrics=metrics,
loader_limit=10,
max_test_instances=10,
split="test",
)

# Set up an inference engine to generate predictions.
inference_model = CrossProviderInferenceEngine(
model="llama-3-2-1b-instruct", provider="watsonx"
)

"""
We use a CrossProviderInferenceEngine, an inference engine that provides API access to providers such as
watsonx, bam, openai, azure, aws and more.
For the arguments these inference engines can receive, please refer to the classes' documentation or read
about the OpenAI API arguments that the CrossProviderInferenceEngine follows.
"""
predictions = inference_model.infer(dataset)

gold_answers = [d[0] for d in dataset["references"]]

# Evaluate the predictions using the defined metric.
evaluated_predictions = evaluate(predictions=predictions, data=dataset)
evaluated_gold_answers = evaluate(predictions=gold_answers, data=dataset)

print_dict(
evaluated_predictions[0],
keys_to_print=[
"source",
"score",
],
)
print_dict(
evaluated_gold_answers[0],
keys_to_print=[
"source",
"score",
],
)

for criteria in criterias:
logger.info(f"Scores for criteria '{criteria}'")
gold_answer_scores = [
instance["score"]["instance"][f"{criteria}_llm_as_a_judge_score"]
for instance in evaluated_gold_answers
]
gold_answer_position_bias = [
int(instance["score"]["instance"][f"{criteria}_positional_bias"])
for instance in evaluated_gold_answers
]
prediction_scores = [
instance["score"]["instance"][f"{criteria}_llm_as_a_judge_score"]
for instance in evaluated_predictions
]
prediction_position_bias = [
int(instance["score"]["instance"][f"{criteria}_positional_bias"])
for instance in evaluated_predictions
]

logger.info(
f"Scores of gold answers: {statistics.mean(gold_answer_scores)} +/- {statistics.stdev(gold_answer_scores)}"
)
logger.info(
f"Scores of predicted answers: {statistics.mean(prediction_scores)} +/- {statistics.stdev(prediction_scores)}"
)
logger.info(
f"Positional bias occurrence on gold answers: {statistics.mean(gold_answer_position_bias)}"
)
logger.info(
f"Positional bias occurrence on predicted answers: {statistics.mean(prediction_position_bias)}\n"
)

"""
Output with 100 examples
Scores for criteria 'answer_relevance'
Scores of gold answers: 0.9625 +/- 0.14811526360619054
Scores of predicted answers: 0.5125 +/- 0.4638102516061385
Positional bias occurrence on gold answers: 0.03
Positional bias occurrence on predicted answers: 0.12
Scores for criteria 'coherence'
Scores of gold answers: 0.159 +/- 0.15689216524464028
Scores of predicted answers: 0.066 +/- 0.11121005695384194
Positional bias occurrence on gold answers: 0.16
Positional bias occurrence on predicted answers: 0.07
Scores for criteria 'conciseness'
Scores of gold answers: 1.0 +/- 0.0
Scores of predicted answers: 0.34 +/- 0.47609522856952335
Positional bias occurrence on gold answers: 0.03
Positional bias occurrence on predicted answers: 0.01
"""
49 changes: 49 additions & 0 deletions examples/evaluate_llm_as_judge_direct_criteria_from_dataset.py
@@ -0,0 +1,49 @@
from typing import Any

from unitxt import evaluate, load_dataset
from unitxt.blocks import Task, TaskCard
from unitxt.llm_as_judge_operators import CreateYesNoCriteriaFromString
from unitxt.loaders import LoadFromDictionary

data = {
"test": [
{
"question": "How is the weather?",
"judgement": "In the response, if there is a numerical temperature present, is it denominated in both Fahrenheit and Celsius?",
},
{
"question": "Tell me a joke about cats",
"judgement": "Is the response funny?",
},
]
}

card = TaskCard(
loader=LoadFromDictionary(data=data, data_classification_policy=["public"]),
preprocess_steps=[
CreateYesNoCriteriaFromString(field="judgement", to_field="criteria"),
],
task=Task(
input_fields={"question": str},
reference_fields={"criteria": Any},
prediction_type=str,
metrics=[
"metrics.llm_as_judge.direct.watsonx.llama3_1_70b[context_fields=question,criteria_field=criteria]"
],
),
)
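# CreateYesNoCriteriaFromString turns each row's free-text "judgement" into a yes/no
# criteria object stored in the "criteria" field, which the judge metric then reads
# through criteria_field=criteria.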

dataset = load_dataset(card=card, template="templates.empty", split="test")

predictions = [
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit (around 31-34°C). The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""Why did the cat cross the road? To cat to the other side.""",
]

results = evaluate(predictions=predictions, data=dataset)

print("Global Scores:")
print(results.global_scores.summary)

print("Instance Scores:")
print(results.instance_scores.summary)
33 changes: 33 additions & 0 deletions examples/evaluate_llm_as_judge_direct_predefined_criteria.py
@@ -0,0 +1,33 @@
from unitxt import get_logger
from unitxt.api import create_dataset, evaluate

logger = get_logger()

data = [
{"question": "How is the weather?"},
{"question": "How is the weather?"},
{"question": "How is the weather?"},
]

criteria = "metrics.llm_as_judge.direct.criterias.temperature_in_celsius_and_fahrenheit"
metrics = [
f"metrics.llm_as_judge.direct.rits.llama3_1_70b[criteria={criteria}, context_fields=[question]]"
]

dataset = create_dataset(
task="tasks.qa.open", test_set=data, metrics=metrics, split="test"
)
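# create_dataset wraps the in-memory examples as the test split of tasks.qa.open and
# attaches the LLM-as-judge metric defined above.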

predictions = [
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit (around 31-34°C). The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
]

results = evaluate(predictions=predictions, data=dataset)

print("Global Scores:")
print(results.global_scores.summary)

print("Instance Scores:")
print(results.instance_scores.summary)
62 changes: 62 additions & 0 deletions examples/evaluate_llm_as_judge_direct_user_criteria_no_catalog.py
@@ -0,0 +1,62 @@
from unitxt.api import create_dataset, evaluate
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge import LLMJudgeDirect
from unitxt.llm_as_judge_constants import (
CriteriaWithOptions,
)

criteria = CriteriaWithOptions.from_obj(
{
"name": "Temperature in Fahrenheit and Celsius",
"description": "In the response, if there is a numerical temperature present, is it denominated in both Fahrenheit and Celsius?",
"options": [
{
"name": "Yes",
"description": "The temperature reading is provided in both Fahrenheit and Celsius.",
},
{
"name": "No",
"description": "The temperature reading is provided either in Fahrenheit or Celsius, but not both.",
},
{
"name": "Pass",
"description": "There is no numerical temperature reading in the response.",
},
],
"option_map": {"Yes": 1.0, "No": 0.5, "Pass": 0.0},
}
)
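# The option_map converts the option selected by the judge into the numeric score
# reported per instance: "Yes" -> 1.0, "No" -> 0.5, "Pass" -> 0.0.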


data = [
{"question": "How is the weather?"},
{"question": "How is the weather?"},
{"question": "How is the weather?"},
]

metric = LLMJudgeDirect(
inference_engine=CrossProviderInferenceEngine(
model="llama-3-1-70b-instruct", max_tokens=1024
),
criteria=criteria,
context_fields=["question"],
criteria_field="criteria",
)

dataset = create_dataset(
task="tasks.qa.open", test_set=data, metrics=[metric], split="test"
)

predictions = [
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit (around 31-34°C). The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
]

results = evaluate(predictions=predictions, data=dataset)

print("Global Scores:")
print(results.global_scores.summary)

print("Instance Scores:")
print(results.instance_scores.summary)
File renamed without changes.
61 changes: 61 additions & 0 deletions examples/evaluate_llm_as_judge_pairwise_criteria_from_dataset.py
@@ -0,0 +1,61 @@
from typing import Any, List

from unitxt import evaluate, load_dataset
from unitxt.blocks import Task, TaskCard
from unitxt.llm_as_judge_operators import (
CreateCriteriaFromString,
)
from unitxt.loaders import LoadFromDictionary
from unitxt.templates import NullTemplate

data = {
"test": [
{
"question": "How is the weather?",
"judgement": "The temperature is described in both Fahrenheit and Celsius.",
},
{
"question": "Tell me a joke about cats",
"judgement": "Is the response funny?",
},
]
}

card = TaskCard(
loader=LoadFromDictionary(data=data, data_classification_policy=["public"]),
preprocess_steps=[
CreateCriteriaFromString(field="judgement", to_field="criteria"),
],
task=Task(
input_fields={"question": str},
reference_fields={"criteria": Any},
prediction_type=List[str],
metrics=[
"metrics.llm_as_judge.pairwise.rits.llama3_1_70b[context_fields=question,criteria_field=criteria]"
],
default_template=NullTemplate(),
),
)

test_dataset = load_dataset(card=card, split="test")
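# Each instance receives a list of candidate responses (prediction_type=List[str]);
# the pairwise judge compares them against one another under that instance's criteria.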

predictions = [
[
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit (around 31-34°C). The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
],
[
"""Why did the cat cross the road? To cat to the other side.""",
"""Why did the cat sit on the computer? Because it wanted to keep an eye on the mouse!""",
"""What is red, yellow and green? A traffic light.""",
],
]

results = evaluate(predictions=predictions, data=test_dataset)

print("Global Scores:")
print(results.global_scores.summary)

print("Instance Scores:")
print(results.instance_scores.summary)