
Merge remote-tracking branch 'origin/main' into demos_experimental
Signed-off-by: elronbandel <[email protected]>
elronbandel committed Dec 24, 2024
2 parents 3eb8eed + 3d0e23c commit 140fddd
Showing 81 changed files with 3,189 additions and 472 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/docs.yml
@@ -9,7 +9,7 @@ on:
concurrency:
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
cancel-in-progress: true

jobs:
docs:

@@ -23,10 +23,10 @@ jobs:

- uses: actions/setup-python@v5
with:
python-version: '3.9'
python-version: '3.8'

- run: curl -LsSf https://astral.sh/uv/install.sh | sh
- run: uv pip install --system ".[tests,docs]"
- run: uv pip install --system ".[docs]"

- name: Compile Docs
run: make docs
5 changes: 0 additions & 5 deletions .pre-commit-config.yaml
@@ -10,11 +10,6 @@ repos:
args: [--fix]
exclude: src/unitxt/metrics.py|examples/evaluate_existing_dataset_no_install.py
# Run the linter on the specific file with the ignore flag
- id: ruff
name: ruff (src/unitxt/metrics.py)
files: src/unitxt/metrics.py
args: [--fix, --ignore, C901]
# Run the linter on the specific file with the ignore flag
- id: ruff
name: ruff (examples/evaluate_existing_dataset_no_install.py)
files: examples/evaluate_existing_dataset_no_install.py
118 changes: 118 additions & 0 deletions examples/evaluate_existing_dataset_by_llm_as_judge_direct.py
@@ -0,0 +1,118 @@
import statistics

from unitxt import get_logger, get_settings, load_dataset
from unitxt.api import evaluate
from unitxt.inference import (
CrossProviderInferenceEngine,
)
from unitxt.text_utils import print_dict

logger = get_logger()
settings = get_settings()

# Use the unitxt load_dataset API to load the SQuAD QA dataset with the standard template from the catalog.
# We set loader_limit to 10 to reduce download time.
criterias = ["answer_relevance", "coherence", "conciseness"]
metrics = [
"metrics.llm_as_judge.direct.rits.llama3_1_70b"
"[context_fields=[context,question],"
f"criteria=metrics.llm_as_judge.direct.criterias.{criteria},"
f"score_prefix={criteria}_]"
for criteria in criterias
]
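# For example, with the criteria "answer_relevance" the adjacent string literals above
# concatenate into a single catalog reference:
# "metrics.llm_as_judge.direct.rits.llama3_1_70b[context_fields=[context,question],criteria=metrics.llm_as_judge.direct.criterias.answer_relevance,score_prefix=answer_relevance_]"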
dataset = load_dataset(
card="cards.squad",
metrics=metrics,
loader_limit=10,
max_test_instances=10,
split="test",
)

# Set up an inference engine to generate predictions.
inference_model = CrossProviderInferenceEngine(
model="llama-3-2-1b-instruct", provider="watsonx"
)

"""
We use a CrossProviderInferenceEngine, an inference engine that provides API access to providers such as
watsonx, bam, openai, azure, aws and more.
For the arguments these inference engines can receive, please refer to the classes' documentation or read
about the OpenAI API arguments that the CrossProviderInferenceEngine follows.
"""
predictions = inference_model.infer(dataset)

gold_answers = [d[0] for d in dataset["references"]]

# Evaluate the predictions using the defined metric.
evaluated_predictions = evaluate(predictions=predictions, data=dataset)
evaluated_gold_answers = evaluate(predictions=gold_answers, data=dataset)

print_dict(
evaluated_predictions[0],
keys_to_print=[
"source",
"score",
],
)
print_dict(
evaluated_gold_answers[0],
keys_to_print=[
"source",
"score",
],
)

for criteria in criterias:
logger.info(f"Scores for criteria '{criteria}'")
gold_answer_scores = [
instance["score"]["instance"][f"{criteria}_llm_as_a_judge_score"]
for instance in evaluated_gold_answers
]
gold_answer_position_bias = [
int(instance["score"]["instance"][f"{criteria}_positional_bias"])
for instance in evaluated_gold_answers
]
prediction_scores = [
instance["score"]["instance"][f"{criteria}_llm_as_a_judge_score"]
for instance in evaluated_predictions
]
prediction_position_bias = [
int(instance["score"]["instance"][f"{criteria}_positional_bias"])
for instance in evaluated_predictions
]

logger.info(
f"Scores of gold answers: {statistics.mean(gold_answer_scores)} +/- {statistics.stdev(gold_answer_scores)}"
)
logger.info(
f"Scores of predicted answers: {statistics.mean(prediction_scores)} +/- {statistics.stdev(prediction_scores)}"
)
logger.info(
f"Positional bias occurrence on gold answers: {statistics.mean(gold_answer_position_bias)}"
)
logger.info(
f"Positional bias occurrence on predicted answers: {statistics.mean(prediction_position_bias)}\n"
)

"""
Output with 100 examples
Scores for criteria 'answer_relevance'
Scores of gold answers: 0.9625 +/- 0.14811526360619054
Scores of predicted answers: 0.5125 +/- 0.4638102516061385
Positional bias occurrence on gold answers: 0.03
Positional bias occurrence on predicted answers: 0.12
Scores for criteria 'coherence'
Scores of gold answers: 0.159 +/- 0.15689216524464028
Scores of predicted answers: 0.066 +/- 0.11121005695384194
Positional bias occurrence on gold answers: 0.16
Positional bias occurrence on predicted answers: 0.07
Scores for criteria 'conciseness'
Scores of gold answers: 1.0 +/- 0.0
Scores of predicted answers: 0.34 +/- 0.47609522856952335
Positional bias occurrence on gold answers: 0.03
Positional bias occurrence on predicted answers: 0.01
"""
49 changes: 49 additions & 0 deletions examples/evaluate_llm_as_judge_direct_criteria_from_dataset.py
@@ -0,0 +1,49 @@
from typing import Any

from unitxt import evaluate, load_dataset
from unitxt.blocks import Task, TaskCard
from unitxt.llm_as_judge_operators import CreateYesNoCriteriaFromString
from unitxt.loaders import LoadFromDictionary

data = {
"test": [
{
"question": "How is the weather?",
"judgement": "In the response, if there is a numerical temperature present, is it denominated in both Fahrenheit and Celsius?",
},
{
"question": "Tell me a joke about cats",
"judgement": "Is the response funny?",
},
]
}

card = TaskCard(
loader=LoadFromDictionary(data=data, data_classification_policy=["public"]),
preprocess_steps=[
CreateYesNoCriteriaFromString(field="judgement", to_field="criteria"),
],
task=Task(
input_fields={"question": str},
reference_fields={"criteria": Any},
prediction_type=str,
metrics=[
"metrics.llm_as_judge.direct.watsonx.llama3_1_70b[context_fields=question,criteria_field=criteria]"
],
),
)
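# CreateYesNoCriteriaFromString turns each row's free-text "judgement" into a yes/no
# criteria object stored in the "criteria" field, which the judge metric then reads
# through criteria_field=criteria.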

dataset = load_dataset(card=card, template="templates.empty", split="test")

predictions = [
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit (around 31-34°C). The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""Why did the cat cross the road? To cat to the other side.""",
]

results = evaluate(predictions=predictions, data=dataset)

print("Global Scores:")
print(results.global_scores.summary)

print("Instance Scores:")
print(results.instance_scores.summary)
33 changes: 33 additions & 0 deletions examples/evaluate_llm_as_judge_direct_predefined_criteria.py
@@ -0,0 +1,33 @@
from unitxt import get_logger
from unitxt.api import create_dataset, evaluate

logger = get_logger()

data = [
{"question": "How is the weather?"},
{"question": "How is the weather?"},
{"question": "How is the weather?"},
]

criteria = "metrics.llm_as_judge.direct.criterias.temperature_in_celsius_and_fahrenheit"
metrics = [
f"metrics.llm_as_judge.direct.rits.llama3_1_70b[criteria={criteria}, context_fields=[question]]"
]

dataset = create_dataset(
task="tasks.qa.open", test_set=data, metrics=metrics, split="test"
)
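# create_dataset wraps the in-memory examples as the test split of tasks.qa.open and
# attaches the LLM-as-judge metric defined above.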

predictions = [
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit (around 31-34°C). The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
]

results = evaluate(predictions=predictions, data=dataset)

print("Global Scores:")
print(results.global_scores.summary)

print("Instance Scores:")
print(results.instance_scores.summary)
62 changes: 62 additions & 0 deletions examples/evaluate_llm_as_judge_direct_user_criteria_no_catalog.py
@@ -0,0 +1,62 @@
from unitxt.api import create_dataset, evaluate
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge import LLMJudgeDirect
from unitxt.llm_as_judge_constants import (
CriteriaWithOptions,
)

criteria = CriteriaWithOptions.from_obj(
{
"name": "Temperature in Fahrenheit and Celsius",
"description": "In the response, if there is a numerical temperature present, is it denominated in both Fahrenheit and Celsius?",
"options": [
{
"name": "Yes",
"description": "The temperature reading is provided in both Fahrenheit and Celsius.",
},
{
"name": "No",
"description": "The temperature reading is provided either in Fahrenheit or Celsius, but not both.",
},
{
"name": "Pass",
"description": "There is no numerical temperature reading in the response.",
},
],
"option_map": {"Yes": 1.0, "No": 0.5, "Pass": 0.0},
}
)
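# The option_map converts the option selected by the judge into the numeric score
# reported per instance: "Yes" -> 1.0, "No" -> 0.5, "Pass" -> 0.0.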


data = [
{"question": "How is the weather?"},
{"question": "How is the weather?"},
{"question": "How is the weather?"},
]

metric = LLMJudgeDirect(
inference_engine=CrossProviderInferenceEngine(
model="llama-3-1-70b-instruct", max_tokens=1024
),
criteria=criteria,
context_fields=["question"],
criteria_field="criteria",
)

dataset = create_dataset(
task="tasks.qa.open", test_set=data, metrics=[metric], split="test"
)

predictions = [
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit (around 31-34°C). The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
]

results = evaluate(predictions=predictions, data=dataset)

print("Global Scores:")
print(results.global_scores.summary)

print("Instance Scores:")
print(results.instance_scores.summary)
File renamed without changes.
61 changes: 61 additions & 0 deletions examples/evaluate_llm_as_judge_pairwise_criteria_from_dataset.py
@@ -0,0 +1,61 @@
from typing import Any, List

from unitxt import evaluate, load_dataset
from unitxt.blocks import Task, TaskCard
from unitxt.llm_as_judge_operators import (
CreateCriteriaFromString,
)
from unitxt.loaders import LoadFromDictionary
from unitxt.templates import NullTemplate

data = {
"test": [
{
"question": "How is the weather?",
"judgement": "The temperature is described in both Fahrenheit and Celsius.",
},
{
"question": "Tell me a joke about cats",
"judgement": "Is the response funny?",
},
]
}

card = TaskCard(
loader=LoadFromDictionary(data=data, data_classification_policy=["public"]),
preprocess_steps=[
CreateCriteriaFromString(field="judgement", to_field="criteria"),
],
task=Task(
input_fields={"question": str},
reference_fields={"criteria": Any},
prediction_type=List[str],
metrics=[
"metrics.llm_as_judge.pairwise.rits.llama3_1_70b[context_fields=question,criteria_field=criteria]"
],
default_template=NullTemplate(),
),
)

test_dataset = load_dataset(card=card, split="test")
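# Each instance receives a list of candidate responses (prediction_type=List[str]);
# the pairwise judge compares them against one another under that instance's criteria.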

predictions = [
[
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit (around 31-34°C). The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid, with temperatures often soaring into the high 80s and low 90s Fahrenheit. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
"""On most days, the weather is warm and humid. The dense foliage of the jungle acts as a natural air conditioner, keeping the temperature relatively stable and comfortable for the inhabitants.""",
],
[
"""Why did the cat cross the road? To cat to the other side.""",
"""Why did the cat sit on the computer? Because it wanted to keep an eye on the mouse!""",
"""What is red, yellow and green? A traffic light.""",
],
]

results = evaluate(predictions=predictions, data=test_dataset)

print("Global Scores:")
print(results.global_scores.summary)

print("Instance Scores:")
print(results.instance_scores.summary)