diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index 64f8f8cc4..4ca43ad39 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -226,8 +226,20 @@ def wrapped_program(example_idx, example):
         if return_outputs:  # Handle the return_outputs logic
             results = [(example, prediction, score) for _, example, prediction, score in predicted_devset]
 
+        def prediction_is_dictlike(prediction):
+            try:
+                dict(prediction)
+                return True
+            except Exception:
+                return False
+
         data = [
-            merge_dicts(example, prediction) | {"correct": score} for _, example, prediction, score in predicted_devset
+            (
+                merge_dicts(example, prediction) | {"correct": score}
+                if prediction_is_dictlike(prediction)
+                else dict(example) | {"prediction": prediction, "correct": score}
+            )
+            for _, example, prediction, score in predicted_devset
         ]
 
         result_df = pd.DataFrame(data)
diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py
index a8552c7aa..7615c70b3 100644
--- a/tests/evaluate/test_evaluate.py
+++ b/tests/evaluate/test_evaluate.py
@@ -8,6 +8,7 @@ import dspy
 from dspy.evaluate.evaluate import Evaluate
 from dspy.evaluate.metrics import answer_exact_match
+from dspy.functional import TypedPredictor
 from dspy.predict import Predict
 from dspy.utils.dummies import DummyLM
 
 
@@ -120,14 +121,38 @@ def test_evaluate_call_bad():
     assert score == 0.0
 
 
+@pytest.mark.parametrize(
+    "program_with_example",
+    [
+        (Predict("question -> answer"), new_example("What is 1+1?", "2")),
+        (
+            # Create a program that extracts entities from text and returns them as a list,
+            # rather than returning a Predictor() wrapper. This is done intentionally to test
+            # the case where the program does not output a dictionary-like object because
+            # Evaluate() has failed for this case in the past
+            lambda text: TypedPredictor("text: str -> entities: List[str]")(text=text).entities,
+            dspy.Example(text="United States", entities=["United States"]).with_inputs("text"),
+        ),
+    ],
+)
 @pytest.mark.parametrize("display_table", [True, False, 1])
 @pytest.mark.parametrize("is_in_ipython_notebook_environment", [True, False])
-def test_evaluate_display_table(display_table, is_in_ipython_notebook_environment, capfd):
-    devset = [new_example("What is 1+1?", "2")]
-    program = Predict("question -> answer")
+def test_evaluate_display_table(program_with_example, display_table, is_in_ipython_notebook_environment, capfd):
+    program, example = program_with_example
+    example_input = next(iter(example.inputs().values()))
+    example_output = {key: value for key, value in example.toDict().items() if key not in example.inputs()}
+
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                example_input: example_output,
+            }
+        )
+    )
+
     ev = Evaluate(
-        devset=devset,
-        metric=answer_exact_match,
+        devset=[example],
+        metric=lambda example, pred, **kwargs: example == pred,
         display_table=display_table,
     )
     assert ev.display_table == display_table
@@ -140,4 +165,5 @@ def test_evaluate_display_table(display_table, is_in_ipython_notebook_environmen
     if not is_in_ipython_notebook_environment and display_table:
         # In console environments where IPython is not available, the table should be printed
         # to the console
-        assert "What is 1+1?" in out
+        example_input = next(iter(example.inputs().values()))
+        assert example_input in out