Skip to content

Commit

Permalink
Fix error in Evaluate with display_table=True with outputs that cannot be converted to dict (#1682)
Browse files Browse the repository at this point in the history

* fix

Signed-off-by: dbczumar <[email protected]>

* fix

Signed-off-by: dbczumar <[email protected]>

* fix

Signed-off-by: dbczumar <[email protected]>

* fix

Signed-off-by: dbczumar <[email protected]>

* fix

Signed-off-by: dbczumar <[email protected]>

* fix

Signed-off-by: dbczumar <[email protected]>

---------

Signed-off-by: dbczumar <[email protected]>
  • Loading branch information
dbczumar authored Oct 24, 2024
1 parent 56dec59 commit a68f2d9
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 7 deletions.
14 changes: 13 additions & 1 deletion dspy/evaluate/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,20 @@ def wrapped_program(example_idx, example):
if return_outputs: # Handle the return_outputs logic
results = [(example, prediction, score) for _, example, prediction, score in predicted_devset]

def prediction_is_dictlike(prediction):
    """Return True if *prediction* can be converted to a dict, else False.

    The dict() constructor accepts mappings and iterables of key/value
    pairs; anything that makes it raise is treated as "not dict-like"
    (e.g. a bare list of strings returned by a program), which is exactly
    the case this helper guards against when building the display table.
    """
    try:
        dict(prediction)
    except Exception:
        # Any conversion failure (TypeError, ValueError, ...) means the
        # prediction cannot be merged as a mapping.
        return False
    return True

data = [
merge_dicts(example, prediction) | {"correct": score} for _, example, prediction, score in predicted_devset
(
merge_dicts(example, prediction) | {"correct": score}
if prediction_is_dictlike(prediction)
else dict(example) | {"prediction": prediction, "correct": score}
)
for _, example, prediction, score in predicted_devset
]

result_df = pd.DataFrame(data)
Expand Down
38 changes: 32 additions & 6 deletions tests/evaluate/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import dspy
from dspy.evaluate.evaluate import Evaluate
from dspy.evaluate.metrics import answer_exact_match
from dspy.functional import TypedPredictor
from dspy.predict import Predict
from dspy.utils.dummies import DummyLM

Expand Down Expand Up @@ -120,14 +121,38 @@ def test_evaluate_call_bad():
assert score == 0.0


@pytest.mark.parametrize(
"program_with_example",
[
(Predict("question -> answer"), new_example("What is 1+1?", "2")),
(
# Create a program that extracts entities from text and returns them as a list,
# rather than returning a Predictor() wrapper. This is done intentionally to test
# the case where the program does not output a dictionary-like object because
# Evaluate() has failed for this case in the past
lambda text: TypedPredictor("text: str -> entities: List[str]")(text=text).entities,
dspy.Example(text="United States", entities=["United States"]).with_inputs("text"),
),
],
)
@pytest.mark.parametrize("display_table", [True, False, 1])
@pytest.mark.parametrize("is_in_ipython_notebook_environment", [True, False])
def test_evaluate_display_table(display_table, is_in_ipython_notebook_environment, capfd):
devset = [new_example("What is 1+1?", "2")]
program = Predict("question -> answer")
def test_evaluate_display_table(program_with_example, display_table, is_in_ipython_notebook_environment, capfd):
program, example = program_with_example
example_input = next(iter(example.inputs().values()))
example_output = {key: value for key, value in example.toDict().items() if key not in example.inputs()}

dspy.settings.configure(
lm=DummyLM(
{
example_input: example_output,
}
)
)

ev = Evaluate(
devset=devset,
metric=answer_exact_match,
devset=[example],
metric=lambda example, pred, **kwargs: example == pred,
display_table=display_table,
)
assert ev.display_table == display_table
Expand All @@ -140,4 +165,5 @@ def test_evaluate_display_table(display_table, is_in_ipython_notebook_environmen
if not is_in_ipython_notebook_environment and display_table:
# In console environments where IPython is not available, the table should be printed
# to the console
assert "What is 1+1?" in out
example_input = next(iter(example.inputs().values()))
assert example_input in out

0 comments on commit a68f2d9

Please sign in to comment.