Add automatic evaluation with LLM-as-a-Judge, LangSmith export, and SGI evaluation #174

Merged
merged 61 commits into from
Nov 12, 2023
Changes from all commits
Commits (61)
3e336dc
add endpoint
FelixTJDietrich Nov 5, 2023
2fb28f0
add evaluation_provider
FelixTJDietrich Nov 5, 2023
38f6055
add new line
FelixTJDietrich Nov 5, 2023
92ca4ed
add evaluation_provider to export
FelixTJDietrich Nov 5, 2023
433cd7f
add example evaluation endpoint
FelixTJDietrich Nov 5, 2023
9b4e2c9
add playground ui
FelixTJDietrich Nov 5, 2023
5f39b8c
add automatic evaluation
FelixTJDietrich Nov 6, 2023
2667cab
add automatic evaluation
FelixTJDietrich Nov 6, 2023
7dbb316
add UI changes
FelixTJDietrich Nov 7, 2023
5053a4a
fix color
FelixTJDietrich Nov 7, 2023
e9bcb26
add evaluation model
FelixTJDietrich Nov 7, 2023
cabed57
add llm as a judge
FelixTJDietrich Nov 7, 2023
2595d5c
fix ui issue and some var naming
FelixTJDietrich Nov 7, 2023
2739757
fix line break
FelixTJDietrich Nov 7, 2023
6b383e3
add langsmith logging
FelixTJDietrich Nov 7, 2023
e46df2c
inline statistics
FelixTJDietrich Nov 7, 2023
8d50922
add sgi evaluation
FelixTJDietrich Nov 7, 2023
a753b8a
refactor
FelixTJDietrich Nov 7, 2023
fa0bde5
remove unused
FelixTJDietrich Nov 7, 2023
daff54a
Merge branch 'develop' of https://github.com/ls1intum/Athena into fea…
FelixTJDietrich Nov 7, 2023
a4d7d8c
update ini
FelixTJDietrich Nov 7, 2023
fd4fdab
only use selected modules
FelixTJDietrich Nov 7, 2023
f2c1736
remove skip
FelixTJDietrich Nov 7, 2023
33f2a2b
add endpoint
FelixTJDietrich Nov 5, 2023
368ebcc
add evaluation_provider
FelixTJDietrich Nov 5, 2023
5c68106
add new line
FelixTJDietrich Nov 5, 2023
7afd655
add evaluation_provider to export
FelixTJDietrich Nov 5, 2023
1926144
add example evaluation endpoint
FelixTJDietrich Nov 5, 2023
db5e518
add playground ui
FelixTJDietrich Nov 5, 2023
2da4391
add automatic evaluation
FelixTJDietrich Nov 6, 2023
a8589dc
add automatic evaluation
FelixTJDietrich Nov 6, 2023
39c729d
add UI changes
FelixTJDietrich Nov 7, 2023
fdb073d
fix color
FelixTJDietrich Nov 7, 2023
d0838f5
add evaluation model
FelixTJDietrich Nov 7, 2023
05608aa
add llm as a judge
FelixTJDietrich Nov 7, 2023
c68ba0f
fix ui issue and some var naming
FelixTJDietrich Nov 7, 2023
afb1892
fix line break
FelixTJDietrich Nov 7, 2023
2a1d4b6
add langsmith logging
FelixTJDietrich Nov 7, 2023
e845e1f
inline statistics
FelixTJDietrich Nov 7, 2023
d17e486
add sgi evaluation
FelixTJDietrich Nov 7, 2023
5266461
refactor
FelixTJDietrich Nov 7, 2023
9f7494a
remove unused
FelixTJDietrich Nov 7, 2023
d44178b
update ini
FelixTJDietrich Nov 7, 2023
42f8210
only use selected modules
FelixTJDietrich Nov 7, 2023
cc5693d
remove skip
FelixTJDietrich Nov 7, 2023
9462ec5
add retries
FelixTJDietrich Nov 7, 2023
c6609af
Merge branch 'feature/automatic-evaluation' of https://github.com/ls1…
FelixTJDietrich Nov 7, 2023
292d588
enable example module evaluation support for now
FelixTJDietrich Nov 8, 2023
fea2d85
Merge branch 'develop' into feature/automatic-evaluation
pal03377 Nov 8, 2023
5548293
fix filter
FelixTJDietrich Nov 8, 2023
22941ee
Merge branch 'feature/automatic-evaluation' of https://github.com/ls1…
FelixTJDietrich Nov 9, 2023
8de0ee6
implement feedbacl
FelixTJDietrich Nov 9, 2023
0edea59
update retry
FelixTJDietrich Nov 9, 2023
b4b529e
validate grading instruction id
FelixTJDietrich Nov 9, 2023
331f353
add additional check
FelixTJDietrich Nov 9, 2023
4eac295
fix index
FelixTJDietrich Nov 9, 2023
0fe6e6d
add docs
FelixTJDietrich Nov 11, 2023
c6e08df
Merge branch 'develop' of https://github.com/ls1intum/Athena into fea…
FelixTJDietrich Nov 11, 2023
c384232
Merge branch 'develop' into feature/automatic-evaluation
FelixTJDietrich Nov 11, 2023
2b7f212
fix text module
FelixTJDietrich Nov 12, 2023
fe10281
Merge branch 'feature/automatic-evaluation' of https://github.com/ls1…
FelixTJDietrich Nov 12, 2023
@@ -27,6 +27,7 @@ class HealthResponse(BaseModel):
"""
Response indicating whether the Assessment Module Manager is healthy,
and whether all the modules are healthy (i.e. reachable).
Additional information about the modules is also provided.
"""
status: str = Field(const=True, default="ok", example="ok")
modules: dict = Field(
@@ -35,7 +36,8 @@ class HealthResponse(BaseModel):
"module_example": {
"url": "http://localhost:5001",
"type": "programming",
"healthy": True
"healthy": True,
"supportsEvaluation": True
}
}
]
@@ -56,6 +58,7 @@ async def get_health() -> HealthResponse:
"url": module.url,
"type": module.type,
"healthy": await is_healthy(module),
"supportsEvaluation": module.supports_evaluation
}
for module in get_modules()
}
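A hypothetical client-side check against the new field (the manager's base URL is an assumption for illustration; the response shape follows the HealthResponse example above):

import requests

# Query the Assessment Module Manager's /health endpoint (base URL assumed for illustration)
health = requests.get("http://localhost:5100/health", timeout=5).json()

# Keep only modules that are reachable and expose the new /evaluation endpoint
evaluation_ready = [
    name for name, info in health.get("modules", {}).items()
    if info.get("healthy") and info.get("supportsEvaluation")
]
print("Modules with evaluation support:", evaluation_ready)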
@@ -20,6 +20,7 @@ def list_modules() -> List[Module]:
name=module,
url=cast(AnyHttpUrl, os.environ.get(f"{module.upper()}_URL", modules_config[module]["url"])),
type=ExerciseType(modules_config[module]["type"]),
supports_evaluation=modules_config[module].getboolean("supports_evaluation"),
)
for module in modules_config.sections()
]
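For reference, supports_evaluation is parsed by configparser's getboolean, which accepts 1/yes/true/on and 0/no/false/off; a standalone sketch with values taken from modules.ini:

import configparser

# Standalone illustration of how the supports_evaluation flag is read from modules.ini
config = configparser.ConfigParser()
config.read_string("""
[module_example]
url = http://localhost:5001
type = programming
supports_evaluation = true
""")

print(config["module_example"].getboolean("supports_evaluation"))  # True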
@@ -8,3 +8,4 @@ class Module(BaseModel):
name: str = Field(example="module_example")
url: AnyHttpUrl = Field(example="http://localhost:5001")
type: ExerciseType = Field(example=ExerciseType.text)
supports_evaluation: bool = Field(description="Whether the module supports evaluation", example=True)
7 changes: 6 additions & 1 deletion assessment_module_manager/modules.docker.ini
@@ -1,19 +1,24 @@
[module_example]
url = http://module-example:5001
type = programming
supports_evaluation = true

[module_programming_llm]
url = http://module-programming-llm:5002
type = programming
supports_evaluation = false

[module_text_llm]
url = http://module-text-llm:5003
type = text
supports_evaluation = true

[module_text_cofee]
url = http://module-text-cofee:5004
type = text
supports_evaluation = false

[module_programming_themisml]
url = http://module-programming-themisml:5005
type = programming
type = programming
supports_evaluation = false
5 changes: 5 additions & 0 deletions assessment_module_manager/modules.ini
@@ -1,19 +1,24 @@
[module_example]
url = http://localhost:5001
type = programming
supports_evaluation = true

[module_programming_llm]
url = http://localhost:5002
type = programming
supports_evaluation = false

[module_text_llm]
url = http://localhost:5003
type = text
supports_evaluation = true

[module_text_cofee]
url = http://localhost:5004
type = text
supports_evaluation = false

[module_programming_themisml]
url = http://localhost:5005
type = programming
supports_evaluation = false
3 changes: 2 additions & 1 deletion athena/athena/__init__.py
@@ -6,7 +6,7 @@
from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction
from .metadata import emit_meta, get_meta
from .experiment import get_experiment_environment
from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider # type: ignore
from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider, evaluation_provider # type: ignore


@app.get("/")
@@ -28,6 +28,7 @@ def run_module():
"feedback_consumer",
"feedback_provider",
"config_schema_provider",
"evaluation_provider",
"emit_meta",
"get_meta",
"get_experiment_environment",
61 changes: 60 additions & 1 deletion athena/athena/endpoints.py
@@ -358,4 +358,63 @@ def config_schema_provider(cls: Type[C]) -> Type[C]:
async def wrapper():
return cls.schema()

return cls
return cls


def evaluation_provider(func: Union[
Callable[[E, S, List[F], List[F]], Any],
Callable[[E, S, List[F], List[F]], Coroutine[Any, Any, Any]]
]):
"""
Provide evaluated feedback to the Assessment Module Manager.

Note: The evaluation provider is usually called during the research and development phase (by the Playground).
Return arbitrary evaluation results.

This decorator can be used with several types of functions: synchronous or asynchronous.

Examples:
Below are some examples of possible functions that you can decorate with this decorator:

Without using module config (both synchronous and asynchronous forms):
>>> @evaluation_provider
... def sync_evaluate_feedback(
... exercise: Exercise, submission: Submission,
... true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]
... ) -> Any:
... # evaluate predicted feedback here and return evaluation results

>>> @evaluation_provider
... async def async_evaluate_feedback(
... exercise: Exercise, submission: Submission,
... true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]
... ) -> Any:
... # evaluate predicted feedback here and return evaluation results
"""
exercise_type = inspect.signature(func).parameters["exercise"].annotation
submission_type = inspect.signature(func).parameters["submission"].annotation
feedback_type = inspect.signature(func).parameters["predicted_feedbacks"].annotation.__args__[0]

@app.post("/evaluation", responses=module_responses)
@authenticated
@with_meta
async def wrapper(
exercise: exercise_type,
submission: submission_type,
true_feedbacks: List[feedback_type],
predicted_feedbacks: List[feedback_type],
):
# Retrieve existing metadata for the exercise, submission and feedback
exercise.meta.update(get_stored_exercise_meta(exercise) or {})
submission.meta.update(get_stored_submission_meta(submission) or {})
for feedback in true_feedbacks + predicted_feedbacks:
feedback.meta.update(get_stored_feedback_meta(feedback) or {})

# Call the actual provider
if inspect.iscoroutinefunction(func):
evaluation = await func(exercise, submission, true_feedbacks, predicted_feedbacks)
else:
evaluation = func(exercise, submission, true_feedbacks, predicted_feedbacks)

return evaluation
return wrapper
33 changes: 32 additions & 1 deletion docs/module/structure.rst
@@ -94,7 +94,7 @@ Example:
)
]

Provide Config Schema
Provide Config Schema (Optional)
~~~~~~~~~~~~~~~~~~~~~~
Provide the module's config options as a JSON schema. A config complying with the schema can then be sent in the ``X-Module-Config`` request header to override the default values. The module can decorate one pydantic model with ``@config_schema_provider`` to provide the schema; the model should define default values for all fields, which serve as the default configuration. The configuration class can also be appended to the function signature of any other decorator to provide the configuration to that function; a sketch follows the example below.

@@ -108,6 +108,37 @@ Example:
debug: bool = Field(False, description="Whether the module is in debug mode.")
...
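A minimal sketch of consuming such a configuration from another endpoint (assuming the decorated model is named ``Configuration``; the ``module_config`` parameter name follows the text module):

.. code-block:: python

    from typing import List
    from athena import feedback_provider
    from athena.text import Exercise, Submission, Feedback

    @feedback_provider
    async def suggest_feedback(exercise: Exercise, submission: Submission, module_config: Configuration) -> List[Feedback]:
        # module_config is an instance of the decorated Configuration model,
        # populated from the X-Module-Config header or from the field defaults
        if module_config.debug:
            ...  # e.g. emit extra metadata while debugging
        return []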

Provide Evaluation (Optional)
~~~~~~~~~~~~~~~~~~
Get an arbitrary evaluation for a submission, given the historical ``true_feedbacks`` and the module's suggested ``predicted_feedbacks``. The Playground usually calls this endpoint when conducting an evaluation during an experiment. The module receives the request at the function annotated with ``@evaluation_provider``.

If you want to have the ``/evaluation`` endpoint available during the Playground evaluation mode, you need to set ``supports_evaluation = true`` in the ``modules.ini`` and ``modules.docker.ini`` files.

Example:
.. code-block:: python

from athena import *

@evaluation_provider
def evaluate_feedback(exercise: Exercise, submission: Submission, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]) -> Any:
# Do something with the true and predicted feedback and return the evaluation result
...
# Example: Generate some example evaluation result
evaluation_results = []
true_feedback_embeddings = [random.random() for _ in true_feedbacks]
predicted_feedback_embeddings = [random.random() for _ in predicted_feedbacks]
for feedback, embedding in zip(predicted_feedbacks, predicted_feedback_embeddings):
feedback_evaluation = {
"feedback_id": feedback.id,
"embedding": embedding,
"has_match": len([t for t in true_feedback_embeddings if abs(t - embedding) < 0.1]) > 0,
"correctness": random.random()
}
evaluation_results.append(feedback_evaluation)
...
# Return arbitrary evaluation results
return evaluation_results

Environment Variables
---------------------
You should provide at least the following environment variables for your module to work properly:
30 changes: 28 additions & 2 deletions module_example/module_example/__main__.py
@@ -1,10 +1,11 @@
"""
Entry point for the module_example module.
"""
from typing import List
import random
from typing import List, Any
from pydantic import BaseModel, Field

from athena import app, config_schema_provider, submissions_consumer, submission_selector, feedback_consumer, feedback_provider, emit_meta
from athena import app, config_schema_provider, submissions_consumer, submission_selector, feedback_consumer, feedback_provider, evaluation_provider, emit_meta
from athena.programming import Exercise, Submission, Feedback
from athena.logger import logger
from athena.storage import store_exercise, store_submissions, store_feedback
@@ -139,5 +140,30 @@ def suggest_feedback(exercise: Exercise, submission: Submission, module_config:
]


# Only if it makes sense for a module (Optional)
@evaluation_provider
def evaluate_feedback(exercise: Exercise, submission: Submission, true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback]) -> Any:
logger.info(
"evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks",
submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks)
)

# Do something with the true and predicted feedback and return the evaluation result
# Generate some example evaluation result
evaluation_results = []
true_feedback_embeddings = [random.random() for _ in true_feedbacks]
predicted_feedback_embeddings = [random.random() for _ in predicted_feedbacks]
for feedback, embedding in zip(predicted_feedbacks, predicted_feedback_embeddings):
feedback_evaluation = {
"feedback_id": feedback.id,
"embedding": embedding,
"has_match": len([t for t in true_feedback_embeddings if abs(t - embedding) < 0.1]) > 0,
"correctness": random.random()
}
evaluation_results.append(feedback_evaluation)

return evaluation_results


if __name__ == "__main__":
app.start()
@@ -227,12 +227,19 @@ async def generate_suggestions_by_file(exercise: Exercise, submission: Submissio
]
)

grading_instruction_ids = set(
grading_instruction.id
for criterion in exercise.grading_criteria or []
for grading_instruction in criterion.structured_grading_instructions
)

feedbacks: List[Feedback] = []
for prompt_input, result in zip(prompt_inputs, results):
file_path = prompt_input["file_path"]
if result is None:
continue
for feedback in result.feedbacks:
grading_instruction_id = feedback.grading_instruction_id if feedback.grading_instruction_id in grading_instruction_ids else None
feedbacks.append(Feedback(
exercise_id=exercise.id,
submission_id=submission.id,
@@ -242,7 +249,7 @@ async def generate_suggestions_by_file(exercise: Exercise, submission: Submissio
line_start=feedback.line_start,
line_end=feedback.line_end,
credits=feedback.credits,
structured_grading_instruction_id=feedback.grading_instruction_id,
structured_grading_instruction_id=grading_instruction_id,
meta={}
))
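The new guard reduces to a set-membership test against the exercise's known structured grading instruction (SGI) ids; isolated as a sketch (function name hypothetical):

from typing import Optional, Set

def sanitize_grading_instruction_id(candidate: Optional[int], known_ids: Set[int]) -> Optional[int]:
    # Keep the id only if it refers to an existing structured grading instruction;
    # otherwise drop it so the suggestion does not reference a non-existent SGI.
    return candidate if candidate in known_ids else None

assert sanitize_grading_instruction_id(3, {1, 2, 3}) == 3
assert sanitize_grading_instruction_id(42, {1, 2, 3}) is None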

6 changes: 6 additions & 0 deletions module_text_llm/.env.example
@@ -14,6 +14,12 @@ DATABASE_URL=sqlite:///../data/data.sqlite
# See below for options, available models are also logged on startup
LLM_DEFAULT_MODEL="azure_openai_gpt-35"

# Enable the LLM-as-a-judge approach: 0 = disabled, 1 = enabled
LLM_ENABLE_LLM_AS_A_JUDGE=1
# Evaluation model to use for the LLM-as-a-judge approach [only relevant if you use the /evaluation endpoint]
# See below for options, available models are also logged on startup
LLM_EVALUATION_MODEL="azure_openai_gpt-4"

# Standard OpenAI (Non-Azure) [leave blank if not used]
# Model names prefixed with `openai_` followed by the model name, e.g. `openai_text-davinci-003`
# A list of models can be found in `module_text_llm/helpers/models/openai.py` (openai_models)
34 changes: 32 additions & 2 deletions module_text_llm/module_text_llm/__main__.py
@@ -1,14 +1,18 @@
from typing import List
import json
import os
from typing import List, Any

import nltk
import tiktoken

from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider
from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger

from module_text_llm.config import Configuration
from module_text_llm.evaluation import get_feedback_statistics, get_llm_statistics
from module_text_llm.generate_suggestions import generate_suggestions
from module_text_llm.generate_evaluation import generate_evaluation


@submissions_consumer
@@ -33,6 +37,32 @@ async def suggest_feedback(exercise: Exercise, submission: Submission, module_co
return await generate_suggestions(exercise, submission, module_config.approach, module_config.debug)


@evaluation_provider
async def evaluate_feedback(
exercise: Exercise, submission: Submission,
true_feedbacks: List[Feedback], predicted_feedbacks: List[Feedback],
) -> Any:
logger.info(
"evaluate_feedback: Evaluation for submission %d of exercise %d was requested with %d true and %d predicted feedbacks",
submission.id, exercise.id, len(true_feedbacks), len(predicted_feedbacks)
)

evaluation = {}

# 1. LLM as a judge
if len(predicted_feedbacks) > 0 and bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")):
evaluation["llm_as_a_judge"] = await generate_evaluation(exercise, submission, true_feedbacks, predicted_feedbacks)

# 2. LangSmith runs, token usage, and response times
if bool(os.environ.get("LANGCHAIN_TRACING_V2")):
evaluation["llm_statistics"] = get_llm_statistics(submission)

# 3. Feedback statistics
evaluation["feedback_statistics"] = get_feedback_statistics(exercise, true_feedbacks, predicted_feedbacks)

return evaluation

if __name__ == "__main__":
nltk.download("punkt")
tiktoken.get_encoding("cl100k_base")
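One caveat in the module above: bool(os.environ.get("LLM_ENABLE_LLM_AS_A_JUDGE")) is truthy for any non-empty value, including the "0 = disabled" setting documented in .env.example. A stricter parse could look like this sketch (helper name hypothetical):

import os

def env_flag(name: str, default: bool = False) -> bool:
    # Treat "1"/"true"/"yes"/"on" (case-insensitive) as enabled; anything else, including "0", as disabled
    value = os.environ.get(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")

# env_flag("LLM_ENABLE_LLM_AS_A_JUDGE") is False for "0", whereas bool("0") is True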