Merge pull request #11 from quotient-ai/jamesliounis/autojudge
Add new `AutoJudge` class for custom judges
freddiev4 authored Dec 20, 2024
2 parents 8c5f5b3 + 44c4970 commit ad63e29
Showing 8 changed files with 987 additions and 24 deletions.
93 changes: 93 additions & 0 deletions README.md
@@ -15,6 +15,7 @@
- [Send data to an LLM](#send-data-to-an-llm)
- [Use a `judges` classifier LLM as an evaluator model](#use-a-judges-classifier-llm-as-an-evaluator-model)
- [Use a `Jury` for averaging and diversification](#use-a-jury-for-averaging-and-diversification)
- [Use `AutoJudge` to create a custom LLM judge](#use-autojudge-to-create-a-custom-llm-judge)
5. [Appendix of Judges](#appendix)
- [Classifiers](#classifiers)
- [Graders](#graders)
@@ -159,6 +160,98 @@ verdict = jury.vote(
print(verdict.score)
```

### Use `AutoJudge` to create a custom LLM judge

`autojudge` is an extension of the **judges** library that builds on our [previous work](https://www.quotientai.co/post/subject-matter-expert-language-liaison-smell-a-framework-for-aligning-llm-evaluators-to-human-feedback) on aligning judges to human feedback: given a labeled dataset with feedback, `autojudge` creates a custom, task-specific LLM judge.

Install it using:

```bash
pip install judges[auto]
```

**Step 1 - Prepare your dataset:**
Your dataset can be either a list of dictionaries or a path to a CSV file; each record needs the following fields (a CSV sketch follows the example table below):

- **`input`**: The input provided to your model
- **`output`**: The model's response
- **`label`**: `1` for correct, `0` for incorrect
- **`feedback`**: Feedback explaining why the response is correct or incorrect

Example:

| input | output | label | feedback |
|-----------------------------------|---------------------------------------------------------------------|-------|---------------------------------------|
| What's the best time to visit Paris? | The best time to visit Paris is during the spring or fall. | 1 | Provides accurate and detailed advice. |
| Can I ride a dragon in Scotland? | Yes, dragons are commonly seen in the highlands and can be ridden with proper training. | 0 | Dragons are mythical creatures; the information is fictional. |
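
If your rows live in a file instead of a list, a minimal sketch of the CSV route is shown below. It assumes `from_dataset` accepts the file path in place of the in-memory list, as the description above suggests; the `feedback.csv` name is only illustrative.

```python
import csv

# The header must match the required fields: input, output, label, feedback.
rows = [
    {
        "input": "What's the best time to visit Paris?",
        "output": "The best time to visit Paris is during the spring or fall.",
        "label": 1,
        "feedback": "Provides accurate and detailed advice.",
    },
]

with open("feedback.csv", "w", newline="") as f:  # illustrative path
    writer = csv.DictWriter(f, fieldnames=["input", "output", "label", "feedback"])
    writer.writeheader()
    writer.writerows(rows)

# Later, pass the path instead of the list (assumed to be supported):
# AutoJudge.from_dataset(dataset="feedback.csv", task=..., model=...)
```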

**Step 2 - Initialize your `autojudge`:**
Provide a labeled dataset and describe the evaluation task.

```python
from judges.classifiers.auto import AutoJudge

dataset = [
    {
        "input": "Can I ride a dragon in Scotland?",
        "output": "Yes, dragons are commonly seen in the highlands and can be ridden with proper training.",
        "label": 0,
        "feedback": "Dragons are mythical creatures; the information is fictional.",
    },
    {
        "input": "Can you recommend a good hotel in Tokyo?",
        "output": "Certainly! Hotel Sunroute Plaza Shinjuku is highly rated for its location and amenities. It offers comfortable rooms and excellent service.",
        "label": 1,
        "feedback": "Offers a specific and helpful recommendation.",
    },
    {
        "input": "Can I drink tap water in London?",
        "output": "Yes, tap water in London is safe to drink and meets high quality standards.",
        "label": 1,
        "feedback": "Gives clear and reassuring information.",
    },
    {
        "input": "What's the boiling point of water on the moon?",
        "output": "The boiling point of water on the moon is 100°C, the same as on Earth.",
        "label": 0,
        "feedback": "Boiling point varies with pressure; the moon's vacuum affects it.",
    }
]


# Task description
task = "Evaluate responses for accuracy, clarity, and helpfulness."

# Initialize autojudge
autojudge = AutoJudge.from_dataset(
    dataset=dataset,
    task=task,
    model="gpt-4-turbo-2024-04-09",
    # increase workers for speed ⚡
    # max_workers=2,
    # generated prompts are automatically saved to disk
    # save_to_disk=False,
)
```

**Step 3 - Use your judge to evaluate new input-output pairs:**
You can use `autojudge` to evaluate a single input-output pair using the `.judge()` method.

```python
# Input-output pair to evaluate
input_ = "What are the top attractions in New York City?"
output = "Some top attractions in NYC include the Statue of Liberty and Central Park."

# Get the judgment
judgment = autojudge.judge(input=input_, output=output)

# Print the judgment
print(judgment.reasoning)
# The response accurately lists popular attractions like the Statue of Liberty and Central Park, which are well-known and relevant to the user's query.
print(judgment.score)
# True (correct)
```
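
The same `.judge()` call works in a loop when you have several pairs to score; a short sketch (the pairs below are made up, and simple sequential calls are assumed):

```python
# Hypothetical input-output pairs to evaluate one by one.
pairs = [
    ("What are the top attractions in New York City?",
     "Some top attractions in NYC include the Statue of Liberty and Central Park."),
    ("Is the Eiffel Tower in Berlin?",
     "Yes, the Eiffel Tower is one of Berlin's most famous landmarks."),
]

for input_, output in pairs:
    judgment = autojudge.judge(input=input_, output=output)
    print(judgment.score, "-", judgment.reasoning)
```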

## Appendix

### Classifiers
78 changes: 78 additions & 0 deletions judges/_client.py
@@ -0,0 +1,78 @@
import logging

import instructor
import openai

from pydantic import BaseModel
from tenacity import retry, wait_random_exponential, stop_after_attempt

openai._utils._logs.logger.setLevel(logging.WARNING)
openai._utils._logs.httpx_logger.setLevel(logging.WARNING)


def llm_client():
    try:
        import litellm
    except ImportError:
        # fallback to openai
        client = openai.OpenAI()
        return client
    else:
        return litellm


@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(5),
)
def get_completion(
    messages: list[dict[str, str]],
    model: str,
    temperature: float,
    max_tokens: int,
    seed: int,
    response_format: dict = None,
    response_model: BaseModel = None,
):
    client = llm_client()

    if response_format and response_model:
        raise Exception("response_format and response_model cannot both be provided. please provide only one.")

    if response_model and response_format is None:
        if client.__class__.__name__ == "OpenAI":
            client = instructor.from_openai(client)
        elif hasattr(client, "__name__") and client.__name__ == "litellm":
            client = instructor.from_litellm(client.completion)
        else:
            raise Exception("unknown client. please create an issue on GitHub if you see this message.")

        response = client.chat.completions.create(
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=messages,
            seed=seed,
            response_model=response_model,
        )
    else:
        if client.__class__.__name__ == "OpenAI":
            response = client.chat.completions.create(
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=messages,
                seed=seed,
                response_format=response_format,
            )
        elif hasattr(client, "__name__") and client.__name__ == "litellm":
            response = client.completion(
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=messages,
                seed=seed,
                response_format=response_format,
            )

    return response
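
For orientation, a call to this helper might look like the sketch below; the model name and messages are illustrative, and a configured OpenAI API key (or `litellm`) is assumed. It mirrors the JSON-object request that `judges/base.py` makes further down in this diff.

```python
from judges._client import get_completion

completion = get_completion(
    messages=[
        {"role": "system", "content": "Reply with a JSON object containing REASONING and SCORE."},
        {"role": "user", "content": "Is Paris the capital of France?"},
    ],
    model="gpt-4o-mini",  # illustrative model name
    temperature=1,
    max_tokens=None,
    seed=None,
    response_format={"type": "json_object"},
)
print(completion.choices[0].message.content)
```
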
36 changes: 12 additions & 24 deletions judges/base.py
@@ -5,10 +5,9 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-from openai import OpenAI
-
 from judges.voting_methods import AVAILABLE_VOTING_METHODS
 
+from judges._client import get_completion
 
 if TYPE_CHECKING:
     import pydantic
@@ -37,9 +36,9 @@ def __post_init__(self):
         """
         Post-initialization to normalize score values for consistency.
         """
-        if self.score.lower() in ["yes", "true", 1, "good"]:
+        if not isinstance(self.score, bool) and self.score.lower() in ["yes", "true", 1, "1", "good"]:
             self.score = True
-        elif self.score.lower() in ["no", "false", 0, "bad"]:
+        elif not isinstance(self.score, bool) and self.score.lower() in ["no", "false", 0, "0", "bad"]:
             self.score = False
 
 
@@ -85,17 +84,6 @@ def __init__(
             The model identifier to be used for evaluations.
         """
         self.model = model
-        self._client = self._configure_client()
-
-    def _configure_client(self):
-        try:
-            import litellm
-        except ImportError:
-            # fallback to openai
-            client = OpenAI()
-            return client
-        else:
-            return litellm
 
     def _build_messages(self, user_prompt: str, system_prompt: Optional[str] = None):
         """
@@ -143,15 +131,15 @@ def _judge(self, user_prompt: str, system_prompt: Optional[str] = None):
         """
         messages = self._build_messages(user_prompt, system_prompt)
 
-        if self._client.__class__.__name__ == "OpenAI":
-            completion = self._client.chat.completions.create(
-                model=self.model,
-                messages=messages,
-                response_format={"type": "json_object"},
-            )
-        else:
-            completion = self._client.completion(self.model, messages)
-
+        completion = get_completion(
+            model=self.model,
+            messages=messages,
+            max_tokens=None,
+            temperature=1,
+            seed=None,
+            response_model=None,
+            response_format={"type": "json_object"}
+        )
         data = json.loads(completion.choices[0].message.content)
         reasoning = data["REASONING"]
         score = data["SCORE"]
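
Two things change in `base.py`: `_judge` now routes every completion through the shared `get_completion` helper, and `__post_init__` no longer coerces genuine booleans while also accepting `"1"`/`"0"` string labels. A standalone sketch of that normalization rule (the helper name is made up for illustration):

```python
def normalize_score(score):
    # Mirrors the intent of the updated __post_init__: leave real booleans
    # untouched and map common truthy/falsy labels to booleans.
    if not isinstance(score, bool) and str(score).lower() in ["yes", "true", "1", "good"]:
        return True
    if not isinstance(score, bool) and str(score).lower() in ["no", "false", "0", "bad"]:
        return False
    return score

print(normalize_score("1"))    # True
print(normalize_score("bad"))  # False
print(normalize_score(True))   # True (unchanged)
```
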
3 changes: 3 additions & 0 deletions judges/classifiers/auto/__init__.py
@@ -0,0 +1,3 @@
from judges.classifiers.auto.core import AutoJudge

__all__ = ["AutoJudge"]