Merge pull request #11 from quotient-ai/jamesliounis/autojudge
Add new `AutoJudge` class for custom judges
freddiev4 authored Dec 20, 2024
2 parents 8c5f5b3 + 44c4970 commit ad63e29
Showing 8 changed files with 987 additions and 24 deletions.
93 changes: 93 additions & 0 deletions README.md
@@ -15,6 +15,7 @@
- [Send data to an LLM](#send-data-to-an-llm)
- [Use a `judges` classifier LLM as an evaluator model](#use-a-judges-classifier-llm-as-an-evaluator-model)
- [Use a `Jury` for averaging and diversification](#use-a-jury-for-averaging-and-diversification)
- [Use `AutoJudge` to create a custom LLM judge](#use-autojudge-to-create-a-custom-llm-judge)
5. [Appendix of Judges](#appendix)
- [Classifiers](#classifiers)
- [Graders](#graders)
@@ -159,6 +160,98 @@ verdict = jury.vote(
print(verdict.score)
```

### Use `AutoJudge` to create a custom LLM judge

`autojudge` is an extension of the **judges** library that builds on our [previous work](https://www.quotientai.co/post/subject-matter-expert-language-liaison-smell-a-framework-for-aligning-llm-evaluators-to-human-feedback) on aligning judges to human feedback: given a labeled dataset with feedback, `autojudge` creates a custom, task-specific LLM judge.

Install it using:

```bash
pip install judges[auto]
```

**Step 1 - Prepare your dataset:**
Your dataset can be either a list of dictionaries or a path to a CSV file; each record needs the following fields (a CSV sketch follows the example table below):

- **`input`**: The input provided to your model
- **`output`**: The model's response
- **`label`**: `1` for correct, `0` for incorrect
- **`feedback`**: Feedback explaining why the response is correct or incorrect

Example:

| input | output | label | feedback |
|-----------------------------------|---------------------------------------------------------------------|-------|---------------------------------------|
| What's the best time to visit Paris? | The best time to visit Paris is during the spring or fall. | 1 | Provides accurate and detailed advice. |
| Can I ride a dragon in Scotland? | Yes, dragons are commonly seen in the highlands and can be ridden with proper training. | 0 | Dragons are mythical creatures; the information is fictional. |
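
If your rows live in a file instead of a list, a minimal sketch of the CSV route is shown below. It assumes `from_dataset` accepts the file path in place of the in-memory list, as the description above suggests; the `feedback.csv` name is only illustrative.

```python
import csv

# The header must match the required fields: input, output, label, feedback.
rows = [
    {
        "input": "What's the best time to visit Paris?",
        "output": "The best time to visit Paris is during the spring or fall.",
        "label": 1,
        "feedback": "Provides accurate and detailed advice.",
    },
]

with open("feedback.csv", "w", newline="") as f:  # illustrative path
    writer = csv.DictWriter(f, fieldnames=["input", "output", "label", "feedback"])
    writer.writeheader()
    writer.writerows(rows)

# Later, pass the path instead of the list (assumed to be supported):
# AutoJudge.from_dataset(dataset="feedback.csv", task=..., model=...)
```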

**Step 2 - Initialize your `autojudge`:**
Provide a labeled dataset and describe the evaluation task.

```python
from judges.classifiers.auto import AutoJudge

dataset = [
    {
        "input": "Can I ride a dragon in Scotland?",
        "output": "Yes, dragons are commonly seen in the highlands and can be ridden with proper training.",
        "label": 0,
        "feedback": "Dragons are mythical creatures; the information is fictional.",
    },
    {
        "input": "Can you recommend a good hotel in Tokyo?",
        "output": "Certainly! Hotel Sunroute Plaza Shinjuku is highly rated for its location and amenities. It offers comfortable rooms and excellent service.",
        "label": 1,
        "feedback": "Offers a specific and helpful recommendation.",
    },
    {
        "input": "Can I drink tap water in London?",
        "output": "Yes, tap water in London is safe to drink and meets high quality standards.",
        "label": 1,
        "feedback": "Gives clear and reassuring information.",
    },
    {
        "input": "What's the boiling point of water on the moon?",
        "output": "The boiling point of water on the moon is 100°C, the same as on Earth.",
        "label": 0,
        "feedback": "Boiling point varies with pressure; the moon's vacuum affects it.",
    }
]


# Task description
task = "Evaluate responses for accuracy, clarity, and helpfulness."

# Initialize autojudge
autojudge = AutoJudge.from_dataset(
    dataset=dataset,
    task=task,
    model="gpt-4-turbo-2024-04-09",
    # increase workers for speed ⚡
    # max_workers=2,
    # generated prompts are automatically saved to disk
    # save_to_disk=False,
)
```

**Step 3 - Use your judge to evaluate new input-output pairs:**
You can use `autojudge` to evaluate a single input-output pair using the `.judge()` method.

```python
# Input-output pair to evaluate
input_ = "What are the top attractions in New York City?"
output = "Some top attractions in NYC include the Statue of Liberty and Central Park."

# Get the judgment
judgment = autojudge.judge(input=input_, output=output)

# Print the judgment
print(judgment.reasoning)
# The response accurately lists popular attractions like the Statue of Liberty and Central Park, which are well-known and relevant to the user's query.
print(judgment.score)
# True (correct)
```
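
The same `.judge()` call works in a loop when you have several pairs to score; a short sketch (the pairs below are made up, and simple sequential calls are assumed):

```python
# Hypothetical input-output pairs to evaluate one by one.
pairs = [
    ("What are the top attractions in New York City?",
     "Some top attractions in NYC include the Statue of Liberty and Central Park."),
    ("Is the Eiffel Tower in Berlin?",
     "Yes, the Eiffel Tower is one of Berlin's most famous landmarks."),
]

for input_, output in pairs:
    judgment = autojudge.judge(input=input_, output=output)
    print(judgment.score, "-", judgment.reasoning)
```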

## Appendix

### Classifiers
78 changes: 78 additions & 0 deletions judges/_client.py
@@ -0,0 +1,78 @@
import logging

import instructor
import openai

from pydantic import BaseModel
from tenacity import retry, wait_random_exponential, stop_after_attempt

openai._utils._logs.logger.setLevel(logging.WARNING)
openai._utils._logs.httpx_logger.setLevel(logging.WARNING)


def llm_client():
    try:
        import litellm
    except ImportError:
        # fallback to openai
        client = openai.OpenAI()
        return client
    else:
        return litellm


@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(5),
)
def get_completion(
    messages: list[dict[str, str]],
    model: str,
    temperature: float,
    max_tokens: int,
    seed: int,
    response_format: dict = None,
    response_model: BaseModel = None,
):
    client = llm_client()

    if response_format and response_model:
        raise Exception("response_format and response_model cannot both be provided. please provide only one.")

    if response_model and response_format is None:
        if client.__class__.__name__ == "OpenAI":
            client = instructor.from_openai(client)
        elif hasattr(client, "__name__") and client.__name__ == "litellm":
            client = instructor.from_litellm(client.completion)
        else:
            raise Exception("unknown client. please create an issue on GitHub if you see this message.")

        response = client.chat.completions.create(
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=messages,
            seed=seed,
            response_model=response_model,
        )
    else:
        if client.__class__.__name__ == "OpenAI":
            response = client.chat.completions.create(
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=messages,
                seed=seed,
                response_format=response_format,
            )
        elif hasattr(client, "__name__") and client.__name__ == "litellm":
            response = client.completion(
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=messages,
                seed=seed,
                response_format=response_format,
            )

    return response
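
For orientation, a call to this helper might look like the sketch below; the model name and messages are illustrative, and a configured OpenAI API key (or `litellm`) is assumed. It mirrors the JSON-object request that `judges/base.py` makes further down in this diff.

```python
from judges._client import get_completion

completion = get_completion(
    messages=[
        {"role": "system", "content": "Reply with a JSON object containing REASONING and SCORE."},
        {"role": "user", "content": "Is Paris the capital of France?"},
    ],
    model="gpt-4o-mini",  # illustrative model name
    temperature=1,
    max_tokens=None,
    seed=None,
    response_format={"type": "json_object"},
)
print(completion.choices[0].message.content)
```
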
36 changes: 12 additions & 24 deletions judges/base.py
@@ -5,10 +5,9 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-from openai import OpenAI
-
 from judges.voting_methods import AVAILABLE_VOTING_METHODS
 
+from judges._client import get_completion
 
 if TYPE_CHECKING:
     import pydantic
@@ -37,9 +36,9 @@ def __post_init__(self):
         """
         Post-initialization to normalize score values for consistency.
         """
-        if self.score.lower() in ["yes", "true", 1, "good"]:
+        if not isinstance(self.score, bool) and self.score.lower() in ["yes", "true", 1, "1", "good"]:
             self.score = True
-        elif self.score.lower() in ["no", "false", 0, "bad"]:
+        elif not isinstance(self.score, bool) and self.score.lower() in ["no", "false", 0, "0", "bad"]:
             self.score = False
 
 
@@ -85,17 +84,6 @@ def __init__(
             The model identifier to be used for evaluations.
         """
         self.model = model
-        self._client = self._configure_client()
-
-    def _configure_client(self):
-        try:
-            import litellm
-        except ImportError:
-            # fallback to openai
-            client = OpenAI()
-            return client
-        else:
-            return litellm
 
     def _build_messages(self, user_prompt: str, system_prompt: Optional[str] = None):
         """
@@ -143,15 +131,15 @@ def _judge(self, user_prompt: str, system_prompt: Optional[str] = None):
         """
         messages = self._build_messages(user_prompt, system_prompt)
 
-        if self._client.__class__.__name__ == "OpenAI":
-            completion = self._client.chat.completions.create(
-                model=self.model,
-                messages=messages,
-                response_format={"type": "json_object"},
-            )
-        else:
-            completion = self._client.completion(self.model, messages)
-
+        completion = get_completion(
+            model=self.model,
+            messages=messages,
+            max_tokens=None,
+            temperature=1,
+            seed=None,
+            response_model=None,
+            response_format={"type": "json_object"}
+        )
         data = json.loads(completion.choices[0].message.content)
         reasoning = data["REASONING"]
         score = data["SCORE"]
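
Two things change in `base.py`: `_judge` now routes every completion through the shared `get_completion` helper, and `__post_init__` no longer coerces genuine booleans while also accepting `"1"`/`"0"` string labels. A standalone sketch of that normalization rule (the helper name is made up for illustration):

```python
def normalize_score(score):
    # Mirrors the intent of the updated __post_init__: leave real booleans
    # untouched and map common truthy/falsy labels to booleans.
    if not isinstance(score, bool) and str(score).lower() in ["yes", "true", "1", "good"]:
        return True
    if not isinstance(score, bool) and str(score).lower() in ["no", "false", "0", "bad"]:
        return False
    return score

print(normalize_score("1"))    # True
print(normalize_score("bad"))  # False
print(normalize_score(True))   # True (unchanged)
```
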
3 changes: 3 additions & 0 deletions judges/classifiers/auto/__init__.py
@@ -0,0 +1,3 @@
from judges.classifiers.auto.core import AutoJudge

__all__ = ["AutoJudge"]