Skip to content

Commit

Permalink
Fix formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
sidjha1 committed Nov 2, 2024
1 parent bf8c73a commit 058a172
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 13 deletions.
6 changes: 4 additions & 2 deletions .github/tests/lm_tests.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pandas as pd
import pytest
from tokenizers import Tokenizer

import lotus
from lotus.models import LM
from tokenizers import Tokenizer

# Set logger level to DEBUG
lotus.logger.setLevel("DEBUG")
Expand Down Expand Up @@ -179,6 +179,7 @@ def test_map_fewshot(setup_gpt_models):
expected_pairs = set([("UC Berkeley", "CA"), ("Carnegie Mellon", "PA")])
assert pairs == expected_pairs


def test_agg_then_map(setup_gpt_models):
_, gpt_4o = setup_gpt_models
lotus.settings.configure(lm=gpt_4o)
Expand All @@ -187,10 +188,11 @@ def test_agg_then_map(setup_gpt_models):
df = pd.DataFrame(data)
agg_instruction = "What is the most common name in {Text}?"
agg_df = df.sem_agg(agg_instruction, suffix="draft_output")
map_instruction = f"{{draft_output}} is a draft answer to the question 'What is the most common name?'. Clean up the draft answer so that there is just a single name. Your answer MUST be on word"
map_instruction = "{draft_output} is a draft answer to the question 'What is the most common name?'. Clean up the draft answer so that there is just a single name. Your answer MUST be on word"
cleaned_df = agg_df.sem_map(map_instruction, suffix="final_output")
assert cleaned_df["final_output"].values[0] == "John"


def test_count_tokens(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)
Expand Down
25 changes: 14 additions & 11 deletions lotus/models/lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,15 @@


class LM:
def __init__(self, model: str = "gpt-4o-mini", temperature: float = 0.0, max_ctx_len: int = 128000, max_tokens: int = 512, tokenizer: Tokenizer = None, **kwargs):
def __init__(
self,
model: str = "gpt-4o-mini",
temperature: float = 0.0,
max_ctx_len: int = 128000,
max_tokens: int = 512,
tokenizer: Tokenizer = None,
**kwargs,
):
self.model = model
self.max_ctx_len = max_ctx_len
self.max_tokens = max_tokens
Expand Down Expand Up @@ -69,25 +77,20 @@ def get_normalized_true_prob(token_probs: dict[str, float]) -> float | None:
# Default to 1 if "True" in tokens, 0 if not
if true_prob is None:
true_prob = 1 if "True" in base_cascade.tokens[resp_idx] else 0

all_true_probs.append(true_prob)

return LogprobsForFilterCascade(
tokens=base_cascade.tokens,
confidences=base_cascade.confidences,
true_probs=all_true_probs
tokens=base_cascade.tokens, confidences=base_cascade.confidences, true_probs=all_true_probs
)

def count_tokens(self, messages: list[dict[str, str]] | str) -> int:
    """Count the tokens in *messages* for this LM.

    Accepts either a bare prompt string or a chat-style list of
    ``{"role": ..., "content": ...}`` dicts; a bare string is wrapped as a
    single user message before counting.

    Counting is delegated to litellm's ``token_counter``, which uses the
    model's default tokenizer unless a custom HuggingFace tokenizer was
    supplied at construction time (``self.tokenizer``).

    Args:
        messages: A prompt string or a list of chat message dicts.

    Returns:
        The token count as an int.
    """
    if isinstance(messages, str):
        # Normalize a raw prompt into the chat-message shape token_counter expects.
        messages = [{"role": "user", "content": messages}]

    kwargs = {"model": self.model, "messages": messages}
    if self.tokenizer:
        # NOTE(review): the scraped diff contained both the old multi-line and
        # the new one-line form of this assignment; only the final form is kept.
        kwargs["custom_tokenizer"] = {"type": "huggingface_tokenizer", "tokenizer": self.tokenizer}

    return token_counter(**kwargs)

0 comments on commit 058a172

Please sign in to comment.