Skip to content

Commit

Permalink
Fix formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
sidjha1 committed Nov 2, 2024
1 parent bf8c73a commit 058a172
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 13 deletions.
6 changes: 4 additions & 2 deletions .github/tests/lm_tests.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pandas as pd
import pytest
from tokenizers import Tokenizer

import lotus
from lotus.models import LM
from tokenizers import Tokenizer

# Set logger level to DEBUG
lotus.logger.setLevel("DEBUG")
Expand Down Expand Up @@ -179,6 +179,7 @@ def test_map_fewshot(setup_gpt_models):
expected_pairs = set([("UC Berkeley", "CA"), ("Carnegie Mellon", "PA")])
assert pairs == expected_pairs


def test_agg_then_map(setup_gpt_models):
_, gpt_4o = setup_gpt_models
lotus.settings.configure(lm=gpt_4o)
Expand All @@ -187,10 +188,11 @@ def test_agg_then_map(setup_gpt_models):
df = pd.DataFrame(data)
agg_instruction = "What is the most common name in {Text}?"
agg_df = df.sem_agg(agg_instruction, suffix="draft_output")
map_instruction = f"{{draft_output}} is a draft answer to the question 'What is the most common name?'. Clean up the draft answer so that there is just a single name. Your answer MUST be on word"
map_instruction = "{draft_output} is a draft answer to the question 'What is the most common name?'. Clean up the draft answer so that there is just a single name. Your answer MUST be on word"
cleaned_df = agg_df.sem_map(map_instruction, suffix="final_output")
assert cleaned_df["final_output"].values[0] == "John"


def test_count_tokens(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)
Expand Down
25 changes: 14 additions & 11 deletions lotus/models/lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,15 @@


class LM:
def __init__(self, model: str = "gpt-4o-mini", temperature: float = 0.0, max_ctx_len: int = 128000, max_tokens: int = 512, tokenizer: Tokenizer = None, **kwargs):
def __init__(
self,
model: str = "gpt-4o-mini",
temperature: float = 0.0,
max_ctx_len: int = 128000,
max_tokens: int = 512,
tokenizer: Tokenizer = None,
**kwargs,
):
self.model = model
self.max_ctx_len = max_ctx_len
self.max_tokens = max_tokens
Expand Down Expand Up @@ -69,25 +77,20 @@ def get_normalized_true_prob(token_probs: dict[str, float]) -> float | None:
# Default to 1 if "True" in tokens, 0 if not
if true_prob is None:
true_prob = 1 if "True" in base_cascade.tokens[resp_idx] else 0

all_true_probs.append(true_prob)

return LogprobsForFilterCascade(
tokens=base_cascade.tokens,
confidences=base_cascade.confidences,
true_probs=all_true_probs
tokens=base_cascade.tokens, confidences=base_cascade.confidences, true_probs=all_true_probs
)

def count_tokens(self, messages: list[dict[str, str]] | str) -> int:
    """Count the tokens in *messages* for this LM.

    Accepts either a bare prompt string or a chat-style list of
    ``{"role": ..., "content": ...}`` dicts; a bare string is wrapped as a
    single user message before counting.

    Counting is delegated to litellm's ``token_counter``, which uses the
    model's default tokenizer unless a custom HuggingFace tokenizer was
    supplied at construction time (``self.tokenizer``).

    Args:
        messages: A prompt string or a list of chat message dicts.

    Returns:
        The token count as an int.
    """
    if isinstance(messages, str):
        # Normalize a raw prompt into the chat-message shape token_counter expects.
        messages = [{"role": "user", "content": messages}]

    kwargs = {"model": self.model, "messages": messages}
    if self.tokenizer:
        # NOTE(review): the scraped diff contained both the old multi-line and
        # the new one-line form of this assignment; only the final form is kept.
        kwargs["custom_tokenizer"] = {"type": "huggingface_tokenizer", "tokenizer": self.tokenizer}

    return token_counter(**kwargs)

0 comments on commit 058a172

Please sign in to comment.