Add ollama tests

sidjha1 committed Nov 3, 2024
1 parent b05784a commit 77271fc

Showing 3 changed files with 211 additions and 107 deletions.
230 changes: 130 additions & 100 deletions .github/tests/lm_tests.py
@@ -1,39 +1,59 @@
import os
import pandas as pd
import pytest
from tokenizers import Tokenizer

import lotus
from lotus.models import LM

################################################################################
# Setup
################################################################################
# Set logger level to DEBUG
lotus.logger.setLevel("DEBUG")

# Environment flags to enable/disable tests
ENABLE_OPENAI_TESTS = os.getenv("ENABLE_OPENAI_TESTS", "false").lower() == "true"
ENABLE_OLLAMA_TESTS = os.getenv("ENABLE_OLLAMA_TESTS", "false").lower() == "true"

MODEL_NAME_TO_ENABLED = {
"gpt-4o-mini": ENABLE_OPENAI_TESTS,
"gpt-4o": ENABLE_OPENAI_TESTS,
"ollama/llama3.2": ENABLE_OLLAMA_TESTS
}
ENABLED_MODEL_NAMES = set([model_name for model_name, is_enabled in MODEL_NAME_TO_ENABLED.items() if is_enabled])

def get_enabled(*candidate_models: tuple) -> list[str]:
return [model for model in candidate_models if model in ENABLED_MODEL_NAMES]


@pytest.fixture(scope="session")
def setup_gpt_models():
# Setup GPT models
gpt_4o_mini = LM(model="gpt-4o-mini")
gpt_4o = LM(model="gpt-4o")
return gpt_4o_mini, gpt_4o
def setup_models():
models = {}

for model_path in ENABLED_MODEL_NAMES:
models[model_path] = LM(model=model_path)

return models


@pytest.fixture(autouse=True)
def print_usage_after_each_test(setup_gpt_models):
def print_usage_after_each_test(setup_models):
yield # this runs the test
gpt_4o_mini, gpt_4o = setup_gpt_models
print("\nUsage stats for gpt-4o-mini after test:")
gpt_4o_mini.print_total_usage()
print("\nUsage stats for gpt-4o after test:")
gpt_4o.print_total_usage()

# Reset stats
gpt_4o_mini.reset_stats()
gpt_4o.reset_stats()
models = setup_models
for model_name, model in models.items():
print(f"\nUsage stats for {model_name} after test:")
model.print_total_usage()
model.reset_stats()


def test_filter_operation(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)
################################################################################
# Standard tests
################################################################################
@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
def test_filter_operation(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

# Test filter operation on an easy dataframe
data = {"Text": ["I am really excited to go to class today!", "I am very sad"]}
@@ -44,10 +64,84 @@ def test_filter_operation(setup_gpt_models):
expected_df = pd.DataFrame({"Text": ["I am really excited to go to class today!"]})
assert filtered_df.equals(expected_df)

@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini"))
def test_top_k(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

data = {
"Text": [
"Lionel Messi is a good soccer player",
"Michael Jordan is a good basketball player",
"Steph Curry is a good basketball player",
"Tom Brady is a good football player",
]
}
df = pd.DataFrame(data)
user_instruction = "Which {Text} is most related to basketball?"
top_2_expected = set(["Michael Jordan is a good basketball player", "Steph Curry is a good basketball player"])

strategies = ["quick", "heap", "naive"]
for strategy in strategies:
sorted_df = df.sem_topk(user_instruction, K=2, strategy=strategy)

top_2_actual = set(sorted_df["Text"].values)
assert top_2_expected == top_2_actual


@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
def test_join(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

data1 = {"School": ["UC Berkeley", "Stanford"]}
data2 = {"School Type": ["Public School", "Private School"]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
join_instruction = "{School} is a {School Type}"
joined_df = df1.sem_join(df2, join_instruction)
joined_pairs = set(zip(joined_df["School"], joined_df["School Type"]))
expected_pairs = set([("UC Berkeley", "Public School"), ("Stanford", "Private School")])
assert joined_pairs == expected_pairs

@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
def test_map_fewshot(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

data = {"School": ["UC Berkeley", "Carnegie Mellon"]}
df = pd.DataFrame(data)
examples = {"School": ["Stanford", "MIT"], "Answer": ["CA", "MA"]}
examples_df = pd.DataFrame(examples)
user_instruction = "What state is {School} in? Respond only with the two-letter abbreviation."
df = df.sem_map(user_instruction, examples=examples_df, suffix="State")

pairs = set(zip(df["School"], df["State"]))
expected_pairs = set([("UC Berkeley", "CA"), ("Carnegie Mellon", "PA")])
assert pairs == expected_pairs


@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini"))
def test_agg_then_map(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

data = {"Text": ["My name is John", "My name is Jane", "My name is John"]}
df = pd.DataFrame(data)
agg_instruction = "What is the most common name in {Text}?"
agg_df = df.sem_agg(agg_instruction, suffix="draft_output")
map_instruction = "{draft_output} is a draft answer to the question 'What is the most common name?'. Clean up the draft answer so that there is just a single name. Your answer MUST be one word"
cleaned_df = agg_df.sem_map(map_instruction, suffix="final_output")
assert cleaned_df["final_output"].values[0].lower().strip(".,!?\"'") == "john"

def test_filter_cascade(setup_gpt_models):
gpt_4o_mini, gpt_4o = setup_gpt_models
lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini)
################################################################################
# Cascade tests
################################################################################
@pytest.mark.skipif(not ENABLE_OPENAI_TESTS, reason="Skipping test because OpenAI tests are not enabled")
def test_filter_cascade(setup_models):
models = setup_models
lotus.settings.configure(lm=models["gpt-4o"], helper_lm=models["gpt-4o-mini"])

data = {
"Text": [
Expand Down Expand Up @@ -113,50 +207,10 @@ def test_filter_cascade(setup_gpt_models):
assert "I am very sad" not in filtered_df["Text"].values
assert stats["filters_resolved_by_helper_model"] > 0, stats


def test_top_k(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)

data = {
"Text": [
"Lionel Messi is a good soccer player",
"Michael Jordan is a good basketball player",
"Steph Curry is a good basketball player",
"Tom Brady is a good football player",
]
}
df = pd.DataFrame(data)
user_instruction = "Which {Text} is most related to basketball?"
top_2_expected = set(["Michael Jordan is a good basketball player", "Steph Curry is a good basketball player"])

strategies = ["quick", "heap", "naive"]
for strategy in strategies:
sorted_df = df.sem_topk(user_instruction, K=2, strategy=strategy)

top_2_actual = set(sorted_df["Text"].values)
assert top_2_expected == top_2_actual


def test_join(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)

data1 = {"School": ["UC Berkeley", "Stanford"]}
data2 = {"School Type": ["Public School", "Private School"]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
join_instruction = "{School} is a {School Type}"
joined_df = df1.sem_join(df2, join_instruction)
joined_pairs = set(zip(joined_df["School"], joined_df["School Type"]))
expected_pairs = set([("UC Berkeley", "Public School"), ("Stanford", "Private School")])
assert joined_pairs == expected_pairs


def test_join_cascade(setup_gpt_models):
gpt_4o_mini, gpt_4o = setup_gpt_models
lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini)
@pytest.mark.skipif(not ENABLE_OPENAI_TESTS, reason="Skipping test because OpenAI tests are not enabled")
def test_join_cascade(setup_models):
models = setup_models
lotus.settings.configure(lm=models["gpt-4o"], helper_lm=models["gpt-4o-mini"])

data1 = {"School": ["UC Berkeley", "Stanford"]}
data2 = {"School Type": ["Public School", "Private School"]}
@@ -180,44 +234,20 @@ def test_join_cascade(setup_gpt_models):
assert stats["filters_resolved_by_large_model"] == 4, stats
assert stats["filters_resolved_by_helper_model"] == 0, stats

################################################################################
# Token counting tests
################################################################################
@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
def test_count_tokens(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

def test_map_fewshot(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)

data = {"School": ["UC Berkeley", "Carnegie Mellon"]}
df = pd.DataFrame(data)
examples = {"School": ["Stanford", "MIT"], "Answer": ["CA", "MA"]}
examples_df = pd.DataFrame(examples)
user_instruction = "What state is {School} in? Respond only with the two-letter abbreviation."
df = df.sem_map(user_instruction, examples=examples_df, suffix="State")

pairs = set(zip(df["School"], df["State"]))
expected_pairs = set([("UC Berkeley", "CA"), ("Carnegie Mellon", "PA")])
assert pairs == expected_pairs


def test_agg_then_map(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)

data = {"Text": ["My name is John", "My name is Jane", "My name is John"]}
df = pd.DataFrame(data)
agg_instruction = "What is the most common name in {Text}?"
agg_df = df.sem_agg(agg_instruction, suffix="draft_output")
map_instruction = "{draft_output} is a draft answer to the question 'What is the most common name?'. Clean up the draft answer so that there is just a single name. Your answer MUST be one word"
cleaned_df = agg_df.sem_map(map_instruction, suffix="final_output")
assert cleaned_df["final_output"].values[0] == "John"


def test_count_tokens(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)

tokens = gpt_4o_mini.count_tokens("Hello, world!")
assert gpt_4o_mini.count_tokens([{"role": "user", "content": "Hello, world!"}]) == tokens
tokens = lm.count_tokens("Hello, world!")
assert lm.count_tokens([{"role": "user", "content": "Hello, world!"}]) == tokens
assert tokens < 100


def test_custom_tokenizer():
custom_tokenizer = Tokenizer.from_pretrained("gpt2")
custom_lm = LM(model="doesn't matter", tokenizer=custom_tokenizer)
tokens = custom_lm.count_tokens("Hello, world!")
77 changes: 72 additions & 5 deletions .github/workflows/tests.yml
@@ -54,8 +54,8 @@ jobs:
- name: Run mypy
run: mypy lotus/

test:
name: Python Tests
openai_lm_test:
name: OpenAI Language Model Tests
runs-on: ubuntu-latest
timeout-minutes: 5

@@ -78,9 +78,76 @@
- name: Set OpenAI API Key
run: echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> $GITHUB_ENV

- name: Run Python tests
- name: Run LM tests
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ENABLE_OPENAI_TESTS: true
run: pytest .github/tests/lm_tests.py

ollama_lm_test:
name: Ollama Language Model Tests
runs-on: ubuntu-latest
timeout-minutes: 10

steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -e .
pip install pytest
- name: Start Ollama container
run: |
pytest .github/tests/lm_tests.py
pytest .github/tests/rm_tests.py
docker pull ollama/ollama:latest
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
# Wait for Ollama server to be ready
timeout=30
while ! curl -s http://localhost:11434/ >/dev/null; do
if [ $timeout -le 0 ]; then
echo "Timed out waiting for Ollama server"
exit 1
fi
echo "Waiting for Ollama server to be ready..."
sleep 1
timeout=$((timeout - 1))
done
docker exec $(docker ps -q) ollama run llama3.2
- name: Run LM tests
env:
ENABLE_OLLAMA_TESTS: true
run: pytest .github/tests/lm_tests.py


rm_test:
name: Retrieval Model Tests
runs-on: ubuntu-latest
timeout-minutes: 5

steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -e .
pip install pytest
- name: Run RM tests
run: pytest .github/tests/rm_tests.py
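
For local debugging, a rough equivalent of the new Ollama job might look like the commands below. This is a sketch, assuming Docker is installed and port 11434 is free; the image, model name, environment flag, and test path are taken from the workflow above, and `docker exec ollama` stands in for `docker exec $(docker ps -q)` since the container is started with `--name ollama`:

    docker pull ollama/ollama:latest
    docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
    # Pull the model used by the tests ("ollama pull llama3.2" also works and avoids the interactive prompt)
    docker exec ollama ollama run llama3.2
    # Enable and run the Ollama-gated LM tests
    ENABLE_OLLAMA_TESTS=true pytest .github/tests/lm_tests.py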