Add ollama tests

sidjha1 committed Nov 3, 2024
1 parent b05784a commit 77271fc

Showing 3 changed files with 211 additions and 107 deletions.
230 changes: 130 additions & 100 deletions .github/tests/lm_tests.py
@@ -1,39 +1,59 @@
import os
import pandas as pd
import pytest
from tokenizers import Tokenizer

import lotus
from lotus.models import LM

################################################################################
# Setup
################################################################################
# Set logger level to DEBUG
lotus.logger.setLevel("DEBUG")

# Environment flags to enable/disable tests
ENABLE_OPENAI_TESTS = os.getenv("ENABLE_OPENAI_TESTS", "false").lower() == "true"
ENABLE_OLLAMA_TESTS = os.getenv("ENABLE_OLLAMA_TESTS", "false").lower() == "true"

MODEL_NAME_TO_ENABLED = {
"gpt-4o-mini": ENABLE_OPENAI_TESTS,
"gpt-4o": ENABLE_OPENAI_TESTS,
"ollama/llama3.2": ENABLE_OLLAMA_TESTS
}
ENABLED_MODEL_NAMES = set([model_name for model_name, is_enabled in MODEL_NAME_TO_ENABLED.items() if is_enabled])

def get_enabled(*candidate_models: tuple) -> list[str]:
return [model for model in candidate_models if model in ENABLED_MODEL_NAMES]


@pytest.fixture(scope="session")
def setup_gpt_models():
# Setup GPT models
gpt_4o_mini = LM(model="gpt-4o-mini")
gpt_4o = LM(model="gpt-4o")
return gpt_4o_mini, gpt_4o
def setup_models():
models = {}

for model_path in ENABLED_MODEL_NAMES:
models[model_path] = LM(model=model_path)

return models


@pytest.fixture(autouse=True)
def print_usage_after_each_test(setup_gpt_models):
def print_usage_after_each_test(setup_models):
yield # this runs the test
gpt_4o_mini, gpt_4o = setup_gpt_models
print("\nUsage stats for gpt-4o-mini after test:")
gpt_4o_mini.print_total_usage()
print("\nUsage stats for gpt-4o after test:")
gpt_4o.print_total_usage()

# Reset stats
gpt_4o_mini.reset_stats()
gpt_4o.reset_stats()
models = setup_models
for model_name, model in models.items():
print(f"\nUsage stats for {model_name} after test:")
model.print_total_usage()
model.reset_stats()


def test_filter_operation(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)
################################################################################
# Standard tests
################################################################################
@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
def test_filter_operation(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

# Test filter operation on an easy dataframe
data = {"Text": ["I am really excited to go to class today!", "I am very sad"]}
@@ -44,10 +64,84 @@ def test_filter_operation(setup_gpt_models):
expected_df = pd.DataFrame({"Text": ["I am really excited to go to class today!"]})
assert filtered_df.equals(expected_df)

@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini"))
def test_top_k(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

data = {
"Text": [
"Lionel Messi is a good soccer player",
"Michael Jordan is a good basketball player",
"Steph Curry is a good basketball player",
"Tom Brady is a good football player",
]
}
df = pd.DataFrame(data)
user_instruction = "Which {Text} is most related to basketball?"
top_2_expected = set(["Michael Jordan is a good basketball player", "Steph Curry is a good basketball player"])

strategies = ["quick", "heap", "naive"]
for strategy in strategies:
sorted_df = df.sem_topk(user_instruction, K=2, strategy=strategy)

top_2_actual = set(sorted_df["Text"].values)
assert top_2_expected == top_2_actual


@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
def test_join(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

data1 = {"School": ["UC Berkeley", "Stanford"]}
data2 = {"School Type": ["Public School", "Private School"]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
join_instruction = "{School} is a {School Type}"
joined_df = df1.sem_join(df2, join_instruction)
joined_pairs = set(zip(joined_df["School"], joined_df["School Type"]))
expected_pairs = set([("UC Berkeley", "Public School"), ("Stanford", "Private School")])
assert joined_pairs == expected_pairs

@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
def test_map_fewshot(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

data = {"School": ["UC Berkeley", "Carnegie Mellon"]}
df = pd.DataFrame(data)
examples = {"School": ["Stanford", "MIT"], "Answer": ["CA", "MA"]}
examples_df = pd.DataFrame(examples)
user_instruction = "What state is {School} in? Respond only with the two-letter abbreviation."
df = df.sem_map(user_instruction, examples=examples_df, suffix="State")

pairs = set(zip(df["School"], df["State"]))
expected_pairs = set([("UC Berkeley", "CA"), ("Carnegie Mellon", "PA")])
assert pairs == expected_pairs


@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini"))
def test_agg_then_map(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

data = {"Text": ["My name is John", "My name is Jane", "My name is John"]}
df = pd.DataFrame(data)
agg_instruction = "What is the most common name in {Text}?"
agg_df = df.sem_agg(agg_instruction, suffix="draft_output")
map_instruction = "{draft_output} is a draft answer to the question 'What is the most common name?'. Clean up the draft answer so that there is just a single name. Your answer MUST be one word"
cleaned_df = agg_df.sem_map(map_instruction, suffix="final_output")
assert cleaned_df["final_output"].values[0].lower().strip(".,!?\"'") == "john"

def test_filter_cascade(setup_gpt_models):
gpt_4o_mini, gpt_4o = setup_gpt_models
lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini)
################################################################################
# Cascade tests
################################################################################
@pytest.mark.skipif(not ENABLE_OPENAI_TESTS, reason="Skipping test because OpenAI tests are not enabled")
def test_filter_cascade(setup_models):
models = setup_models
lotus.settings.configure(lm=models["gpt-4o"], helper_lm=models["gpt-4o-mini"])

data = {
"Text": [
Expand Down Expand Up @@ -113,50 +207,10 @@ def test_filter_cascade(setup_gpt_models):
assert "I am very sad" not in filtered_df["Text"].values
assert stats["filters_resolved_by_helper_model"] > 0, stats


def test_top_k(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)

data = {
"Text": [
"Lionel Messi is a good soccer player",
"Michael Jordan is a good basketball player",
"Steph Curry is a good basketball player",
"Tom Brady is a good football player",
]
}
df = pd.DataFrame(data)
user_instruction = "Which {Text} is most related to basketball?"
top_2_expected = set(["Michael Jordan is a good basketball player", "Steph Curry is a good basketball player"])

strategies = ["quick", "heap", "naive"]
for strategy in strategies:
sorted_df = df.sem_topk(user_instruction, K=2, strategy=strategy)

top_2_actual = set(sorted_df["Text"].values)
assert top_2_expected == top_2_actual


def test_join(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)

data1 = {"School": ["UC Berkeley", "Stanford"]}
data2 = {"School Type": ["Public School", "Private School"]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
join_instruction = "{School} is a {School Type}"
joined_df = df1.sem_join(df2, join_instruction)
joined_pairs = set(zip(joined_df["School"], joined_df["School Type"]))
expected_pairs = set([("UC Berkeley", "Public School"), ("Stanford", "Private School")])
assert joined_pairs == expected_pairs


def test_join_cascade(setup_gpt_models):
gpt_4o_mini, gpt_4o = setup_gpt_models
lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini)
@pytest.mark.skipif(not ENABLE_OPENAI_TESTS, reason="Skipping test because OpenAI tests are not enabled")
def test_join_cascade(setup_models):
models = setup_models
lotus.settings.configure(lm=models["gpt-4o"], helper_lm=models["gpt-4o-mini"])

data1 = {"School": ["UC Berkeley", "Stanford"]}
data2 = {"School Type": ["Public School", "Private School"]}
@@ -180,44 +234,20 @@ def test_join_cascade(setup_gpt_models):
assert stats["filters_resolved_by_large_model"] == 4, stats
assert stats["filters_resolved_by_helper_model"] == 0, stats

################################################################################
# Token counting tests
################################################################################
@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
def test_count_tokens(setup_models, model):
lm = setup_models[model]
lotus.settings.configure(lm=lm)

def test_map_fewshot(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)

data = {"School": ["UC Berkeley", "Carnegie Mellon"]}
df = pd.DataFrame(data)
examples = {"School": ["Stanford", "MIT"], "Answer": ["CA", "MA"]}
examples_df = pd.DataFrame(examples)
user_instruction = "What state is {School} in? Respond only with the two-letter abbreviation."
df = df.sem_map(user_instruction, examples=examples_df, suffix="State")

pairs = set(zip(df["School"], df["State"]))
expected_pairs = set([("UC Berkeley", "CA"), ("Carnegie Mellon", "PA")])
assert pairs == expected_pairs


def test_agg_then_map(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)

data = {"Text": ["My name is John", "My name is Jane", "My name is John"]}
df = pd.DataFrame(data)
agg_instruction = "What is the most common name in {Text}?"
agg_df = df.sem_agg(agg_instruction, suffix="draft_output")
map_instruction = "{draft_output} is a draft answer to the question 'What is the most common name?'. Clean up the draft answer so that there is just a single name. Your answer MUST be one word"
cleaned_df = agg_df.sem_map(map_instruction, suffix="final_output")
assert cleaned_df["final_output"].values[0] == "John"


def test_count_tokens(setup_gpt_models):
gpt_4o_mini, _ = setup_gpt_models
lotus.settings.configure(lm=gpt_4o_mini)

tokens = gpt_4o_mini.count_tokens("Hello, world!")
assert gpt_4o_mini.count_tokens([{"role": "user", "content": "Hello, world!"}]) == tokens
tokens = lm.count_tokens("Hello, world!")
assert lm.count_tokens([{"role": "user", "content": "Hello, world!"}]) == tokens
assert tokens < 100


def test_custom_tokenizer():
custom_tokenizer = Tokenizer.from_pretrained("gpt2")
custom_lm = LM(model="doesn't matter", tokenizer=custom_tokenizer)
tokens = custom_lm.count_tokens("Hello, world!")
77 changes: 72 additions & 5 deletions .github/workflows/tests.yml
@@ -54,8 +54,8 @@ jobs:
- name: Run mypy
run: mypy lotus/

test:
name: Python Tests
openai_lm_test:
name: OpenAI Language Model Tests
runs-on: ubuntu-latest
timeout-minutes: 5

@@ -78,9 +78,76 @@
- name: Set OpenAI API Key
run: echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> $GITHUB_ENV

- name: Run Python tests
- name: Run LM tests
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ENABLE_OPENAI_TESTS: true
run: pytest .github/tests/lm_tests.py

ollama_lm_test:
name: Ollama Language Model Tests
runs-on: ubuntu-latest
timeout-minutes: 10

steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -e .
pip install pytest
- name: Start Ollama container
run: |
pytest .github/tests/lm_tests.py
pytest .github/tests/rm_tests.py
docker pull ollama/ollama:latest
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
# Wait for Ollama server to be ready
timeout=30
while ! curl -s http://localhost:11434/ >/dev/null; do
if [ $timeout -le 0 ]; then
echo "Timed out waiting for Ollama server"
exit 1
fi
echo "Waiting for Ollama server to be ready..."
sleep 1
timeout=$((timeout - 1))
done
docker exec $(docker ps -q) ollama run llama3.2
- name: Run LM tests
env:
ENABLE_OLLAMA_TESTS: true
run: pytest .github/tests/lm_tests.py


rm_test:
name: Retrieval Model Tests
runs-on: ubuntu-latest
timeout-minutes: 5

steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -e .
pip install pytest
- name: Run RM tests
run: pytest .github/tests/rm_tests.py
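
For local debugging, a rough equivalent of the new Ollama job might look like the commands below. This is a sketch, assuming Docker is installed and port 11434 is free; the image, model name, environment flag, and test path are taken from the workflow above, and `docker exec ollama` stands in for `docker exec $(docker ps -q)` since the container is started with `--name ollama`:

    docker pull ollama/ollama:latest
    docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
    # Pull the model used by the tests ("ollama pull llama3.2" also works and avoids the interactive prompt)
    docker exec ollama ollama run llama3.2
    # Enable and run the Ollama-gated LM tests
    ENABLE_OLLAMA_TESTS=true pytest .github/tests/lm_tests.py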