From 2480012aacc8e6fc74b45f5428e3ff4072935d63 Mon Sep 17 00:00:00 2001
From: Sid Jha <45739834+sidjha1@users.noreply.github.com>
Date: Thu, 14 Nov 2024 18:26:14 -0800
Subject: [PATCH] Update CI to use llama 3.1 8B (#34)

Previously, CI used `llama3.2 3B`, which is less reliable.
---
 .github/tests/lm_tests.py   | 14 ++++++++------
 .github/workflows/tests.yml |  2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/.github/tests/lm_tests.py b/.github/tests/lm_tests.py
index 3152c20a..90e1cec1 100644
--- a/.github/tests/lm_tests.py
+++ b/.github/tests/lm_tests.py
@@ -20,7 +20,7 @@
 MODEL_NAME_TO_ENABLED = {
     "gpt-4o-mini": ENABLE_OPENAI_TESTS,
     "gpt-4o": ENABLE_OPENAI_TESTS,
-    "ollama/llama3.2": ENABLE_OLLAMA_TESTS,
+    "ollama/llama3.1": ENABLE_OLLAMA_TESTS,
 }
 ENABLED_MODEL_NAMES = set([model_name for model_name, is_enabled in MODEL_NAME_TO_ENABLED.items() if is_enabled])
 
@@ -53,7 +53,7 @@ def print_usage_after_each_test(setup_models):
 ################################################################################
 # Standard tests
 ################################################################################
-@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
+@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.1"))
 def test_filter_operation(setup_models, model):
     lm = setup_models[model]
     lotus.settings.configure(lm=lm)
@@ -93,7 +93,7 @@ def test_top_k(setup_models, model):
     assert top_2_expected == top_2_actual
 
 
-@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
+@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.1"))
 def test_join(setup_models, model):
     lm = setup_models[model]
     lotus.settings.configure(lm=lm)
@@ -110,7 +110,7 @@ def test_join(setup_models, model):
     assert joined_pairs == expected_pairs
 
 
-@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
+@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.1"))
 def test_map_fewshot(setup_models, model):
     lm = setup_models[model]
     lotus.settings.configure(lm=lm)
@@ -122,8 +122,10 @@ def test_map_fewshot(setup_models, model):
 
     user_instruction = "What state is {School} in? Respond only with the two-letter abbreviation."
     df = df.sem_map(user_instruction, examples=examples_df, suffix="State")
+    # clean up the state names to be more robust to free-form text
+    df["State"] = df["State"].str[-2:].str.lower()
     pairs = set(zip(df["School"], df["State"]))
-    expected_pairs = set([("UC Berkeley", "CA"), ("Carnegie Mellon", "PA")])
+    expected_pairs = set([("UC Berkeley", "ca"), ("Carnegie Mellon", "pa")])
 
     assert pairs == expected_pairs
 
@@ -285,7 +287,7 @@ def test_format_logprobs_for_filter_cascade(setup_models, model):
 ################################################################################
 # Token counting tests
 ################################################################################
-@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.2"))
+@pytest.mark.parametrize("model", get_enabled("gpt-4o-mini", "ollama/llama3.1"))
 def test_count_tokens(setup_models, model):
     lm = setup_models[model]
     lotus.settings.configure(lm=lm)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 07a9f3ea..cace5f6f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -120,7 +120,7 @@ jobs:
             sleep 1
             timeout=$((timeout - 1))
           done
-          docker exec $(docker ps -q) ollama run llama3.2
+          docker exec $(docker ps -q) ollama run llama3.1
 
       - name: Run LM tests
         env: