From 74bc65806aa94f9a43751e3ade224a7433a11cd1 Mon Sep 17 00:00:00 2001 From: glados Date: Thu, 16 Jan 2025 16:00:46 +0100 Subject: [PATCH] Implement integration tests * Implement integration tests for models `claude` and `gemini` * Implement integration tests for agents using `claude` and `gemini` * Add invoke tasks to run individual tests * Add integration tests to github CI pipeline Add integration tests Update Update Update --- .github/workflows/test.yml | 16 +- DEVELOPMENT.md | 26 ++- poetry.lock | 15 +- pyproject.toml | 1 + tasks.py | 35 +++ tests/__init__.py | 3 + tests/helpers/__init__.py | 0 tests/helpers/flaky.py | 16 ++ .../skills/user_repository/__init__.py | 0 tests/helpers/skills/user_repository/api.py | 38 ++++ tests/helpers/skills/user_repository/impl.py | 20 ++ tests/integration/conftest.py | 41 ++++ tests/integration/test_agent.py | 210 ++++++++++++++++++ tests/integration/test_model.py | 129 +++++++++++ 14 files changed, 543 insertions(+), 7 deletions(-) create mode 100644 tests/helpers/__init__.py create mode 100644 tests/helpers/flaky.py create mode 100644 tests/helpers/skills/user_repository/__init__.py create mode 100644 tests/helpers/skills/user_repository/api.py create mode 100644 tests/helpers/skills/user_repository/impl.py create mode 100644 tests/integration/conftest.py create mode 100644 tests/integration/test_agent.py create mode 100644 tests/integration/test_model.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 90b2cc8..9cb6f1b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Tests on: push: - branches: [ main ] + branches: [ main, wip-integration-tests ] pull_request: branches: [ main ] @@ -10,6 +10,10 @@ jobs: test: runs-on: ubuntu-latest + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + steps: - uses: actions/checkout@v4 @@ -39,7 +43,13 @@ jobs: poetry install pip list - - name: Run tests + - name: Run unit tests + shell: bash -l {0} + run: | + poetry run pytest tests/unit + + - name: Run integration tests shell: bash -l {0} run: | - poetry run pytest -s tests + docker pull ghcr.io/gradion-ai/ipybox:basic + poetry run pytest tests/integration --no-flaky-report diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 13a9cbe..c361644 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -34,14 +34,36 @@ Install pre-commit hooks: invoke precommit-install ``` +Create a `.env` file with [Anthropic](https://console.anthropic.com/settings/keys) and [Gemini](https://aistudio.google.com/app/apikey) API keys: + +```env title=".env" +# Required for Claude 3.5 Sonnet +ANTHROPIC_API_KEY=... + +# Required for generative Google Search via Gemini 2 +GOOGLE_API_KEY=... +``` + Enforce coding conventions (done automatically by pre-commit hooks): ```bash invoke cc ``` -Run tests: +Run unit tests: + +```bash +invoke ut +``` + +Run integration tests: + +```bash +invoke it +``` + +Run all tests: ```bash -pytest -s tests +invoke test ``` diff --git a/poetry.lock b/poetry.lock index 2c84ac1..8e7fdd6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. [[package]] name = "aioconsole" @@ -610,6 +610,17 @@ docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2. testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"] typing = ["typing-extensions (>=4.12.2)"] +[[package]] +name = "flaky" +version = "3.8.1" +description = "Plugin for pytest that automatically reruns flaky tests." +optional = false +python-versions = ">=3.5" +files = [ + {file = "flaky-3.8.1-py2.py3-none-any.whl", hash = "sha256:194ccf4f0d3a22b2de7130f4b62e45e977ac1b5ccad74d4d48f3005dcc38815e"}, + {file = "flaky-3.8.1.tar.gz", hash = "sha256:47204a81ec905f3d5acfbd61daeabcada8f9d4031616d9bcb0618461729699f5"}, +] + [[package]] name = "fonttools" version = "4.55.3" @@ -3249,4 +3260,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.11,<3.14" -content-hash = "78d5ba40f33642e64ba8fd9584afd30ed6180684ca7781d4fd2ac37a03cbb717" +content-hash = "609e78e71b123b453a81dc6b34919f42ab85ac50c5d241486184866cc9f44fc9" diff --git a/pyproject.toml b/pyproject.toml index ed3e1f4..c8dfbcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ pre-commit = "^4.0" invoke = "^2.2" pytest = "^8.3" pytest-asyncio = "^0.24.0" +flaky = "^3.8.1" [tool.pytest.ini_options] asyncio_mode = "auto" diff --git a/tasks.py b/tasks.py index fb827b1..9747255 100644 --- a/tasks.py +++ b/tasks.py @@ -1,3 +1,5 @@ +from sys import platform + from invoke import task @@ -24,3 +26,36 @@ def serve_docs(c): @task def deploy_docs(c): c.run("mkdocs gh-deploy --force") + + +@task +def test(c, cov=False, cov_report=None): + _run_pytest(c, "tests", cov, cov_report) + + +@task(aliases=["ut"]) +def unit_test(c, cov=False, cov_report=None): + _run_pytest(c, "tests/unit", cov, cov_report) + + +@task(aliases=["it"]) +def integration_test(c, cov=False, cov_report=None): + _run_pytest(c, "tests/integration", cov, cov_report) + + +def _run_pytest(c, test_dir, cov=False, cov_report=None): + c.run(f"pytest {test_dir} {_pytest_cov_options(cov, cov_report)} --no-flaky-report", pty=_use_pty()) + + +def _use_pty(): + return platform != "win32" + + +def _pytest_cov_options(use_cov: bool, cov_reports: str | None): + if not use_cov: + return "" + + cov_report_types = cov_reports.split(",") if cov_reports else [] + cov_report_types = ["term"] + cov_report_types + cov_report_params = [f"--cov-report {r}" for r in cov_report_types] + return f"--cov {' '.join(cov_report_params)}" diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..adcb99d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,3 @@ +from pathlib import Path + +TEST_ROOT_PATH = Path(__file__).parent.resolve() diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/helpers/flaky.py b/tests/helpers/flaky.py new file mode 100644 index 0000000..0410a99 --- /dev/null +++ b/tests/helpers/flaky.py @@ -0,0 +1,16 @@ +import time + +from google import genai + + +def rerun_on_google_genai_resource_exhausted(wait_time_s: float): + def _filter(err, name, test, plugin): + err_class, err_value, _ = err + match err_class: + case genai.errors.ClientError: + time.sleep(wait_time_s) + return "RESOURCE_EXHAUSTED" in str(err_value) + case _: + return False + + return _filter diff --git a/tests/helpers/skills/user_repository/__init__.py b/tests/helpers/skills/user_repository/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/helpers/skills/user_repository/api.py b/tests/helpers/skills/user_repository/api.py new file mode 100644 index 0000000..d9ec38a --- /dev/null +++ b/tests/helpers/skills/user_repository/api.py @@ -0,0 +1,38 @@ +from abc import ABC, abstractmethod + + +class UserRepository(ABC): + @abstractmethod + def find_user_name(self, user_id: str) -> str: + """Finds the name of a user in the user repository. + + Args: + user_id (str): The id of the user to find. + + Returns: + str: The name of the user. + """ + pass + + @abstractmethod + def find_user_email(self, user_id: str, invalidate_cache: bool = False) -> str: + """Finds the email of a user in the user repository. + + Args: + user_id (str): The id of the user to find. + invalidate_cache (bool): Whether to invalidate all the caches before lookup. + Should typically be left as False unless explicitly needed. + + Returns: + str: The email of the user. + """ + pass + + +def create_user_repository() -> UserRepository: + """ + Creates a new instance of the UserRepository tool. + """ + from .impl import UserRepositoryImpl + + return UserRepositoryImpl() diff --git a/tests/helpers/skills/user_repository/impl.py b/tests/helpers/skills/user_repository/impl.py new file mode 100644 index 0000000..0e23295 --- /dev/null +++ b/tests/helpers/skills/user_repository/impl.py @@ -0,0 +1,20 @@ +from .api import UserRepository + +USER_ID = "user-123" + + +class UserRepositoryImpl(UserRepository): + def find_user_name(self, user_id: str) -> str: + if user_id.lower().strip() == USER_ID: + return "user_a37c1f54" + + raise ValueError(f"User {user_id} not found") + + def find_user_email(self, user_id: str, invalidate_cache: bool = False) -> str: + if not invalidate_cache: + raise ValueError("You must invalidate the cache to get the email address") + + if user_id.lower().strip() == USER_ID: + return "user.a37c1f54@mytestdomain.com" + + raise ValueError(f"User {user_id} not found") diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 0000000..1754125 --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,41 @@ +from unittest.mock import AsyncMock, MagicMock + +import pytest +from dotenv import load_dotenv + +from freeact.logger import Logger +from freeact.model.claude.model import Claude +from freeact.model.gemini.model.chat import Gemini + + +@pytest.fixture(autouse=True) +def load_env(): + load_dotenv() + + +@pytest.fixture +def logger(): + logger = MagicMock(spec=Logger) + logger.context = MagicMock() + logger.context.return_value.__aenter__ = AsyncMock() + logger.context.return_value.__aexit__ = AsyncMock() + logger.log = AsyncMock() + return logger + + +@pytest.fixture +def claude(logger): + return Claude( + logger=logger, + model_name="claude-3-5-haiku-20241022", + prompt_caching=False, + ) + + +@pytest.fixture +def gemini(): + return Gemini( + model_name="gemini-2.0-flash-exp", + temperature=0.0, + max_tokens=1024, + ) diff --git a/tests/integration/test_agent.py b/tests/integration/test_agent.py new file mode 100644 index 0000000..e6ee965 --- /dev/null +++ b/tests/integration/test_agent.py @@ -0,0 +1,210 @@ +import shutil +import tempfile +from pathlib import Path + +import pytest + +from freeact.agent import CodeActAgent, CodeActAgentTurn, CodeExecution, CodeExecutionResult +from freeact.executor import CodeExecutionContainer, CodeExecutor +from freeact.model.base import CodeActModelResponse, CodeActModelTurn +from freeact.model.gemini.model.chat import Gemini +from tests import TEST_ROOT_PATH +from tests.helpers.flaky import rerun_on_google_genai_resource_exhausted + +GOOGLE_GENAI_WAIT_TIME = 5 + + +@pytest.fixture(scope="module") +async def workspace(): + with tempfile.TemporaryDirectory() as temp_dir: + shared_skills_path = Path(temp_dir) / "skills" / "shared" / "user_repository" + shared_skills_path.mkdir(parents=True, exist_ok=True) + shutil.copytree( + TEST_ROOT_PATH / "helpers" / "skills" / "user_repository", shared_skills_path, dirs_exist_ok=True + ) + yield temp_dir + + +@pytest.fixture(scope="module") +async def executor(workspace: str): + async with CodeExecutionContainer( + tag="ghcr.io/gradion-ai/ipybox:basic", + workspace_path=workspace, + env={ + "PYTHONDONTWRITEBYTECODE": "1" + }, # Prevent creation of __pycache__ directories created by ipybox root container which cannot be deleted + ) as container: + async with CodeExecutor( + key="test", + port=container.port, + workspace=container.workspace, + ) as executor: + yield executor + + +@pytest.fixture(scope="module") +async def skill_sources(executor): + return await executor.get_module_sources( + module_names=["user_repository.api"], + ) + + +@pytest.fixture +def gemini(skill_sources, request): + use_skill_sources = "skill_sources" in request.node.fixturenames # check if the test requires skill sources + + return Gemini( + model_name="gemini-2.0-flash-exp", + skill_sources=skill_sources if use_skill_sources else None, + temperature=0.0, + max_tokens=1024, + ) + + +@pytest.fixture( + params=[ + pytest.param("claude"), + pytest.param( + "gemini", + marks=pytest.mark.flaky( + max_runs=5, + rerun_filter=rerun_on_google_genai_resource_exhausted(GOOGLE_GENAI_WAIT_TIME), + ), + ), + ] +) +def agent(request, executor): + model = request.getfixturevalue(request.param) + return CodeActAgent(model=model, executor=executor) + + +@pytest.fixture( + params=[ + pytest.param("claude"), + pytest.param( + "gemini_with_skills", + marks=pytest.mark.flaky( + max_runs=5, + rerun_filter=rerun_on_google_genai_resource_exhausted(GOOGLE_GENAI_WAIT_TIME), + ), + ), + ] +) +def agent_with_skills(request, executor): + model = request.getfixturevalue(request.param) + return CodeActAgent(model=model, executor=executor) + + +async def collect_output(agent_turn: CodeActAgentTurn) -> list[CodeActModelResponse | CodeExecutionResult]: + output: list[CodeActModelResponse | CodeExecutionResult] = [] + async for activity in agent_turn.stream(): + match activity: + case CodeActModelTurn() as model_turn: + model_response = await model_turn.response() + output.append(model_response) + + case CodeExecution() as execution: + execution_result = await execution.result() + output.append(execution_result) + return output + + +@pytest.mark.asyncio(loop_scope="module") +async def test_agent_returns_text_response(agent): + agent_turn = agent.run("Do not generate any code. Just respond with the text 'Hello, world!'") + output = await collect_output(agent_turn) + + assert len(output) == 1 + assert isinstance(output[0], CodeActModelResponse) + assert output[0].code is None + assert output[0].text.strip() == "Hello, world!" + + +@pytest.mark.asyncio(loop_scope="module") +async def test_agent_returns_code_response(agent): + agent_turn = agent.run("Generate python code to raise the value of 25 to the power of 5. Only output the result.") + output = await collect_output(agent_turn) + + assert len(output) == 3 + assert isinstance(output[0], CodeActModelResponse) + assert output[0].code is not None + + assert isinstance(output[1], CodeExecutionResult) + assert output[1].is_error is False + + assert isinstance(output[2], CodeActModelResponse) + assert output[2].code is None + + response = await agent_turn.response() + assert "9765625" in response.text + + +@pytest.mark.asyncio(loop_scope="module") +async def test_agent_returns_follow_up_code_response(agent): + agent_turn_1 = agent.run("Generate python code to raise the value of 25 to the power of 5. Only output the result.") + await collect_output(agent_turn_1) + + response_1 = await agent_turn_1.response() + assert "9765625" in response_1.text + + agent_turn_2 = agent.run("What is the square root of the result?") + output = await collect_output(agent_turn_2) + + assert len(output) == 3 + assert isinstance(output[0], CodeActModelResponse) + assert output[0].code is not None + + assert isinstance(output[1], CodeExecutionResult) + assert output[1].is_error is False + + assert isinstance(output[2], CodeActModelResponse) + assert output[2].code is None + + response_2 = await agent_turn_2.response() + assert "3125" in response_2.text + + +@pytest.mark.asyncio(loop_scope="module") +async def test_agent_returns_follow_up_text_response(agent): + agent_turn_1 = agent.run("Generate python code to raise the value of 25 to the power of 5. Only output the result.") + await collect_output(agent_turn_1) + + response_1 = await agent_turn_1.response() + assert "9765625" in response_1.text + + agent_turn_2 = agent.run("Reply with the result again") + output = await collect_output(agent_turn_2) + + assert len(output) == 1 + assert isinstance(output[0], CodeActModelResponse) + assert output[0].code is None + + response_2 = await agent_turn_2.response() + assert "9765625" in response_2.text + + +@pytest.mark.asyncio(loop_scope="module") +async def test_agent_uses_provided_skills(agent, skill_sources): + agent_turn = agent.run( + "What is the name of the user with id 'user-123' in the user repository?", + skill_sources=skill_sources, + ) + await collect_output(agent_turn) + + response = await agent_turn.response() + assert "user_a37c1f54" in response.text + + +@pytest.mark.asyncio(loop_scope="module") +async def test_agent_recovers_from_skill_error(agent, skill_sources): + agent_turn = agent.run( + "What is the email address of the user with id 'user-123' in the user repository?", + skill_sources=skill_sources, + ) + output = await collect_output(agent_turn) + + code_exection_result = next(o for o in output if isinstance(o, CodeExecutionResult)) + assert code_exection_result.is_error is True + + response = await agent_turn.response() + assert "user.a37c1f54@mytestdomain.com" in response.text diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py new file mode 100644 index 0000000..fd80ce2 --- /dev/null +++ b/tests/integration/test_model.py @@ -0,0 +1,129 @@ +import pytest + +from freeact.model.gemini.model.chat import Gemini +from tests.helpers.flaky import rerun_on_google_genai_resource_exhausted + +GOOGLE_GENAI_WAIT_TIME = 5 + +CUSTOM_SKILL_SOURCES = """ + def custom_pow(base, exponent): + return base ** exponent +""" + + +@pytest.fixture +def skill_sources(): + return CUSTOM_SKILL_SOURCES + + +@pytest.fixture +def gemini(skill_sources, request): + use_skill_sources = "skill_sources" in request.node.fixturenames # check if the test requires skill sources + + return Gemini( + model_name="gemini-2.0-flash-exp", + skill_sources=skill_sources if use_skill_sources else None, + temperature=0.0, + max_tokens=1024, + ) + + +@pytest.fixture( + params=[ + pytest.param("claude"), + pytest.param( + "gemini", + marks=pytest.mark.flaky( + max_runs=5, + rerun_filter=rerun_on_google_genai_resource_exhausted(GOOGLE_GENAI_WAIT_TIME), + ), + ), + ] +) +def model(request): + return request.getfixturevalue(request.param) + + +@pytest.mark.asyncio +async def test_model_returns_text_response(model): + turn = model.request("Do not generate any code. Just respond with the text 'Hello, world!'") + response = await turn.response() + + assert response.text.strip() == "Hello, world!" + assert response.code is None + assert response.is_error is False + assert response.token_usage is not None + + +@pytest.mark.asyncio +async def test_model_returns_code_response(model): + turn = model.request( + "Generate python code using the math library to raise the value of 25 to the power of 5. Only output the result." + ) + response = await turn.response() + + assert response.text is not None + assert response.code is not None + assert "math.pow(25, 5)" in response.code + + +@pytest.mark.asyncio +async def test_model_returns_text_response_on_success_feedback(model): + turn_1 = model.request( + "Generate python code using the math library to raise the value of 25 to the power of 5. Only output the result." + ) + response_1 = await turn_1.response() + + assert response_1.text is not None + assert response_1.code is not None + assert "math.pow(25, 5)" in response_1.code + + turn_2 = model.feedback( + feedback="Result: 9765625.0", + is_error=False, + tool_use_id=response_1.tool_use_id, + tool_use_name=response_1.tool_use_name, + ) + response_2 = await turn_2.response() + + assert response_2.text is not None + assert response_2.code is None + assert "9765625" in response_2.text + + +@pytest.mark.asyncio +async def test_model_returns_code_to_recover_on_error_feedback(model): + turn_1 = model.request( + "Generate python code using the math library to raise the value of 25 to the power of 5. Only output the result." + ) + response_1 = await turn_1.response() + + assert response_1.text is not None + assert response_1.code is not None + assert "math.pow(25, 5)" in response_1.code + + turn_2 = model.feedback( + feedback="NameError: name 'math' is not defined", + is_error=True, + tool_use_id=response_1.tool_use_id, + tool_use_name=response_1.tool_use_name, + ) + response_2 = await turn_2.response() + + assert response_2.text is not None + assert response_2.code is not None + assert "import math" in response_2.code + assert "math.pow(25, 5)" in response_2.code + + +@pytest.mark.asyncio +async def test_model_uses_custom_skill_in_code(model, skill_sources): + turn = model.request( + "Generate python code to raise the value of 25 to the power of 5. Use one of the custom Python modules. IMPORTANT: If you cannot find a module only print the text 'Module not found' nothing else.", + skill_sources=skill_sources, + ) + response = await turn.response() + + assert response.text is not None + assert response.code is not None + assert "custom_pow" in response.code