From 74bc65806aa94f9a43751e3ade224a7433a11cd1 Mon Sep 17 00:00:00 2001
From: glados <cstumpf@canto.com>
Date: Thu, 16 Jan 2025 16:00:46 +0100
Subject: [PATCH] Implement integration tests

* Implement integration tests for models
  `claude` and `gemini`
* Implement integration tests for agents
  using `claude` and `gemini`
* Add invoke tasks to run individual tests
* Add integration tests to GitHub CI pipeline

Squashed follow-up commits: "Add integration tests" and three "Update" fixups.
---
 .github/workflows/test.yml                    |  16 +-
 DEVELOPMENT.md                                |  26 ++-
 poetry.lock                                   |  15 +-
 pyproject.toml                                |   1 +
 tasks.py                                      |  35 +++
 tests/__init__.py                             |   3 +
 tests/helpers/__init__.py                     |   0
 tests/helpers/flaky.py                        |  16 ++
 .../skills/user_repository/__init__.py        |   0
 tests/helpers/skills/user_repository/api.py   |  38 ++++
 tests/helpers/skills/user_repository/impl.py  |  20 ++
 tests/integration/conftest.py                 |  41 ++++
 tests/integration/test_agent.py               | 210 ++++++++++++++++++
 tests/integration/test_model.py               | 129 +++++++++++
 14 files changed, 543 insertions(+), 7 deletions(-)
 create mode 100644 tests/helpers/__init__.py
 create mode 100644 tests/helpers/flaky.py
 create mode 100644 tests/helpers/skills/user_repository/__init__.py
 create mode 100644 tests/helpers/skills/user_repository/api.py
 create mode 100644 tests/helpers/skills/user_repository/impl.py
 create mode 100644 tests/integration/conftest.py
 create mode 100644 tests/integration/test_agent.py
 create mode 100644 tests/integration/test_model.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 90b2cc8..9cb6f1b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,7 +2,7 @@ name: Tests
 
 on:
   push:
-    branches: [ main ]
+    branches: [ main, wip-integration-tests ]  # NOTE(review): WIP branch trigger; drop before merging?
   pull_request:
     branches: [ main ]
 
@@ -10,6 +10,10 @@ jobs:
   test:
     runs-on: ubuntu-latest
 
+    env:
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+
     steps:
       - uses: actions/checkout@v4
 
@@ -39,7 +43,13 @@ jobs:
           poetry install
           pip list
 
-      - name: Run tests
+      - name: Run unit tests
+        shell: bash -l {0}
+        run: |
+          poetry run pytest tests/unit
+
+      - name: Run integration tests
         shell: bash -l {0}
         run: |
-          poetry run pytest -s tests
+          docker pull ghcr.io/gradion-ai/ipybox:basic
+          poetry run pytest tests/integration --no-flaky-report
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 13a9cbe..c361644 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -34,14 +34,36 @@ Install pre-commit hooks:
 invoke precommit-install
 ```
 
+Create a `.env` file with [Anthropic](https://console.anthropic.com/settings/keys) and [Gemini](https://aistudio.google.com/app/apikey) API keys:
+
+```env title=".env"
+# Required for the Claude integration tests
+ANTHROPIC_API_KEY=...
+
+# Required for the Gemini integration tests
+GOOGLE_API_KEY=...
+```
+
 Enforce coding conventions (done automatically by pre-commit hooks):
 
 ```bash
 invoke cc
 ```
 
-Run tests:
+Run unit tests:
+
+```bash
+invoke ut
+```
+
+Run integration tests:
+
+```bash
+invoke it
+```
+
+Run all tests:
 
 ```bash
-pytest -s tests
+invoke test
 ```
diff --git a/poetry.lock b/poetry.lock
index 2c84ac1..8e7fdd6 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
 
 [[package]]
 name = "aioconsole"
@@ -610,6 +610,17 @@ docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.
 testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"]
 typing = ["typing-extensions (>=4.12.2)"]
 
+[[package]]
+name = "flaky"
+version = "3.8.1"
+description = "Plugin for pytest that automatically reruns flaky tests."
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "flaky-3.8.1-py2.py3-none-any.whl", hash = "sha256:194ccf4f0d3a22b2de7130f4b62e45e977ac1b5ccad74d4d48f3005dcc38815e"},
+    {file = "flaky-3.8.1.tar.gz", hash = "sha256:47204a81ec905f3d5acfbd61daeabcada8f9d4031616d9bcb0618461729699f5"},
+]
+
 [[package]]
 name = "fonttools"
 version = "4.55.3"
@@ -3249,4 +3260,4 @@ propcache = ">=0.2.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11,<3.14"
-content-hash = "78d5ba40f33642e64ba8fd9584afd30ed6180684ca7781d4fd2ac37a03cbb717"
+content-hash = "609e78e71b123b453a81dc6b34919f42ab85ac50c5d241486184866cc9f44fc9"
diff --git a/pyproject.toml b/pyproject.toml
index ed3e1f4..c8dfbcc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ pre-commit = "^4.0"
 invoke = "^2.2"
 pytest = "^8.3"
 pytest-asyncio = "^0.24.0"
+flaky = "^3.8.1"
 
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
diff --git a/tasks.py b/tasks.py
index fb827b1..9747255 100644
--- a/tasks.py
+++ b/tasks.py
@@ -1,3 +1,5 @@
+from sys import platform
+
 from invoke import task
 
 
@@ -24,3 +26,36 @@ def serve_docs(c):
 @task
 def deploy_docs(c):
     c.run("mkdocs gh-deploy --force")
+
+
+@task
+def test(c, cov=False, cov_report=None):
+    _run_pytest(c, "tests", cov, cov_report)
+
+
+@task(aliases=["ut"])
+def unit_test(c, cov=False, cov_report=None):
+    _run_pytest(c, "tests/unit", cov, cov_report)
+
+
+@task(aliases=["it"])
+def integration_test(c, cov=False, cov_report=None):
+    _run_pytest(c, "tests/integration", cov, cov_report)
+
+
+def _run_pytest(c, test_dir, cov=False, cov_report=None):
+    c.run(f"pytest {test_dir} {_pytest_cov_options(cov, cov_report)} --no-flaky-report", pty=_use_pty())
+
+
+def _use_pty():
+    return platform != "win32"
+
+
+def _pytest_cov_options(use_cov: bool, cov_reports: str | None):
+    """Return pytest-cov flags: ``--cov`` plus ``term`` and each comma-separated extra report type."""
+    if not use_cov:
+        return ""
+
+    # Tolerate spaces and stray commas ("html, xml,") so no empty --cov-report value is emitted.
+    extra_reports = [r.strip() for r in (cov_reports or "").split(",") if r.strip()]
+    return "--cov " + " ".join(f"--cov-report {r}" for r in ["term", *extra_reports])
diff --git a/tests/__init__.py b/tests/__init__.py
index e69de29..adcb99d 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -0,0 +1,3 @@
+from pathlib import Path
+
+TEST_ROOT_PATH = Path(__file__).parent.resolve()
diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/helpers/flaky.py b/tests/helpers/flaky.py
new file mode 100644
index 0000000..0410a99
--- /dev/null
+++ b/tests/helpers/flaky.py
@@ -0,0 +1,16 @@
+import time
+
+from google import genai
+
+
+def rerun_on_google_genai_resource_exhausted(wait_time_s: float):
+    """Return a flaky ``rerun_filter`` that retries Gemini RESOURCE_EXHAUSTED errors after ``wait_time_s`` s."""
+    def _filter(err, name, test, plugin):
+        err_class, err_value, _ = err
+        # Only quota errors are retried; back off solely when a rerun will actually happen.
+        if issubclass(err_class, genai.errors.ClientError) and "RESOURCE_EXHAUSTED" in str(err_value):
+            time.sleep(wait_time_s)
+            return True
+        return False
+
+    return _filter
diff --git a/tests/helpers/skills/user_repository/__init__.py b/tests/helpers/skills/user_repository/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/helpers/skills/user_repository/api.py b/tests/helpers/skills/user_repository/api.py
new file mode 100644
index 0000000..d9ec38a
--- /dev/null
+++ b/tests/helpers/skills/user_repository/api.py
@@ -0,0 +1,38 @@
+from abc import ABC, abstractmethod
+
+
+class UserRepository(ABC):
+    @abstractmethod
+    def find_user_name(self, user_id: str) -> str:
+        """Finds the name of a user in the user repository.
+
+        Args:
+            user_id (str): The id of the user to find.
+
+        Returns:
+            str: The name of the user.
+        """
+        pass
+
+    @abstractmethod
+    def find_user_email(self, user_id: str, invalidate_cache: bool = False) -> str:
+        """Finds the email of a user in the user repository.
+
+        Args:
+            user_id (str): The id of the user to find.
+            invalidate_cache (bool): Whether to invalidate all the caches before lookup.
+                                     Should typically be left as False unless explicitly needed.
+
+        Returns:
+            str: The email of the user.
+        """
+        pass
+
+
+def create_user_repository() -> UserRepository:
+    """
+    Creates a new instance of the UserRepository tool.
+    """
+    from .impl import UserRepositoryImpl
+
+    return UserRepositoryImpl()
diff --git a/tests/helpers/skills/user_repository/impl.py b/tests/helpers/skills/user_repository/impl.py
new file mode 100644
index 0000000..0e23295
--- /dev/null
+++ b/tests/helpers/skills/user_repository/impl.py
@@ -0,0 +1,20 @@
+from .api import UserRepository
+
+USER_ID = "user-123"
+
+
+class UserRepositoryImpl(UserRepository):
+    """Test double for the user repository skill: knows exactly one user, ``USER_ID``."""
+
+    def find_user_name(self, user_id: str) -> str:
+        if user_id.lower().strip() != USER_ID:
+            raise ValueError(f"User {user_id} not found")
+        return "user_a37c1f54"
+
+    def find_user_email(self, user_id: str, invalidate_cache: bool = False) -> str:
+        # The default call (invalidate_cache=False) always fails; callers have to retry with True.
+        if not invalidate_cache:
+            raise ValueError("You must invalidate the cache to get the email address")
+        if user_id.lower().strip() != USER_ID:
+            raise ValueError(f"User {user_id} not found")
+        return "user.a37c1f54@mytestdomain.com"
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
new file mode 100644
index 0000000..1754125
--- /dev/null
+++ b/tests/integration/conftest.py
@@ -0,0 +1,41 @@
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+from dotenv import load_dotenv
+
+from freeact.logger import Logger
+from freeact.model.claude.model import Claude
+from freeact.model.gemini.model.chat import Gemini
+
+
+@pytest.fixture(autouse=True)
+def load_env():
+    load_dotenv()  # pick up API keys from a local `.env` file (see DEVELOPMENT.md)
+
+
+@pytest.fixture
+def logger():
+    logger = MagicMock(spec=Logger)  # mock restricted to the Logger interface
+    logger.context = MagicMock()
+    logger.context.return_value.__aenter__ = AsyncMock()  # lets `async with logger.context(...)` work
+    logger.context.return_value.__aexit__ = AsyncMock()
+    logger.log = AsyncMock()  # awaitable stub for `await logger.log(...)`
+    return logger
+
+
+@pytest.fixture
+def claude(logger):
+    return Claude(
+        logger=logger,
+        model_name="claude-3-5-haiku-20241022",
+        prompt_caching=False,
+    )
+
+
+@pytest.fixture
+def gemini():
+    return Gemini(
+        model_name="gemini-2.0-flash-exp",
+        temperature=0.0,
+        max_tokens=1024,
+    )
diff --git a/tests/integration/test_agent.py b/tests/integration/test_agent.py
new file mode 100644
index 0000000..e6ee965
--- /dev/null
+++ b/tests/integration/test_agent.py
@@ -0,0 +1,210 @@
+import shutil
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from freeact.agent import CodeActAgent, CodeActAgentTurn, CodeExecution, CodeExecutionResult
+from freeact.executor import CodeExecutionContainer, CodeExecutor
+from freeact.model.base import CodeActModelResponse, CodeActModelTurn
+from freeact.model.gemini.model.chat import Gemini
+from tests import TEST_ROOT_PATH
+from tests.helpers.flaky import rerun_on_google_genai_resource_exhausted
+
+GOOGLE_GENAI_WAIT_TIME = 5
+
+
+@pytest.fixture(scope="module")
+async def workspace():
+    with tempfile.TemporaryDirectory() as temp_dir:
+        shared_skills_path = Path(temp_dir) / "skills" / "shared" / "user_repository"
+        shared_skills_path.mkdir(parents=True, exist_ok=True)
+        shutil.copytree(
+            TEST_ROOT_PATH / "helpers" / "skills" / "user_repository", shared_skills_path, dirs_exist_ok=True
+        )
+        yield temp_dir
+
+
+@pytest.fixture(scope="module")
+async def executor(workspace: str):
+    async with CodeExecutionContainer(
+        tag="ghcr.io/gradion-ai/ipybox:basic",
+        workspace_path=workspace,
+        env={
+            "PYTHONDONTWRITEBYTECODE": "1"
+        },  # Prevent creation of __pycache__ directories created by ipybox root container which cannot be deleted
+    ) as container:
+        async with CodeExecutor(
+            key="test",
+            port=container.port,
+            workspace=container.workspace,
+        ) as executor:
+            yield executor
+
+
+@pytest.fixture(scope="module")
+async def skill_sources(executor):
+    return await executor.get_module_sources(
+        module_names=["user_repository.api"],
+    )
+
+
+@pytest.fixture
+def gemini(skill_sources, request):
+    use_skill_sources = "skill_sources" in request.node.fixturenames  # check if the test requires skill sources
+
+    return Gemini(
+        model_name="gemini-2.0-flash-exp",
+        skill_sources=skill_sources if use_skill_sources else None,
+        temperature=0.0,
+        max_tokens=1024,
+    )
+
+
+@pytest.fixture(
+    params=[
+        pytest.param("claude"),
+        pytest.param(
+            "gemini",
+            marks=pytest.mark.flaky(
+                max_runs=5,
+                rerun_filter=rerun_on_google_genai_resource_exhausted(GOOGLE_GENAI_WAIT_TIME),
+            ),
+        ),
+    ]
+)
+def agent(request, executor):
+    model = request.getfixturevalue(request.param)
+    return CodeActAgent(model=model, executor=executor)
+
+
+@pytest.fixture(
+    params=[
+        pytest.param("claude"),
+        pytest.param(
+            "gemini",  # the `gemini` fixture already injects skill sources when the test requests them
+            marks=pytest.mark.flaky(
+                max_runs=5,
+                rerun_filter=rerun_on_google_genai_resource_exhausted(GOOGLE_GENAI_WAIT_TIME),
+            ),
+        ),
+    ]
+)
+def agent_with_skills(request, executor):
+    """CodeActAgent per model; each param names a model fixture resolved lazily via getfixturevalue."""
+    return CodeActAgent(model=request.getfixturevalue(request.param), executor=executor)
+
+
+async def collect_output(agent_turn: CodeActAgentTurn) -> list[CodeActModelResponse | CodeExecutionResult]:
+    """Drain the turn's activity stream, resolving each model turn / code execution to its final result."""
+    collected: list[CodeActModelResponse | CodeExecutionResult] = []
+    async for activity in agent_turn.stream():
+        if isinstance(activity, CodeActModelTurn):
+            # model turns resolve to the model's response
+            collected.append(await activity.response())
+        elif isinstance(activity, CodeExecution):
+            # code executions resolve to their execution result
+            collected.append(await activity.result())
+        # any other activity type is ignored
+    return collected
+
+
+@pytest.mark.asyncio(loop_scope="module")
+async def test_agent_returns_text_response(agent):
+    agent_turn = agent.run("Do not generate any code. Just respond with the text 'Hello, world!'")
+    output = await collect_output(agent_turn)
+
+    assert len(output) == 1
+    assert isinstance(output[0], CodeActModelResponse)
+    assert output[0].code is None
+    assert output[0].text.strip() == "Hello, world!"
+
+
+@pytest.mark.asyncio(loop_scope="module")
+async def test_agent_returns_code_response(agent):
+    agent_turn = agent.run("Generate python code to raise the value of 25 to the power of 5. Only output the result.")
+    output = await collect_output(agent_turn)
+
+    assert len(output) == 3
+    assert isinstance(output[0], CodeActModelResponse)
+    assert output[0].code is not None
+
+    assert isinstance(output[1], CodeExecutionResult)
+    assert output[1].is_error is False
+
+    assert isinstance(output[2], CodeActModelResponse)
+    assert output[2].code is None
+
+    response = await agent_turn.response()
+    assert "9765625" in response.text
+
+
+@pytest.mark.asyncio(loop_scope="module")
+async def test_agent_returns_follow_up_code_response(agent):
+    agent_turn_1 = agent.run("Generate python code to raise the value of 25 to the power of 5. Only output the result.")
+    await collect_output(agent_turn_1)
+
+    response_1 = await agent_turn_1.response()
+    assert "9765625" in response_1.text
+
+    agent_turn_2 = agent.run("What is the square root of the result?")
+    output = await collect_output(agent_turn_2)
+
+    assert len(output) == 3
+    assert isinstance(output[0], CodeActModelResponse)
+    assert output[0].code is not None
+
+    assert isinstance(output[1], CodeExecutionResult)
+    assert output[1].is_error is False
+
+    assert isinstance(output[2], CodeActModelResponse)
+    assert output[2].code is None
+
+    response_2 = await agent_turn_2.response()
+    assert "3125" in response_2.text
+
+
+@pytest.mark.asyncio(loop_scope="module")
+async def test_agent_returns_follow_up_text_response(agent):
+    agent_turn_1 = agent.run("Generate python code to raise the value of 25 to the power of 5. Only output the result.")
+    await collect_output(agent_turn_1)
+
+    response_1 = await agent_turn_1.response()
+    assert "9765625" in response_1.text
+
+    agent_turn_2 = agent.run("Reply with the result again")
+    output = await collect_output(agent_turn_2)
+
+    assert len(output) == 1
+    assert isinstance(output[0], CodeActModelResponse)
+    assert output[0].code is None
+
+    response_2 = await agent_turn_2.response()
+    assert "9765625" in response_2.text
+
+
+@pytest.mark.asyncio(loop_scope="module")
+async def test_agent_uses_provided_skills(agent, skill_sources):
+    agent_turn = agent.run(
+        "What is the name of the user with id 'user-123' in the user repository?",
+        skill_sources=skill_sources,
+    )
+    await collect_output(agent_turn)
+
+    response = await agent_turn.response()
+    assert "user_a37c1f54" in response.text
+
+
+@pytest.mark.asyncio(loop_scope="module")
+async def test_agent_recovers_from_skill_error(agent, skill_sources):
+    """The first code execution must fail, and the agent must still end up answering with the email."""
+    agent_turn = agent.run(
+        "What is the email address of the user with id 'user-123' in the user repository?",
+        skill_sources=skill_sources,
+    )
+    output = await collect_output(agent_turn)
+    # A default avoids StopIteration escaping the coroutine (it would surface as RuntimeError, PEP 479).
+    code_execution_result = next((o for o in output if isinstance(o, CodeExecutionResult)), None)
+    assert code_execution_result is not None and code_execution_result.is_error is True
+    response = await agent_turn.response()
+    assert "user.a37c1f54@mytestdomain.com" in response.text
diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py
new file mode 100644
index 0000000..fd80ce2
--- /dev/null
+++ b/tests/integration/test_model.py
@@ -0,0 +1,129 @@
+import pytest
+
+from freeact.model.gemini.model.chat import Gemini
+from tests.helpers.flaky import rerun_on_google_genai_resource_exhausted
+
+GOOGLE_GENAI_WAIT_TIME = 5
+
+CUSTOM_SKILL_SOURCES = """
+    def custom_pow(base, exponent):
+        return base ** exponent
+"""
+
+
+@pytest.fixture
+def skill_sources():
+    return CUSTOM_SKILL_SOURCES
+
+
+@pytest.fixture
+def gemini(skill_sources, request):
+    use_skill_sources = "skill_sources" in request.node.fixturenames  # check if the test requires skill sources
+
+    return Gemini(
+        model_name="gemini-2.0-flash-exp",
+        skill_sources=skill_sources if use_skill_sources else None,
+        temperature=0.0,
+        max_tokens=1024,
+    )
+
+
+@pytest.fixture(
+    params=[
+        pytest.param("claude"),
+        pytest.param(
+            "gemini",
+            marks=pytest.mark.flaky(
+                max_runs=5,
+                rerun_filter=rerun_on_google_genai_resource_exhausted(GOOGLE_GENAI_WAIT_TIME),
+            ),
+        ),
+    ]
+)
+def model(request):
+    return request.getfixturevalue(request.param)
+
+
+@pytest.mark.asyncio
+async def test_model_returns_text_response(model):
+    turn = model.request("Do not generate any code. Just respond with the text 'Hello, world!'")
+    response = await turn.response()
+
+    assert response.text.strip() == "Hello, world!"
+    assert response.code is None
+    assert response.is_error is False
+    assert response.token_usage is not None
+
+
+@pytest.mark.asyncio
+async def test_model_returns_code_response(model):
+    turn = model.request(
+        "Generate python code using the math library to raise the value of 25 to the power of 5. Only output the result."
+    )
+    response = await turn.response()
+
+    assert response.text is not None
+    assert response.code is not None
+    assert "math.pow(25, 5)" in response.code
+
+
+@pytest.mark.asyncio
+async def test_model_returns_text_response_on_success_feedback(model):
+    turn_1 = model.request(
+        "Generate python code using the math library to raise the value of 25 to the power of 5. Only output the result."
+    )
+    response_1 = await turn_1.response()
+
+    assert response_1.text is not None
+    assert response_1.code is not None
+    assert "math.pow(25, 5)" in response_1.code
+
+    turn_2 = model.feedback(
+        feedback="Result: 9765625.0",
+        is_error=False,
+        tool_use_id=response_1.tool_use_id,
+        tool_use_name=response_1.tool_use_name,
+    )
+    response_2 = await turn_2.response()
+
+    assert response_2.text is not None
+    assert response_2.code is None
+    assert "9765625" in response_2.text
+
+
+@pytest.mark.asyncio
+async def test_model_returns_code_to_recover_on_error_feedback(model):
+    turn_1 = model.request(
+        "Generate python code using the math library to raise the value of 25 to the power of 5. Only output the result."
+    )
+    response_1 = await turn_1.response()
+
+    assert response_1.text is not None
+    assert response_1.code is not None
+    assert "math.pow(25, 5)" in response_1.code
+
+    turn_2 = model.feedback(
+        feedback="NameError: name 'math' is not defined",
+        is_error=True,
+        tool_use_id=response_1.tool_use_id,
+        tool_use_name=response_1.tool_use_name,
+    )
+    response_2 = await turn_2.response()
+
+    assert response_2.text is not None
+    assert response_2.code is not None
+    assert "import math" in response_2.code
+    assert "math.pow(25, 5)" in response_2.code
+
+
+@pytest.mark.asyncio
+async def test_model_uses_custom_skill_in_code(model, skill_sources):
+    turn = model.request(
+        "Generate python code to raise the value of 25 to the power of 5. Use one of the custom Python modules. IMPORTANT: If you cannot find a module only print the text 'Module not found' nothing else.",
+        skill_sources=skill_sources,
+    )
+    response = await turn.response()
+
+    assert response.text is not None
+    assert response.code is not None
+    assert "custom_pow" in response.code