From 5453a6de7e265560c6baed5b163771b7195a3e63 Mon Sep 17 00:00:00 2001
From: Mayk Caldas
Date: Tue, 26 Nov 2024 17:33:42 -0800
Subject: [PATCH] Added the MultipleCompletionLLMModel class used in LDP (#3)

* Added the MultipleCompletionLLMModel class used in LDP

---------

Co-authored-by: Mayk Caldas
---
 llmclient/__init__.py | 28 +-
 llmclient/constants.py | 5 +
 llmclient/embeddings.py | 6 +-
 llmclient/llms.py | 243 +++++++++++++-
 llmclient/types.py | 19 +-
 llmclient/utils.py | 5 +
 ...ionLLMModel.test_model[gpt-3.5-turbo].yaml | 104 ++++++
 ...CompletionLLMModel.test_output_schema.yaml | 102 ++++++
 ...st_parameterizing_tool_from_arg_union.yaml | 109 +++++++
 ...image_message[gpt-4o-mini-2024-07-18].yaml | 111 +++++++
 tests/test_embeddings.py | 112 ++++++-
 tests/test_llms.py | 300 +++++++++++-------
 uv.lock | 2 +-
 13 files changed, 1021 insertions(+), 125 deletions(-)
 create mode 100644 tests/cassettes/TestMultipleCompletionLLMModel.test_model[gpt-3.5-turbo].yaml
 create mode 100644 tests/cassettes/TestMultipleCompletionLLMModel.test_output_schema.yaml
 create mode 100644 tests/cassettes/TestMultipleCompletionLLMModel.test_parameterizing_tool_from_arg_union.yaml
 create mode 100644 tests/cassettes/TestMultipleCompletionLLMModel.test_text_image_message[gpt-4o-mini-2024-07-18].yaml

diff --git a/llmclient/__init__.py b/llmclient/__init__.py
index e1769b2..ea14ec7 100644
--- a/llmclient/__init__.py
+++ b/llmclient/__init__.py
@@ -1,7 +1,31 @@
-from llmclient.llms import LLMModel
-from llmclient.types import LLMResult
+from llmclient.embeddings import (
+    EmbeddingModel,
+    HybridEmbeddingModel,
+    LiteLLMEmbeddingModel,
+    SentenceTransformerEmbeddingModel,
+    SparseEmbeddingModel,
+    embedding_model_factory,
+)
+from llmclient.llms import LiteLLMModel, LLMModel, MultipleCompletionLLMModel
+from llmclient.types import (
+    Chunk,
+    Embeddable,
+    LLMResult,
+)
+from llmclient.version import __version__
 
 __all__ = [
+    "Chunk",
+    "Embeddable",
+    "EmbeddingModel",
+    "HybridEmbeddingModel",
     "LLMModel",
     "LLMResult",
+    "LiteLLMEmbeddingModel",
+    "LiteLLMModel",
+    "MultipleCompletionLLMModel",
+    "SentenceTransformerEmbeddingModel",
+    "SparseEmbeddingModel",
+    "__version__",
+    "embedding_model_factory",
 ]
diff --git a/llmclient/constants.py b/llmclient/constants.py
index 3220d62..7a4ce97 100644
--- a/llmclient/constants.py
+++ b/llmclient/constants.py
@@ -2,7 +2,12 @@
 
 import litellm
 
+# Estimate from OpenAI's FAQ
+# https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
 CHARACTERS_PER_TOKEN_ASSUMPTION: float = 4.0
+# Extra tokens added by the user/role framing of a message,
+# which must be counted when applying rate limits.
+# Taken from empirical counts in tests.
 EXTRA_TOKENS_FROM_USER_ROLE: int = 7
 
 MODEL_COST_MAP = litellm.get_model_cost_map("")
diff --git a/llmclient/embeddings.py b/llmclient/embeddings.py
index 2f4ca24..cba5a1a 100644
--- a/llmclient/embeddings.py
+++ b/llmclient/embeddings.py
@@ -16,11 +16,7 @@
 
 from llmclient.constants import CHARACTERS_PER_TOKEN_ASSUMPTION, MODEL_COST_MAP
 from llmclient.rate_limiter import GLOBAL_LIMITER
-
-
-def get_litellm_retrying_config(timeout: float = 60.0) -> dict[str, Any]:
-    """Get retrying configuration for litellm.acompletion and litellm.aembedding."""
-    return {"num_retries": 3, "timeout": timeout}
+from llmclient.utils import get_litellm_retrying_config
 
 
 class EmbeddingModes(StrEnum):
diff --git a/llmclient/llms.py b/llmclient/llms.py
index aae4e89..47c1799 100644
--- a/llmclient/llms.py
+++ b/llmclient/llms.py
@@ -1,6 +1,7 @@
import asyncio import contextlib import functools +import json from abc import ABC from collections.abc import ( AsyncGenerator, @@ -13,13 +14,18 @@ from inspect import isasyncgenfunction, signature from typing import ( Any, + ClassVar, + Self, TypeVar, cast, ) import litellm from aviary.core import ( + Message, + Tool, ToolRequestMessage, + ToolsAdapter, ToolSelector, ) from pydantic import ( @@ -41,7 +47,7 @@ from llmclient.prompts import default_system_prompt from llmclient.rate_limiter import GLOBAL_LIMITER from llmclient.types import Chunk, LLMResult -from llmclient.utils import is_coroutine_callable +from llmclient.utils import get_litellm_retrying_config, is_coroutine_callable if not IS_PYTHON_BELOW_312: _DeploymentTypedDictValidator = TypeAdapter( @@ -120,11 +126,6 @@ async def do_callbacks( f(*args, **kwargs) -def get_litellm_retrying_config(timeout: float = 60.0) -> dict[str, Any]: - """Get retrying configuration for litellm.acompletion and litellm.aembedding.""" - return {"num_retries": 3, "timeout": timeout} - - class LLMModel(ABC, BaseModel): model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) @@ -582,3 +583,233 @@ async def select_tool( model_name=self.name, acompletion=self.router.acompletion ) return await tool_selector(*selection_args, **selection_kwargs) + + +class MultipleCompletionLLMModel(BaseModel): + """Run n completions at once, all starting from the same messages.""" + + model_config = ConfigDict(extra="forbid") + + # this should keep the original model + # if fine-tuned, this should still refer to the base model + name: str = "unknown" + config: dict = Field( + default={ + "model": "gpt-3.5-turbo", # Default model should have cheap input/output for testing + "temperature": 0.1, + } + ) + encoding: Any | None = None + + def __str__(self) -> str: + return f"{type(self).__name__} {self.name}" + + @model_validator(mode="after") + def set_model_name(self) -> Self: + if ( + self.config.get("model") in {"gpt-3.5-turbo", None} + and self.name != "unknown" + ) or (self.name != "unknown" and "model" not in self.config): + self.config["model"] = self.name + elif "model" in self.config and self.name == "unknown": + self.name = self.config["model"] + # note we do not consider case where both are set + # because that could be true if the model is fine-tuned + return self + + async def achat( + self, messages: Iterable[Message], **kwargs + ) -> litellm.ModelResponse: + return await litellm.acompletion( + messages=[m.model_dump(by_alias=True) for m in messages], + **(self.config | kwargs), + ) + + async def achat_iter(self, messages: Iterable[Message], **kwargs) -> AsyncGenerator: + return cast( + AsyncGenerator, + await litellm.acompletion( + messages=[m.model_dump(by_alias=True) for m in messages], + stream=True, + stream_options={ + "include_usage": True, # Included to get prompt token counts + }, + **(self.config | kwargs), + ), + ) + + # SEE: https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice + # > `required` means the model must call one or more tools. + TOOL_CHOICE_REQUIRED: ClassVar[str] = "required" + + async def call( # noqa: C901, PLR0915 + self, + messages: list[Message], + callbacks: list[Callable] | None = None, + output_type: type[BaseModel] | None = None, + tools: list[Tool] | None = None, + tool_choice: Tool | str | None = TOOL_CHOICE_REQUIRED, + **chat_kwargs, + ) -> list[LLMResult]: + start_clock = asyncio.get_running_loop().time() + + # Deal with tools. 
Note OpenAI throws a 400 response if tools is empty: + # > Invalid 'tools': empty array. Expected an array with minimum length 1, + # > but got an empty array instead. + # So, circumvent this behavior if tools in (None, []) + if tools: + chat_kwargs["tools"] = ToolsAdapter.dump_python( + tools, exclude_none=True, by_alias=True + ) + if tool_choice is not None: + chat_kwargs["tool_choice"] = ( + { + "type": "function", + "function": {"name": tool_choice.info.name}, + } + if isinstance(tool_choice, Tool) + else tool_choice + ) + + # deal with specifying output type + if output_type is not None: + schema = json.dumps(output_type.model_json_schema(mode="serialization")) + schema_msg = f"Respond following this JSON schema:\n\n{schema}" + # Get the system prompt and its index, or the index to add it + i, system_prompt = next( + ((i, m) for i, m in enumerate(messages) if m.role == "system"), + (0, None), + ) + messages = [ + *messages[:i], + ( + system_prompt.append_text(schema_msg, inplace=False) + if system_prompt + else Message(role="system", content=schema_msg) + ), + *messages[i + 1 if system_prompt else i :], + ] + chat_kwargs["response_format"] = {"type": "json_object"} + + # add static configuration to kwargs + chat_kwargs = self.config | chat_kwargs + n = chat_kwargs.get("n", 1) # number of completions + if n < 1: + raise ValueError("Number of completions (n) must be >= 1.") + + prompt = [ + ( + m + if not isinstance(m, ToolRequestMessage) or m.tool_calls + # OpenAI doesn't allow for empty tool_calls lists, so downcast empty + # ToolRequestMessage to Message here + else Message(role=m.role, content=m.content) + ) + for m in messages + ] + results: list[LLMResult] = [] + + if callbacks is None: + completion: litellm.ModelResponse = await self.achat(prompt, **chat_kwargs) + if output_type is not None: + validate_json_completion(completion, output_type) + + for choice in completion.choices: + if isinstance(choice, litellm.utils.StreamingChoices): + raise NotImplementedError("Streaming is not yet supported.") + + if ( + tools is not None # Allows for empty tools list + or choice.finish_reason == "tool_calls" + or (getattr(choice.message, "tool_calls", None) is not None) + ): + serialized_choice_message = choice.message.model_dump() + serialized_choice_message["tool_calls"] = ( + serialized_choice_message.get("tool_calls") or [] + ) + output_messages: list[Message | ToolRequestMessage] = [ + ToolRequestMessage(**serialized_choice_message) + ] + else: + output_messages = [Message(**choice.message.model_dump())] + + results.append( + LLMResult( + model=self.name, + config=chat_kwargs, + prompt=prompt, + messages=output_messages, + logprob=sum_logprobs(choice), + system_fingerprint=completion.system_fingerprint, + # Note that these counts are aggregated over all choices + completion_count=completion.usage.completion_tokens, # type: ignore[attr-defined,unused-ignore] + prompt_count=completion.usage.prompt_tokens, # type: ignore[attr-defined,unused-ignore] + ) + ) + else: + if tools: + raise NotImplementedError("Using tools with callbacks is not supported") + if n > 1: + raise NotImplementedError( + "Multiple completions with callbacks is not supported" + ) + result = LLMResult(model=self.name, config=chat_kwargs, prompt=prompt) + + sync_callbacks = [f for f in callbacks if not is_coroutine_callable(f)] + async_callbacks = [f for f in callbacks if is_coroutine_callable(f)] + stream_completion = await self.achat_iter(messages, **chat_kwargs) + text_result = [] + role = "assistant" + + async for 
chunk in stream_completion:
+                delta = chunk.choices[0].delta
+                role = delta.role or role
+                if delta.content:
+                    s = delta.content
+                    if result.seconds_to_first_token == 0:
+                        result.seconds_to_first_token = (
+                            asyncio.get_running_loop().time() - start_clock
+                        )
+                    text_result.append(s)
+                    [await f(s) for f in async_callbacks]
+                    [f(s) for f in sync_callbacks]
+                if hasattr(chunk, "usage"):
+                    result.prompt_count = chunk.usage.prompt_tokens
+
+            output = "".join(text_result)
+            result.completion_count = litellm.token_counter(
+                model=self.name,
+                text=output,
+            )
+            # TODO: figure out how tools stream, and log probs
+            result.messages = [Message(role=role, content=output)]
+            results.append(result)
+
+        if not results:
+            # This happens in unit tests. We should probably not keep this block around
+            # long-term. Previously, we would emit an empty ToolRequestMessage if
+            # completion.choices were empty, so I am replicating that here.
+            results.append(
+                LLMResult(
+                    model=self.name,
+                    config=chat_kwargs,
+                    prompt=prompt,
+                    messages=[ToolRequestMessage(tool_calls=[])],
+                )
+            )
+
+        end_clock = asyncio.get_running_loop().time()
+
+        for result in results:
+            # Manually update prompt count if not set, which can
+            # happen if the target model doesn't support 'include_usage'
+            if not result.prompt_count and result.messages:
+                result.prompt_count = litellm.token_counter(
+                    model=self.name,
+                    messages=[m.model_dump() for m in result.messages],
+                )
+
+            # Record the total elapsed time up to the last token
+            result.seconds_to_last_token = end_clock - start_clock
+
+        return results
diff --git a/llmclient/types.py b/llmclient/types.py
index d4eeea1..3ad1df9 100644
--- a/llmclient/types.py
+++ b/llmclient/types.py
@@ -5,6 +5,7 @@
 from uuid import UUID, uuid4
 
 import litellm
+from aviary.core import Message
 from pydantic import (
     BaseModel,
     ConfigDict,
@@ -67,11 +68,14 @@ class LLMResult(BaseModel):
         alias="answer_id",
     )
     name: str | None = None
-    prompt: str | list[dict] | None = Field(
+    prompt: str | list[dict] | Message | list[Message] | None = Field(
         default=None,
         description="Optional prompt (str) or list of serialized prompts (list[dict]).",
     )
     text: str = ""
+    messages: list[Message] | None = Field(
+        default=None, description="Messages received from the LLM."
+    )
     prompt_count: int = 0
     completion_count: int = 0
     model: str
@@ -82,6 +86,9 @@ class LLMResult(BaseModel):
     seconds_to_last_token: float = Field(
         default=0.0, description="Delta time (sec) to last response token's arrival."
     )
+    logprob: float | None = Field(
+        default=None, description="Sum of logprobs in the completion."
+    )
 
     def __str__(self) -> str:
         return self.text
@@ -98,3 +105,13 @@ def cost(self) -> float:
         except KeyError:
             logger.warning(f"Could not find cost for model {self.model}.")
             return 0.0
+
+    # These two methods were implemented in ldp, but not in pqa. Check if they're necessary
+    # @property
+    # def provider(self) -> str:
+    #     """Get the model provider's name (e.g.
"openai", "mistral").""" + # return litellm.get_llm_provider(self.model)[1] + + # def get_supported_openai_params(self) -> list[str] | None: + # """Get the supported OpenAI parameters for the model.""" + # return litellm.get_supported_openai_params(self.model) diff --git a/llmclient/utils.py b/llmclient/utils.py index 304bf95..e99c8e9 100644 --- a/llmclient/utils.py +++ b/llmclient/utils.py @@ -12,6 +12,11 @@ import pymupdf +def get_litellm_retrying_config(timeout: float = 60.0) -> dict[str, Any]: + """Get retrying configuration for litellm.acompletion and litellm.aembedding.""" + return {"num_retries": 3, "timeout": timeout} + + def encode_image_to_base64(img: "np.ndarray") -> str: """Encode an image to a base64 string, to be included as an image_url in a Message.""" try: diff --git a/tests/cassettes/TestMultipleCompletionLLMModel.test_model[gpt-3.5-turbo].yaml b/tests/cassettes/TestMultipleCompletionLLMModel.test_model[gpt-3.5-turbo].yaml new file mode 100644 index 0000000..0f134c0 --- /dev/null +++ b/tests/cassettes/TestMultipleCompletionLLMModel.test_model[gpt-3.5-turbo].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"role": "system", "content": "Respond with single words."}, + {"role": "user", "content": "Hello, how are you?"}], "model": "gpt-3.5-turbo", + "n": 2}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "161" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA9RTPU/DMBDd8yssz23Vph+IbjBQBIIJBoRQ5NqXxNTxWfYFUar+d+S0NKlaJFYW + D+/de353Z28SxrhWfM64LAXJypn+1csQ1rPF0/P9nf6aSPexus4fdfGgb8Hd8F5U4PIdJP2oBhIr + Z4A02h0tPQiC6Dq6GKez2eVwmjZEhQpMlBWO+uPBtE+1X2J/OEqne2WJWkLgc/aaMMbYpjljRqvg + k8/ZsPeDVBCCKIDPD0WMcY8mIlyEoAMJS7zXkhItgW1iLxBVl/KQ10HEaLY2Zo9vD3cZLJzHZdjz + BzzXVocy8yAC2ugbCB1POuKTBkb/poGEsbdmKfVRTO48Vo4ywhXYaJimOzvePoMOuecISZgOPOmd + McsUkNAmdEbCpZAlqFbZPgBRK40dojv20yznvHdta1v8xb4lpARHoDLnQWl53G9b5iH+kd/KDiNu + AvOwDgRVlmtbgHdeN0tuNrlNvgEAAP//AwDXzTxTpAMAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e8e2a2a8b36ebf3-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 27 Nov 2024 00:57:32 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=6uK9hXTAaX3GtQrh0wymC8uS1CnY22_CRYaLunDpYWc-1732669052-1.0.1.1-OMQFbfRQjfc9bjIfLNCZrZa25fy_pzJ61f1ImKFfmaA0uPjcJncalq9EcklmUYJavMmbHKxk.JuvT7OjHmVBkw; + path=/; expires=Wed, 27-Nov-24 01:27:32 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=AyWxlj6WhbGDdCX9HNNYmuke4avEYVHR7LCnqCXSIyo-1732669052770-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "108" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "12000" + 
x-ratelimit-limit-tokens: + - "1000000" + x-ratelimit-remaining-requests: + - "11999" + x-ratelimit-remaining-tokens: + - "999953" + x-ratelimit-reset-requests: + - 5ms + x-ratelimit-reset-tokens: + - 2ms + x-request-id: + - req_e85a8fca3654ef15d3b91c4e8b039c0d + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestMultipleCompletionLLMModel.test_output_schema.yaml b/tests/cassettes/TestMultipleCompletionLLMModel.test_output_schema.yaml new file mode 100644 index 0000000..f929fa6 --- /dev/null +++ b/tests/cassettes/TestMultipleCompletionLLMModel.test_output_schema.yaml @@ -0,0 +1,102 @@ +interactions: + - request: + body: + '{"messages": [{"role": "system", "content": "Respond following this JSON + schema:\n\n{\"properties\": {\"name\": {\"title\": \"Name\", \"type\": \"string\"}, + \"age\": {\"title\": \"Age\", \"type\": \"integer\"}}, \"required\": [\"name\", + \"age\"], \"title\": \"DummyOutputSchema\", \"type\": \"object\"}"}, {"role": + "user", "content": "My name is Claude and I am 1 year old. What is my name and + age?"}], "model": "gpt-3.5-turbo", "n": 2, "response_format": {"type": "json_object"}}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "480" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA9xTy07DMBC85yusPaeING0RuSFOgBAnUBFBkWtvU4NjW/ZGolT9d+T0kVSAxJlL + DjM7oxnvZpMwBkpCwUCsOInG6dHV/PNpLm7C7ePdOl9ML++nD8/8rnlb4/x+AmlU2MUbCjqozoRt + nEZS1uxo4ZETRtfsIh/PZpPZOO+IxkrUUVY7GuVn0xG1fmFH59l4uleurBIYoGAvCWOMbbpvzGgk + fkDBztMD0mAIvEYojkOMgbc6IsBDUIG4IUh7UlhDaLrYm9JEqATDGyyhYCVca95KLCE9ULzumKw0 + 26GLx2UbeGxhWq33+PYYS9vaebsIe/6IL5VRYVV55MGaGCGQdZAMxN+6Zv+xa8LYa7fq9qQROG8b + RxXZdzTR8DLf2UF/XD2ZT/YkWeK6x7PxRfqDXSWRuNJh8H4guFih7KX9YfFWKjsghjv6nuYn711x + Zeq/2PeEEOgIZeU8SiVOG/djHuO/99vY8ZG7wBDWgbCplsrU6J1X3UV0u9wmXwAAAP//AwCJiYkd + /AMAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e8dbe092bc16441-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 26 Nov 2024 23:43:44 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "309" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "12000" + x-ratelimit-limit-tokens: + - "1000000" + x-ratelimit-remaining-requests: + - "11999" + x-ratelimit-remaining-tokens: + - "999894" + x-ratelimit-reset-requests: + - 5ms + x-ratelimit-reset-tokens: + - 6ms + x-request-id: + - req_5f3e5551351be13c737ce50667811e1d + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestMultipleCompletionLLMModel.test_parameterizing_tool_from_arg_union.yaml b/tests/cassettes/TestMultipleCompletionLLMModel.test_parameterizing_tool_from_arg_union.yaml new file mode 100644 index 0000000..9eb680c --- /dev/null +++ 
b/tests/cassettes/TestMultipleCompletionLLMModel.test_parameterizing_tool_from_arg_union.yaml @@ -0,0 +1,109 @@ +interactions: + - request: + body: + '{"messages": [{"role": "user", "content": "Please win."}], "model": "gpt-3.5-turbo", + "n": 2, "tool_choice": "required", "tools": [{"type": "function", "function": + {"name": "play", "description": "Play one turn by choosing a move.", "parameters": + {"type": "object", "properties": {"move": {"anyOf": [{"type": "integer"}, {"type": + "null"}], "description": "Choose an integer to lose, choose None to win.", "title": + "Move"}}, "required": ["move"]}}}]}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "448" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA+xUTWvbQBC961csc7aD7dRKrVuhh0LakFLqBpIi1quRtPV+dXcVahv/97KSopWd + FHINVAexzJs3H29m95AQAryAjACrqWfSiOmHu/36x6fN17IW2/W327V8vOViWd18/DL7rWASGHrz + C5l/Yl0wLY1Az3UPM4vUY4g6v7pcpOm7dLFoAakLFIFWGT+9vFhOfWM3ejqbL5Y9s9acoYOM3CeE + EHJo/6FGVeAfyMhs8mSR6BytELLBiRCwWgQLUOe481R5mESQaeVRhbJVI8QI8FqLnFEhYuLuO4zO + USgqRH6DV/v1kq3S+npfbRE/f+fv765LPsrXhd6ZtqCyUWwQaIQP9uwsGSGgqGy5RtDdGY8QoLZq + JCofaobDA0j9iA+QhdaOcOJ8TF46/xwJYLFsHBW9Mr39OEgtdGWs3rgz5aDkirs6t0hd2wE4r02X + uyc/m9/8//zexvySPgM0J1MCY7U0Pvd6iyoETFddOIiPQAS7a99OyFMR7at+DU6j5QV6ytshDnvD + KKuxiMx4/2lTcD0Cxmv3vJiXYnd9c1W9JnwEGEPjsciNxYKz04ajm8XwRP7LbdC4LRjcznmUeclV + hdZYPix5ckz+AgAA//8DAI3rDWKjBQAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e8dbe004a387afa-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 26 Nov 2024 23:43:42 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=KgM_CbOJ8Or7Ox8J9BqngSRIXR83eqEH5DurocJaLj8-1732664622-1.0.1.1-YOsognHowLn83_5.SYdcV3Mk6t0JC0F2tRMWWu7zhfUKaJ0nDeSqQxaG2ouaLAaqzGd4v.AgNvIt1dINZ.gNYQ; + path=/; expires=Wed, 27-Nov-24 00:13:42 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=RGkXqcA_646HnnLEb..txOv0aByKWatHt2QrW.dppHY-1732664622457-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "229" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "12000" + x-ratelimit-limit-tokens: + - "1000000" + x-ratelimit-remaining-requests: + - "11999" + x-ratelimit-remaining-tokens: + - "999963" + x-ratelimit-reset-requests: + - 5ms + x-ratelimit-reset-tokens: + - 2ms + x-request-id: + - req_b5bea59bcffa604abbecf90237318698 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestMultipleCompletionLLMModel.test_text_image_message[gpt-4o-mini-2024-07-18].yaml b/tests/cassettes/TestMultipleCompletionLLMModel.test_text_image_message[gpt-4o-mini-2024-07-18].yaml new file mode 100644 index 0000000..f648b73 
--- /dev/null +++ b/tests/cassettes/TestMultipleCompletionLLMModel.test_text_image_message[gpt-4o-mini-2024-07-18].yaml @@ -0,0 +1,111 @@ +interactions: + - request: + body: + '{"messages": [{"role": "user", "content": [{"type": "image_url", "image_url": + {"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAIAAAD8GO2jAAAAKElEQVR4nO3NMQEAAAjDMMC/ZzDBvlRA01vZJvwHAAAAAAAAAAAAbx2jxAE/i2AjOgAAAABJRU5ErkJggg=="}}, + {"type": "text", "text": "What color is this square? Respond only with the color + name."}]}], "model": "gpt-4o-mini-2024-07-18", "n": 2}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "381" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA9STP2/CMBDF93wKyzOpQqBA2bqgVuqEVNqqqiJjH8Hg+Cz7kPpHfPfKIZAgWqlr + lwz3u/fyfGd/JYxxrfiUcbkWJCtn0tvnz8XLbL6dfcJGDhZLhLunhw096vsqk7wXFbjcgKSj6kpi + 5QyQRnvA0oMgiK798SAfjYajfFiDChWYKCsdpUNMK211mmf5MM3GaX/SqNeoJQQ+Za8JY4x91d+Y + 0yp451OW9Y6VCkIQJfDpqYkx7tHEChch6EDCEu+1UKIlsHX0Oagu8bDaBRHT2Z0xTX1/+pXB0nlc + hoaf6ittdVgXHkRAG20DoeNJR3yRv/9f8ieMvdUr2Z2l5M5j5agg3IKNhpPrvNkJb69Ci/OGEZIw + Z6ojOTMsFJDQJnSmwqWQa1Cttr0CYqc0dkB38pdpfvI+HF3b8i/2LZASHIEqnAel5fmJ2zYP8aX8 + 1nYacx2Yh49AUBUrbUvwzuvDnleuGCjo55NJNrrhyT75BgAA//8DAHMIZsi1AwAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e8dbe02c857176b-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 26 Nov 2024 23:43:44 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=FLcfXsLHocORp9BaX8WRXlKmbbwQLQT1aaZHgSHfOwM-1732664624-1.0.1.1-EMMzivzKoNvNES87qYj.8X3tabrd2Y0z7mUhxOXUBl_ApeNQMRYqOeomdi8VD9YY1AAfg9NvNYhuESasFB2r1Q; + path=/; expires=Wed, 27-Nov-24 00:13:44 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=xB_0vUwejpcRnbBxzZuL293zZT452oWlLskSKDKBGqE-1732664624374-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "1744" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-input-images: + - "50000" + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-input-images: + - "49999" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999187" + x-ratelimit-reset-input-images: + - 1ms + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_d887de09a40a121d235f796b4dc6a0c6 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py index c91cc5a..5c5f2d7 100644 --- a/tests/test_embeddings.py +++ b/tests/test_embeddings.py @@ -1,6 +1,13 @@ import pytest -from llmclient.embeddings import MODEL_COST_MAP, LiteLLMEmbeddingModel +from llmclient.embeddings 
import ( + MODEL_COST_MAP, + HybridEmbeddingModel, + LiteLLMEmbeddingModel, + SentenceTransformerEmbeddingModel, + SparseEmbeddingModel, + embedding_model_factory, +) class TestLiteLLMEmbeddingModel: @@ -65,3 +72,106 @@ async def test_embed_documents(self, embedding_model, mocker): embeddings = await embedding_model.embed_documents(texts) assert embeddings == [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] + + +@pytest.mark.asyncio +async def test_embedding_model_factory_sentence_transformer() -> None: + """Test that the factory creates a SentenceTransformerEmbeddingModel when given an 'st-' prefix.""" + embedding = "st-multi-qa-MiniLM-L6-cos-v1" + model = embedding_model_factory(embedding) + assert isinstance( + model, SentenceTransformerEmbeddingModel + ), "Factory did not create SentenceTransformerEmbeddingModel" + assert model.name == "multi-qa-MiniLM-L6-cos-v1", "Incorrect model name assigned" + + # Test embedding functionality + texts = ["Hello world", "Test sentence"] + embeddings = await model.embed_documents(texts) + assert len(embeddings) == 2, "Incorrect number of embeddings returned" + assert all( + isinstance(embed, list) for embed in embeddings + ), "Embeddings are not in list format" + assert all(len(embed) > 0 for embed in embeddings), "Embeddings should not be empty" + + +@pytest.mark.asyncio +async def test_embedding_model_factory_hybrid_with_sentence_transformer() -> None: + """Test that the factory creates a HybridEmbeddingModel containing a SentenceTransformerEmbeddingModel.""" + embedding = "hybrid-st-multi-qa-MiniLM-L6-cos-v1" + model = embedding_model_factory(embedding) + assert isinstance( + model, HybridEmbeddingModel + ), "Factory did not create HybridEmbeddingModel" + assert len(model.models) == 2, "Hybrid model should contain two component models" + assert isinstance( + model.models[0], SentenceTransformerEmbeddingModel + ), "First component should be SentenceTransformerEmbeddingModel" + assert isinstance( + model.models[1], SparseEmbeddingModel + ), "Second component should be SparseEmbeddingModel" + + # Test embedding functionality + texts = ["Hello world", "Test sentence"] + embeddings = await model.embed_documents(texts) + assert len(embeddings) == 2, "Incorrect number of embeddings returned" + expected_length = len((await model.models[0].embed_documents(texts))[0]) + len( + (await model.models[1].embed_documents(texts))[0] + ) + assert all( + len(embed) == expected_length for embed in embeddings + ), "Embeddings do not match expected combined length" + + +@pytest.mark.asyncio +async def test_embedding_model_factory_invalid_st_prefix() -> None: + """Test that the factory raises a ValueError when 'st-' prefix is provided without a model name.""" + embedding = "st-" + with pytest.raises( + ValueError, + match="SentenceTransformer model name must be specified after 'st-'.", + ): + embedding_model_factory(embedding) + + +@pytest.mark.asyncio +async def test_embedding_model_factory_unknown_prefix() -> None: + """Test that the factory defaults to LiteLLMEmbeddingModel when an unknown prefix is provided.""" + embedding = "unknown-prefix-model" + model = embedding_model_factory(embedding) + assert isinstance( + model, LiteLLMEmbeddingModel + ), "Factory did not default to LiteLLMEmbeddingModel for unknown prefix" + assert model.name == "unknown-prefix-model", "Incorrect model name assigned" + + +@pytest.mark.asyncio +async def test_embedding_model_factory_sparse() -> None: + """Test that the factory creates a SparseEmbeddingModel when 'sparse' is provided.""" + embedding = 
"sparse" + model = embedding_model_factory(embedding) + assert isinstance( + model, SparseEmbeddingModel + ), "Factory did not create SparseEmbeddingModel" + assert model.name == "sparse", "Incorrect model name assigned" + + +@pytest.mark.asyncio +async def test_embedding_model_factory_litellm() -> None: + """Test that the factory creates a LiteLLMEmbeddingModel when 'litellm-' prefix is provided.""" + embedding = "litellm-text-embedding-3-small" + model = embedding_model_factory(embedding) + assert isinstance( + model, LiteLLMEmbeddingModel + ), "Factory did not create LiteLLMEmbeddingModel" + assert model.name == "text-embedding-3-small", "Incorrect model name assigned" + + +@pytest.mark.asyncio +async def test_embedding_model_factory_default() -> None: + """Test that the factory defaults to LiteLLMEmbeddingModel when no known prefix is provided.""" + embedding = "default-model" + model = embedding_model_factory(embedding) + assert isinstance( + model, LiteLLMEmbeddingModel + ), "Factory did not default to LiteLLMEmbeddingModel" + assert model.name == "default-model", "Incorrect model name assigned" diff --git a/tests/test_llms.py b/tests/test_llms.py index 7ad7358..5d45ddc 100644 --- a/tests/test_llms.py +++ b/tests/test_llms.py @@ -1,19 +1,23 @@ import pathlib import pickle -from typing import Any -from unittest.mock import patch +from enum import StrEnum +from typing import Any, ClassVar +from unittest.mock import Mock, patch import litellm +import numpy as np import pytest +from aviary.core import Message, Tool, ToolRequestMessage +from pydantic import BaseModel -from llmclient.embeddings import ( - HybridEmbeddingModel, - LiteLLMEmbeddingModel, - SentenceTransformerEmbeddingModel, - SparseEmbeddingModel, - embedding_model_factory, +from llmclient.exceptions import JSONSchemaValidationError +from llmclient.llms import ( + Chunk, + LiteLLMModel, + MultipleCompletionLLMModel, + validate_json_completion, ) -from llmclient.llms import Chunk, LiteLLMModel +from llmclient.types import LLMResult from tests.conftest import VCR_DEFAULT_MATCH_ON @@ -158,104 +162,182 @@ def test_pickling(self, tmp_path: pathlib.Path) -> None: assert llm.router.deployment_names == rehydrated_llm.router.deployment_names -@pytest.mark.asyncio -async def test_embedding_model_factory_sentence_transformer() -> None: - """Test that the factory creates a SentenceTransformerEmbeddingModel when given an 'st-' prefix.""" - embedding = "st-multi-qa-MiniLM-L6-cos-v1" - model = embedding_model_factory(embedding) - assert isinstance( - model, SentenceTransformerEmbeddingModel - ), "Factory did not create SentenceTransformerEmbeddingModel" - assert model.name == "multi-qa-MiniLM-L6-cos-v1", "Incorrect model name assigned" - - # Test embedding functionality - texts = ["Hello world", "Test sentence"] - embeddings = await model.embed_documents(texts) - assert len(embeddings) == 2, "Incorrect number of embeddings returned" - assert all( - isinstance(embed, list) for embed in embeddings - ), "Embeddings are not in list format" - assert all(len(embed) > 0 for embed in embeddings), "Embeddings should not be empty" - - -@pytest.mark.asyncio -async def test_embedding_model_factory_hybrid_with_sentence_transformer() -> None: - """Test that the factory creates a HybridEmbeddingModel containing a SentenceTransformerEmbeddingModel.""" - embedding = "hybrid-st-multi-qa-MiniLM-L6-cos-v1" - model = embedding_model_factory(embedding) - assert isinstance( - model, HybridEmbeddingModel - ), "Factory did not create HybridEmbeddingModel" - assert 
len(model.models) == 2, "Hybrid model should contain two component models" - assert isinstance( - model.models[0], SentenceTransformerEmbeddingModel - ), "First component should be SentenceTransformerEmbeddingModel" - assert isinstance( - model.models[1], SparseEmbeddingModel - ), "Second component should be SparseEmbeddingModel" - - # Test embedding functionality - texts = ["Hello world", "Test sentence"] - embeddings = await model.embed_documents(texts) - assert len(embeddings) == 2, "Incorrect number of embeddings returned" - expected_length = len((await model.models[0].embed_documents(texts))[0]) + len( - (await model.models[1].embed_documents(texts))[0] +class CILLMModelNames(StrEnum): + """Models to use for generic CI testing.""" + + ANTHROPIC = "claude-3-haiku-20240307" # Cheap and not Anthropic's cutting edge + OPENAI = "gpt-4o-mini-2024-07-18" # Cheap and not OpenAI's cutting edge + + +class DummyOutputSchema(BaseModel): + name: str + age: int + + +class TestMultipleCompletionLLMModel: + NUM_COMPLETIONS: ClassVar[int] = 2 + DEFAULT_CONFIG: ClassVar[dict] = {"n": NUM_COMPLETIONS} + MODEL_CLS: ClassVar[type[MultipleCompletionLLMModel]] = MultipleCompletionLLMModel + + async def call_model( + self, model: MultipleCompletionLLMModel, *args, **kwargs + ) -> list[LLMResult]: + return await model.call(*args, **kwargs) + + @pytest.mark.parametrize( + "model_name", ["gpt-3.5-turbo", CILLMModelNames.ANTHROPIC.value] ) - assert all( - len(embed) == expected_length for embed in embeddings - ), "Embeddings do not match expected combined length" - - -@pytest.mark.asyncio -async def test_embedding_model_factory_invalid_st_prefix() -> None: - """Test that the factory raises a ValueError when 'st-' prefix is provided without a model name.""" - embedding = "st-" - with pytest.raises( - ValueError, - match="SentenceTransformer model name must be specified after 'st-'.", - ): - embedding_model_factory(embedding) - - -@pytest.mark.asyncio -async def test_embedding_model_factory_unknown_prefix() -> None: - """Test that the factory defaults to LiteLLMEmbeddingModel when an unknown prefix is provided.""" - embedding = "unknown-prefix-model" - model = embedding_model_factory(embedding) - assert isinstance( - model, LiteLLMEmbeddingModel - ), "Factory did not default to LiteLLMEmbeddingModel for unknown prefix" - assert model.name == "unknown-prefix-model", "Incorrect model name assigned" - - -@pytest.mark.asyncio -async def test_embedding_model_factory_sparse() -> None: - """Test that the factory creates a SparseEmbeddingModel when 'sparse' is provided.""" - embedding = "sparse" - model = embedding_model_factory(embedding) - assert isinstance( - model, SparseEmbeddingModel - ), "Factory did not create SparseEmbeddingModel" - assert model.name == "sparse", "Incorrect model name assigned" - - -@pytest.mark.asyncio -async def test_embedding_model_factory_litellm() -> None: - """Test that the factory creates a LiteLLMEmbeddingModel when 'litellm-' prefix is provided.""" - embedding = "litellm-text-embedding-3-small" - model = embedding_model_factory(embedding) - assert isinstance( - model, LiteLLMEmbeddingModel - ), "Factory did not create LiteLLMEmbeddingModel" - assert model.name == "text-embedding-3-small", "Incorrect model name assigned" - - -@pytest.mark.asyncio -async def test_embedding_model_factory_default() -> None: - """Test that the factory defaults to LiteLLMEmbeddingModel when no known prefix is provided.""" - embedding = "default-model" - model = embedding_model_factory(embedding) - assert 
isinstance( - model, LiteLLMEmbeddingModel - ), "Factory did not default to LiteLLMEmbeddingModel" - assert model.name == "default-model", "Incorrect model name assigned" + @pytest.mark.asyncio + async def test_achat(self, model_name: str) -> None: + model = MultipleCompletionLLMModel(name=model_name) + response = await model.achat( + messages=[ + Message(content="What are three things I should do today?"), + ] + ) + + assert len(response.choices) == 1 + + # Check we can iterate through the response + async for chunk in await model.achat_iter( + messages=[ + Message(content="What are three things I should do today?"), + ] + ): + assert len(chunk.choices) == 1 + + @pytest.mark.vcr(match_on=[*VCR_DEFAULT_MATCH_ON, "body"]) + @pytest.mark.parametrize("model_name", ["gpt-3.5-turbo"]) + @pytest.mark.asyncio + async def test_model(self, model_name: str) -> None: + # Make model_name an arg so that TestLLMModel can parametrize it + # only testing OpenAI, as other APIs don't support n>1 + model = self.MODEL_CLS(name=model_name, config=self.DEFAULT_CONFIG) + messages = [ + Message(role="system", content="Respond with single words."), + Message(content="Hello, how are you?"), + ] + results = await self.call_model(model, messages) + assert len(results) == self.NUM_COMPLETIONS + + for result in results: + assert result.prompt_count > 0 + assert result.completion_count > 0 + assert result.cost > 0 + assert result.logprob is None or result.logprob <= 0 + + @pytest.mark.parametrize( + "model_name", [CILLMModelNames.ANTHROPIC.value, "gpt-3.5-turbo"] + ) + @pytest.mark.asyncio + async def test_streaming(self, model_name: str) -> None: + model = self.MODEL_CLS(name=model_name, config=self.DEFAULT_CONFIG) + messages = [ + Message(role="system", content="Respond with single words."), + Message(content="Hello, how are you?"), + ] + + def callback(_) -> None: + return + + with pytest.raises( + NotImplementedError, + match="Multiple completions with callbacks is not supported", + ): + await self.call_model(model, messages, [callback]) + + @pytest.mark.vcr + @pytest.mark.asyncio + async def test_parameterizing_tool_from_arg_union(self) -> None: + def play(move: int | None) -> None: + """Play one turn by choosing a move. + + Args: + move: Choose an integer to lose, choose None to win. + """ + + results = await self.call_model( + self.MODEL_CLS(name="gpt-3.5-turbo", config=self.DEFAULT_CONFIG), + messages=[Message(content="Please win.")], + tools=[Tool.from_function(play)], + ) + assert len(results) == self.NUM_COMPLETIONS + for result in results: + assert result.messages + assert len(result.messages) == 1 + assert isinstance(result.messages[0], ToolRequestMessage) + assert result.messages[0].tool_calls + assert result.messages[0].tool_calls[0].function.arguments["move"] is None + + @pytest.mark.asyncio + @pytest.mark.vcr + async def test_output_schema(self) -> None: + model = self.MODEL_CLS(name="gpt-3.5-turbo", config=self.DEFAULT_CONFIG) + messages = [ + Message( + content=( + "My name is Claude and I am 1 year old. What is my name and age?" 
+ ) + ), + ] + results = await self.call_model(model, messages, output_type=DummyOutputSchema) + assert len(results) == self.NUM_COMPLETIONS + for result in results: + assert result.messages + assert len(result.messages) == 1 + assert result.messages[0].content + DummyOutputSchema.model_validate_json(result.messages[0].content) + + @pytest.mark.parametrize("model_name", [CILLMModelNames.OPENAI.value]) + @pytest.mark.asyncio + @pytest.mark.vcr + async def test_text_image_message(self, model_name: str) -> None: + model = self.MODEL_CLS(name=model_name, config=self.DEFAULT_CONFIG) + + # An RGB image of a red square + image = np.zeros((32, 32, 3), dtype=np.uint8) + image[:] = [255, 0, 0] # (255 red, 0 green, 0 blue) is maximum red in RGB + + results = await self.call_model( + model, + messages=[ + Message.create_message( + text="What color is this square? Respond only with the color name.", + image=image, + ) + ], + ) + assert len(results) == self.NUM_COMPLETIONS + for result in results: + assert ( + result.messages is not None + ), "Expected messages in result, but got None" + assert ( + result.messages[-1].content is not None + ), "Expected content in message, but got None" + assert "red" in result.messages[-1].content.lower() + + +def test_json_schema_validation() -> None: + # Invalid JSON + mock_completion1 = Mock() + mock_completion1.choices = [Mock()] + mock_completion1.choices[0].message.content = "not a json" + # Invalid schema + mock_completion2 = Mock() + mock_completion2.choices = [Mock()] + mock_completion2.choices[0].message.content = '{"name": "John", "age": "nan"}' + # Valid schema + mock_completion3 = Mock() + mock_completion3.choices = [Mock()] + mock_completion3.choices[0].message.content = '{"name": "John", "age": 30}' + + class DummyModel(BaseModel): + name: str + age: int + + with pytest.raises(JSONSchemaValidationError): + validate_json_completion(mock_completion1, DummyModel) + with pytest.raises(JSONSchemaValidationError): + validate_json_completion(mock_completion2, DummyModel) + validate_json_completion(mock_completion3, DummyModel) diff --git a/uv.lock b/uv.lock index b1dcc0f..01f65ad 100644 --- a/uv.lock +++ b/uv.lock @@ -585,7 +585,7 @@ wheels = [ [[package]] name = "fh-llm-client" -version = "0.1.dev37+g63f57b5.d20241126" +version = "0.1.dev26+g4e95024.d20241126" source = { editable = "." } dependencies = [ { name = "aiofiles" },
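
Usage note (illustrative only, not part of the patch above): a minimal sketch of how the new MultipleCompletionLLMModel can be driven to request several completions of the same prompt in one call. The model name, temperature, and prompt are arbitrary examples, and an OpenAI API key is assumed to be configured in the environment.

    import asyncio

    from aviary.core import Message

    from llmclient import MultipleCompletionLLMModel


    async def main() -> None:
        # config["n"] asks the provider for two completions of the same prompt;
        # call() returns one LLMResult per returned choice.
        model = MultipleCompletionLLMModel(
            name="gpt-4o-mini", config={"n": 2, "temperature": 0.5}
        )
        results = await model.call(messages=[Message(content="Name a prime number.")])
        for result in results:
            print(result.messages[0].content if result.messages else "")


    asyncio.run(main())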