From 4e950245c729792fa9ed481bcf313095c6bdf31c Mon Sep 17 00:00:00 2001 From: Mayk Caldas Date: Mon, 25 Nov 2024 18:21:40 -0800 Subject: [PATCH] Updated client (#2) * implemented tests workflow * updated llm-client accordingly to the current pqa version * solved mypy and ruff errors * removed placeholders for future features * changed the package name * updated uv.lock --------- Co-authored-by: Mayk Caldas --- .github/workflows/test.yaml | 64 ++ .gitignore | 1 + .pre-commit-config.yaml | 56 +- LICENSE | 201 ++++++ README.md | 8 +- llmclient/__init__.py | 7 + llmclient/constants.py | 29 + llmclient/embeddings.py | 272 ++++++++ llmclient/exceptions.py | 2 + llmclient/llms.py | 584 +++++++++++++++++ .../constants.py => llmclient/prompts.py | 0 llmclient/rate_limiter.py | 397 +++++++++++ src/llmclient/result.py => llmclient/types.py | 115 ++-- src/llmclient/util.py => llmclient/utils.py | 57 +- pyproject.toml | 59 +- src/llmclient/__init__.py | 7 - src/llmclient/model.py | 504 -------------- tests/__init__.py | 0 ...est_max_token_truncation[with-router].yaml | 103 +++ ..._max_token_truncation[without-router].yaml | 103 +++ ...LLMModel.test_run_prompt[with-router].yaml | 495 ++++++++++++++ ...Model.test_run_prompt[without-router].yaml | 501 ++++++++++++++ tests/conftest.py | 75 +++ tests/test_embeddings.py | 67 ++ tests/test_llms.py | 261 ++++++++ tests/test_rate_limiter.py | 297 +++++++++ uv.lock | 618 ++++++++++++------ 27 files changed, 4056 insertions(+), 827 deletions(-) create mode 100644 .github/workflows/test.yaml create mode 100644 LICENSE create mode 100644 llmclient/__init__.py create mode 100644 llmclient/constants.py create mode 100644 llmclient/embeddings.py create mode 100644 llmclient/exceptions.py create mode 100644 llmclient/llms.py rename src/llmclient/constants.py => llmclient/prompts.py (100%) create mode 100644 llmclient/rate_limiter.py rename src/llmclient/result.py => llmclient/types.py (50%) rename src/llmclient/util.py => llmclient/utils.py (54%) delete mode 100644 src/llmclient/__init__.py delete mode 100644 src/llmclient/model.py create mode 100644 tests/__init__.py create mode 100644 tests/cassettes/TestLiteLLMModel.test_max_token_truncation[with-router].yaml create mode 100644 tests/cassettes/TestLiteLLMModel.test_max_token_truncation[without-router].yaml create mode 100644 tests/cassettes/TestLiteLLMModel.test_run_prompt[with-router].yaml create mode 100644 tests/cassettes/TestLiteLLMModel.test_run_prompt[without-router].yaml create mode 100644 tests/conftest.py create mode 100644 tests/test_embeddings.py create mode 100644 tests/test_llms.py create mode 100644 tests/test_rate_limiter.py diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 0000000..da4238a --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,64 @@ +name: Lint and Test + +on: + push: + branches: [main] + pull_request: + workflow_dispatch: + +jobs: + pre-commit: + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' # pre-commit-ci/lite-action only runs here + strategy: + matrix: + python-version: [3.11, 3.12] # Our min and max supported Python versions + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # For setuptools-scm, replace with fetch-tags after https://github.com/actions/checkout/issues/1471 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - uses: pre-commit/action@v3.0.1 + - uses: pre-commit-ci/lite-action@v1.1.0 + if: always() + lint: + runs-on: ubuntu-latest + strategy: + 
matrix: + python-version: [3.11] # Our min supported Python version + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - run: uv python pin ${{ matrix.python-version }} + - uses: hynek/build-and-inspect-python-package@v2 + - run: uv sync --python-preference=only-managed + - run: uv run refurb llmclient tests + - run: uv run pylint llmclient + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.11, 3.12] # Our min and max supported Python versions + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - run: uv python pin ${{ matrix.python-version }} + - run: uv sync --python-preference=only-managed + - name: Cache datasets + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface/datasets + key: ${{ runner.os }}-datasets-${{ hashFiles('paperqa') }} + restore-keys: ${{ runner.os }}-datasets- + - run: uv run pytest -n auto + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + SEMANTIC_SCHOLAR_API_KEY: ${{ secrets.SEMANTIC_SCHOLAR_API_KEY }} + CROSSREF_API_KEY: ${{ secrets.CROSSREF_API_KEY }} diff --git a/.gitignore b/.gitignore index 01a52fb..6de69c5 100644 --- a/.gitignore +++ b/.gitignore @@ -100,6 +100,7 @@ fabric.properties !.vscode/launch.json !.vscode/extensions.json !.vscode/*.code-snippets +.vscode/ # Local History for Visual Studio Code .history/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 25dcbe1..b80c0d2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,24 +19,18 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.1 + rev: v0.8.0 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] - - id: ruff-format - repo: https://github.com/rbubley/mirrors-prettier rev: v3.3.3 hooks: - id: prettier - - repo: https://github.com/Yelp/detect-secrets - rev: v1.5.0 + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 24.10.0 hooks: - - id: detect-secrets - additional_dependencies: [".[word_list]"] - args: - - --word-list=.secrets.allowlist - - --exclude-files=.secrets.baseline$ - exclude: tests/cassettes + - id: black - repo: https://github.com/jumanjihouse/pre-commit-hooks rev: 3.0.0 hooks: @@ -48,7 +42,7 @@ repos: additional_dependencies: [".[toml]"] exclude_types: [jupyter] - repo: https://github.com/pappasam/toml-sort - rev: v0.23.1 + rev: v0.24.2 hooks: - id: toml-sort-fix - repo: https://github.com/srstevenson/nb-clean @@ -57,29 +51,45 @@ repos: - id: nb-clean args: [--preserve-cell-outputs, --remove-empty-cells] - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.22 + rev: v0.23 hooks: - id: validate-pyproject additional_dependencies: - - "validate-pyproject-schema-store[all]>=2024.08.19" # For Ruff renaming RUF025 to C420 + - "validate-pyproject-schema-store[all]>=2024.06.24" # For Ruff renaming RUF025 to C420 - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.4.29 + rev: 0.4.30 hooks: - id: uv-lock + - repo: https://github.com/jsh9/markdown-toc-creator + rev: 0.0.8 + hooks: + - id: markdown-toc-creator - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.13.0 hooks: - id: mypy + args: [--pretty, --ignore-missing-imports] additional_dependencies: - - fastapi>=0.109 # Match pyproject.toml + - aiohttp + - PyMuPDF>=1.24.12 + - anyio + - coredis + - fhaviary[llm]>=0.10.0 # Match pyproject.toml + - ldp>=0.12 # Match pyproject.toml + - html2text + - 
litellm>=1.44 # Match pyproject.toml - httpx - - litellm>=1.40.15,!=1.49.4,!=1.49.5,!=1.49.6 # Match pyproject.toml - - numpy>=1.20 # Match pyproject.toml - - openai>=1,<1.47 # Match pyproject.toml + - limits + - pybtex + - numpy + - pandas-stubs - pydantic~=2.0 # Match pyproject.toml + - pydantic-settings + - rich + - tantivy - tenacity - - torch - - types-aiofiles - - types-tqdm - - usearch>=2.13 # Match pyproject.toml - - wandb \ No newline at end of file + - tiktoken>=0.4.0 # Match pyproject.toml + - types-setuptools + - types-PyYAML + - sentence-transformers + - pyzotero diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 495d951..519b499 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,20 @@ # llm-client -Central LLM client for use by LDP and PaperQA + +Central FutureHouse LLM client library. ## Quick Start + ``` $ pip install -e . -$ uv sync && uv run pytest +$ uv sync && uv run pytest -n auto ``` ## Clients + - LLMModel - LLMResult ## Examples + - [PaperQA](https://github.com/Future-House/paper-qa/compare/main...llm-result-client) - [LDP](https://github.com/Future-House/ldp/compare/main...llm-result-client) diff --git a/llmclient/__init__.py b/llmclient/__init__.py new file mode 100644 index 0000000..e1769b2 --- /dev/null +++ b/llmclient/__init__.py @@ -0,0 +1,7 @@ +from llmclient.llms import LLMModel +from llmclient.types import LLMResult + +__all__ = [ + "LLMModel", + "LLMResult", +] diff --git a/llmclient/constants.py b/llmclient/constants.py new file mode 100644 index 0000000..3220d62 --- /dev/null +++ b/llmclient/constants.py @@ -0,0 +1,29 @@ +from sys import version_info + +import litellm + +CHARACTERS_PER_TOKEN_ASSUMPTION: float = 4.0 +EXTRA_TOKENS_FROM_USER_ROLE: int = 7 + +MODEL_COST_MAP = litellm.get_model_cost_map("") + +DEFAULT_VERTEX_SAFETY_SETTINGS: list[dict[str, str]] = [ + { + "category": "HARM_CATEGORY_HARASSMENT", + "threshold": "BLOCK_ONLY_HIGH", + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "threshold": "BLOCK_ONLY_HIGH", + }, + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "threshold": "BLOCK_ONLY_HIGH", + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "threshold": "BLOCK_ONLY_HIGH", + }, +] + +IS_PYTHON_BELOW_312 = version_info < (3, 12) diff --git a/llmclient/embeddings.py b/llmclient/embeddings.py new file mode 100644 index 0000000..2f4ca24 --- /dev/null +++ b/llmclient/embeddings.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import asyncio +from abc import ABC, abstractmethod +from enum import StrEnum +from typing import Any + +import litellm +import numpy as np +import tiktoken +from pydantic import ( + BaseModel, + Field, + field_validator, +) + +from llmclient.constants import CHARACTERS_PER_TOKEN_ASSUMPTION, MODEL_COST_MAP +from llmclient.rate_limiter import GLOBAL_LIMITER + + +def get_litellm_retrying_config(timeout: float = 60.0) -> dict[str, Any]: + """Get retrying configuration for litellm.acompletion and litellm.aembedding.""" + return 
{"num_retries": 3, "timeout": timeout} + + +class EmbeddingModes(StrEnum): + DOCUMENT = "document" + QUERY = "query" + + +class EmbeddingModel(ABC, BaseModel): + name: str + config: dict[str, Any] = Field( + default_factory=dict, + description=( + "Optional `rate_limit` key, value must be a RateLimitItem or RateLimitItem" + " string for parsing" + ), + ) + + async def check_rate_limit(self, token_count: float, **kwargs) -> None: + if "rate_limit" in self.config: + await GLOBAL_LIMITER.try_acquire( + ("client", self.name), + self.config["rate_limit"], + weight=max(int(token_count), 1), + **kwargs, + ) + + def set_mode(self, mode: EmbeddingModes) -> None: + """Several embedding models have a 'mode' or prompt which affects output.""" + + @abstractmethod + async def embed_documents(self, texts: list[str]) -> list[list[float]]: + pass + + +class LiteLLMEmbeddingModel(EmbeddingModel): + + name: str = Field(default="text-embedding-3-small") + config: dict[str, Any] = Field( + default_factory=dict, # See below field_validator for injection of kwargs + description=( + "The optional `rate_limit` key's value must be a RateLimitItem or" + " RateLimitItem string for parsing. The optional `kwargs` key is keyword" + " arguments to pass to the litellm.aembedding function. Note that LiteLLM's" + " Router is not used here." + ), + ) + + @field_validator("config", mode="before") + @classmethod + def set_up_default_config(cls, value: dict[str, Any]) -> dict[str, Any]: + if "kwargs" not in value: + value["kwargs"] = get_litellm_retrying_config( + timeout=120, # 2-min timeout seemed reasonable + ) + return value + + def _truncate_if_large(self, texts: list[str]) -> list[str]: + """Truncate texts if they are too large by using litellm cost map.""" + if self.name not in MODEL_COST_MAP: + return texts + max_tokens = MODEL_COST_MAP[self.name]["max_input_tokens"] + # heuristic about ratio of tokens to characters + conservative_char_token_ratio = 3 + maybe_too_large = max_tokens * conservative_char_token_ratio + if any(len(t) > maybe_too_large for t in texts): + try: + enct = tiktoken.encoding_for_model("cl100k_base") + enc_batch = enct.encode_ordinary_batch(texts) + return [enct.decode(t[:max_tokens]) for t in enc_batch] + except KeyError: + return [t[: max_tokens * conservative_char_token_ratio] for t in texts] + + return texts + + async def embed_documents( + self, texts: list[str], batch_size: int = 16 + ) -> list[list[float]]: + texts = self._truncate_if_large(texts) + N = len(texts) + embeddings = [] + for i in range(0, N, batch_size): + + await self.check_rate_limit( + sum( + len(t) / CHARACTERS_PER_TOKEN_ASSUMPTION + for t in texts[i : i + batch_size] + ) + ) + + response = await litellm.aembedding( + self.name, + input=texts[i : i + batch_size], + **self.config.get("kwargs", {}), + ) + embeddings.extend([e["embedding"] for e in response.data]) + + return embeddings + + +class SparseEmbeddingModel(EmbeddingModel): + """This is a very simple keyword search model - probably best to be mixed with others.""" + + name: str = "sparse" + ndim: int = 256 + enc: Any = Field(default_factory=lambda: tiktoken.get_encoding("cl100k_base")) + + async def embed_documents(self, texts) -> list[list[float]]: + enc_batch = self.enc.encode_ordinary_batch(texts) + # now get frequency of each token rel to length + return [ + np.bincount([xi % self.ndim for xi in x], minlength=self.ndim).astype(float) # type: ignore[misc] + / len(x) + for x in enc_batch + ] + + +class HybridEmbeddingModel(EmbeddingModel): + name: str = 
"hybrid-embed" + models: list[EmbeddingModel] + + async def embed_documents(self, texts): + all_embeds = await asyncio.gather( + *[m.embed_documents(texts) for m in self.models] + ) + return np.concatenate(all_embeds, axis=1) + + def set_mode(self, mode: EmbeddingModes) -> None: + # Set mode for all component models + for model in self.models: + model.set_mode(mode) + + +class SentenceTransformerEmbeddingModel(EmbeddingModel): + """An embedding model using SentenceTransformers.""" + + name: str = Field(default="multi-qa-MiniLM-L6-cos-v1") + config: dict[str, Any] = Field(default_factory=dict) + _model: Any = None + + def __init__(self, **kwargs): + super().__init__(**kwargs) + try: + from sentence_transformers import SentenceTransformer + except ImportError as exc: + raise ImportError( + "Please install fh-llm-client[local] to use" + " SentenceTransformerEmbeddingModel." + ) from exc + + self._model = SentenceTransformer(self.name) + + def set_mode(self, mode: EmbeddingModes) -> None: + # SentenceTransformer does not support different modes. + pass + + async def embed_documents(self, texts: list[str]) -> list[list[float]]: + """ + Asynchronously embed a list of documents using SentenceTransformer. + + Args: + texts: A list of text documents to embed. + + Returns: + A list of embedding vectors. + """ + # Extract additional configurations if needed + batch_size = self.config.get("batch_size", 32) + device = self.config.get("device", "cpu") + + # Update the model's device if necessary + if device: + self._model.to(device) + + # Run the synchronous encode method in a thread pool to avoid blocking the event loop. + embeddings = await asyncio.to_thread( + lambda: self._model.encode( + texts, + convert_to_numpy=True, + show_progress_bar=False, # Disabled progress bar + batch_size=batch_size, + device=device, + ), + ) + # If embeddings are returned as numpy arrays, convert them to lists. + if isinstance(embeddings, np.ndarray): + embeddings = embeddings.tolist() + return embeddings + + +def embedding_model_factory(embedding: str, **kwargs) -> EmbeddingModel: + """ + Factory function to create an appropriate EmbeddingModel based on the embedding string. + + Supports: + - SentenceTransformer models prefixed with "st-" (e.g., "st-multi-qa-MiniLM-L6-cos-v1") + - LiteLLM models (default if no prefix is provided) + - Hybrid embeddings prefixed with "hybrid-", contains a sparse and a dense model + + Args: + embedding: The embedding model identifier. Supports prefixes like "st-" for SentenceTransformer + and "hybrid-" for combining multiple embedding models. + **kwargs: Additional keyword arguments for the embedding model. + """ + embedding = embedding.strip() # Remove any leading/trailing whitespace + + if embedding.startswith("hybrid-"): + # Extract the component embedding identifiers after "hybrid-" + dense_name = embedding[len("hybrid-") :] + + if not dense_name: + raise ValueError( + "Hybrid embedding must contain at least one component embedding." + ) + + # Recursively create each component embedding model + dense_model = embedding_model_factory(dense_name, **kwargs) + sparse_model = SparseEmbeddingModel(**kwargs) + + return HybridEmbeddingModel(models=[dense_model, sparse_model]) + + if embedding.startswith("st-"): + # Extract the SentenceTransformer model name after "st-" + model_name = embedding[len("st-") :].strip() + if not model_name: + raise ValueError( + "SentenceTransformer model name must be specified after 'st-'." 
+ ) + + return SentenceTransformerEmbeddingModel( + name=model_name, + config=kwargs, + ) + + if embedding.startswith("litellm-"): + # Extract the LiteLLM model name after "litellm-" + model_name = embedding[len("litellm-") :].strip() + if not model_name: + raise ValueError("model name must be specified after 'litellm-'.") + + return LiteLLMEmbeddingModel( + name=model_name, + config=kwargs, + ) + + if embedding == "sparse": + return SparseEmbeddingModel(**kwargs) + + # Default to LiteLLMEmbeddingModel if no special prefix is found + return LiteLLMEmbeddingModel(name=embedding, config=kwargs) diff --git a/llmclient/exceptions.py b/llmclient/exceptions.py new file mode 100644 index 0000000..aea488d --- /dev/null +++ b/llmclient/exceptions.py @@ -0,0 +1,2 @@ +class JSONSchemaValidationError(ValueError): + """Raised when the completion does not match the specified schema.""" diff --git a/llmclient/llms.py b/llmclient/llms.py new file mode 100644 index 0000000..aae4e89 --- /dev/null +++ b/llmclient/llms.py @@ -0,0 +1,584 @@ +import asyncio +import contextlib +import functools +from abc import ABC +from collections.abc import ( + AsyncGenerator, + AsyncIterable, + AsyncIterator, + Awaitable, + Callable, + Iterable, +) +from inspect import isasyncgenfunction, signature +from typing import ( + Any, + TypeVar, + cast, +) + +import litellm +from aviary.core import ( + ToolRequestMessage, + ToolSelector, +) +from pydantic import ( + BaseModel, + ConfigDict, + Field, + TypeAdapter, + ValidationError, + model_validator, +) + +from llmclient.constants import ( + CHARACTERS_PER_TOKEN_ASSUMPTION, + DEFAULT_VERTEX_SAFETY_SETTINGS, + EXTRA_TOKENS_FROM_USER_ROLE, + IS_PYTHON_BELOW_312, +) +from llmclient.exceptions import JSONSchemaValidationError +from llmclient.prompts import default_system_prompt +from llmclient.rate_limiter import GLOBAL_LIMITER +from llmclient.types import Chunk, LLMResult +from llmclient.utils import is_coroutine_callable + +if not IS_PYTHON_BELOW_312: + _DeploymentTypedDictValidator = TypeAdapter( + list[litellm.DeploymentTypedDict], + config=ConfigDict(arbitrary_types_allowed=True), + ) + + +def sum_logprobs(choice: litellm.utils.Choices) -> float | None: + """Calculate the sum of the log probabilities of an LLM completion (a Choices object). + + Args: + choice: A sequence of choices from the completion. + + Returns: + The sum of the log probabilities of the choice. + """ + try: + logprob_obj = choice.logprobs + except AttributeError: + return None + if isinstance(logprob_obj, dict): + if logprob_obj.get("content"): + return sum( + logprob_info["logprob"] for logprob_info in logprob_obj["content"] + ) + elif choice.logprobs.content: + return sum(logprob_info.logprob for logprob_info in choice.logprobs.content) + return None + + +def validate_json_completion( + completion: litellm.ModelResponse, output_type: type[BaseModel] +) -> None: + """Validate a completion against a JSON schema. + + Args: + completion: The completion to validate. + output_type: The Pydantic model to validate the completion against. 
+ """ + try: + for choice in completion.choices: + if not hasattr(choice, "message") or not choice.message.content: + continue + # make sure it is a JSON completion, even if None + # We do want to modify the underlying message + # so that users of it can just parse it as expected + choice.message.content = ( + choice.message.content.split("```json")[-1].split("```")[0] or "" + ) + output_type.model_validate_json(choice.message.content) + except ValidationError as err: + raise JSONSchemaValidationError( + "The completion does not match the specified schema." + ) from err + + +def prepare_args(func: Callable, chunk: str, name: str | None) -> tuple[tuple, dict]: + with contextlib.suppress(TypeError): + if "name" in signature(func).parameters: + return (chunk,), {"name": name} + return (chunk,), {} + + +async def do_callbacks( + async_callbacks: Iterable[Callable[..., Awaitable]], + sync_callbacks: Iterable[Callable[..., Any]], + chunk: str, + name: str | None, +) -> None: + for f in async_callbacks: + args, kwargs = prepare_args(f, chunk, name) + await f(*args, **kwargs) + for f in sync_callbacks: + args, kwargs = prepare_args(f, chunk, name) + f(*args, **kwargs) + + +def get_litellm_retrying_config(timeout: float = 60.0) -> dict[str, Any]: + """Get retrying configuration for litellm.acompletion and litellm.aembedding.""" + return {"num_retries": 3, "timeout": timeout} + + +class LLMModel(ABC, BaseModel): + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) + + llm_type: str | None = None + name: str + llm_result_callback: ( + Callable[[LLMResult], None] | Callable[[LLMResult], Awaitable[None]] | None + ) = Field( + default=None, + description=( + "An async callback that will be executed on each" + " LLMResult (different than callbacks that execute on each chunk)" + ), + exclude=True, + ) + config: dict = Field(default_factory=dict) + + async def acomplete(self, prompt: str) -> Chunk: + """Return the completion as string and the number of tokens in the prompt and completion.""" + raise NotImplementedError + + async def acomplete_iter(self, prompt: str) -> AsyncIterable[Chunk]: + """Return an async generator that yields chunks of the completion. + + Only the last tuple will be non-zero. + """ + raise NotImplementedError + if False: # type: ignore[unreachable] # pylint: disable=using-constant-test + yield # Trick mypy: https://github.com/python/mypy/issues/5070#issuecomment-1050834495 + + async def achat(self, messages: Iterable[dict[str, str]]) -> Chunk: + """Return the completion as string and the number of tokens in the prompt and completion.""" + raise NotImplementedError + + async def achat_iter( + self, messages: Iterable[dict[str, str]] + ) -> AsyncIterable[Chunk]: + """Return an async generator that yields chunks of the completion. + + Only the last tuple will be non-zero. 
+ """ + raise NotImplementedError + if False: # type: ignore[unreachable] # pylint: disable=using-constant-test + yield # Trick mypy: https://github.com/python/mypy/issues/5070#issuecomment-1050834495 + + def infer_llm_type(self) -> str: + return "completion" + + def count_tokens(self, text: str) -> int: + return len(text) // 4 # gross approximation + + async def run_prompt( + self, + prompt: str, + data: dict, + callbacks: list[Callable] | None = None, + name: str | None = None, + system_prompt: str | None = default_system_prompt, + ) -> LLMResult: + if self.llm_type is None: + self.llm_type = self.infer_llm_type() + if self.llm_type == "chat": + return await self._run_chat(prompt, data, callbacks, name, system_prompt) + if self.llm_type == "completion": + return await self._run_completion( + prompt, data, callbacks, name, system_prompt + ) + raise ValueError(f"Unknown llm_type {self.llm_type!r}.") + + async def _run_chat( + self, + prompt: str, + data: dict, + callbacks: list[Callable] | None = None, + name: str | None = None, + system_prompt: str | None = default_system_prompt, + ) -> LLMResult: + """Run a chat prompt. + + Args: + prompt: Prompt to use. + data: Keys for the input variables that will be formatted into prompt. + callbacks: Optional functions to call with each chunk of the completion. + name: Optional name for the result. + system_prompt: System prompt to use, or None/empty string to not use one. + + Returns: + Result of the chat. + """ + human_message_prompt = {"role": "user", "content": prompt} + messages = [ + {"role": m["role"], "content": m["content"].format(**data)} + for m in ( + [{"role": "system", "content": system_prompt}, human_message_prompt] + if system_prompt + else [human_message_prompt] + ) + ] + result = LLMResult( + model=self.name, + name=name, + prompt=messages, + prompt_count=( + sum(self.count_tokens(m["content"]) for m in messages) + + sum(self.count_tokens(m["role"]) for m in messages) + ), + ) + + start_clock = asyncio.get_running_loop().time() + if callbacks is None: + chunk = await self.achat(messages) + output = chunk.text + else: + sync_callbacks = [f for f in callbacks if not is_coroutine_callable(f)] + async_callbacks = [f for f in callbacks if is_coroutine_callable(f)] + completion = await self.achat_iter(messages) # type: ignore[misc] + text_result = [] + async for chunk in completion: + if chunk.text: + if result.seconds_to_first_token == 0: + result.seconds_to_first_token = ( + asyncio.get_running_loop().time() - start_clock + ) + text_result.append(chunk.text) + await do_callbacks( + async_callbacks, sync_callbacks, chunk.text, name + ) + output = "".join(text_result) + usage = chunk.prompt_tokens, chunk.completion_tokens + if sum(usage) > 0: + result.prompt_count, result.completion_count = usage + elif output: + result.completion_count = self.count_tokens(output) + result.text = output or "" + result.seconds_to_last_token = asyncio.get_running_loop().time() - start_clock + if self.llm_result_callback: + if is_coroutine_callable(self.llm_result_callback): + await self.llm_result_callback(result) # type: ignore[misc] + else: + self.llm_result_callback(result) + return result + + async def _run_completion( + self, + prompt: str, + data: dict, + callbacks: Iterable[Callable] | None = None, + name: str | None = None, + system_prompt: str | None = default_system_prompt, + ) -> LLMResult: + """Run a completion prompt. + + Args: + prompt: Prompt to use. + data: Keys for the input variables that will be formatted into prompt. 
+ callbacks: Optional functions to call with each chunk of the completion. + name: Optional name for the result. + system_prompt: System prompt to use, or None/empty string to not use one. + + Returns: + Result of the completion. + """ + formatted_prompt: str = ( + system_prompt + "\n\n" + prompt if system_prompt else prompt + ).format(**data) + result = LLMResult( + model=self.name, + name=name, + prompt=formatted_prompt, + prompt_count=self.count_tokens(formatted_prompt), + ) + + start_clock = asyncio.get_running_loop().time() + if callbacks is None: + chunk = await self.acomplete(formatted_prompt) + output = chunk.text + else: + sync_callbacks = [f for f in callbacks if not is_coroutine_callable(f)] + async_callbacks = [f for f in callbacks if is_coroutine_callable(f)] + + completion = self.acomplete_iter(formatted_prompt) + text_result = [] + async for chunk in completion: + if chunk.text: + if result.seconds_to_first_token == 0: + result.seconds_to_first_token = ( + asyncio.get_running_loop().time() - start_clock + ) + text_result.append(chunk.text) + await do_callbacks( + async_callbacks, sync_callbacks, chunk.text, name + ) + output = "".join(text_result) + usage = chunk.prompt_tokens, chunk.completion_tokens + if sum(usage) > 0: + result.prompt_count, result.completion_count = usage + elif output: + result.completion_count = self.count_tokens(output) + result.text = output or "" + result.seconds_to_last_token = asyncio.get_running_loop().time() - start_clock + if self.llm_result_callback: + if is_coroutine_callable(self.llm_result_callback): + await self.llm_result_callback(result) # type: ignore[misc] + else: + self.llm_result_callback(result) + return result + + +LLMModelOrChild = TypeVar("LLMModelOrChild", bound=LLMModel) + + +def rate_limited( + func: Callable[[LLMModelOrChild, Any], Awaitable[Chunk] | AsyncIterable[Chunk]], +) -> Callable[ + [LLMModelOrChild, Any, Any], + Awaitable[Chunk | AsyncIterator[Chunk] | AsyncIterator[LLMModelOrChild]], +]: + """Decorator to rate limit relevant methods of an LLMModel.""" + + @functools.wraps(func) + async def wrapper( + self: LLMModelOrChild, *args: Any, **kwargs: Any + ) -> Chunk | AsyncIterator[Chunk] | AsyncIterator[LLMModelOrChild]: + + if not hasattr(self, "check_rate_limit"): + raise NotImplementedError( + f"Model {self.name} must have a `check_rate_limit` method." 
+ ) + + # Estimate token count based on input + if func.__name__ in {"acomplete", "acomplete_iter"}: + prompt = args[0] if args else kwargs.get("prompt", "") + token_count = ( + len(prompt) / CHARACTERS_PER_TOKEN_ASSUMPTION + + EXTRA_TOKENS_FROM_USER_ROLE + ) + elif func.__name__ in {"achat", "achat_iter"}: + messages = args[0] if args else kwargs.get("messages", []) + token_count = len(str(messages)) / CHARACTERS_PER_TOKEN_ASSUMPTION + else: + token_count = 0 # Default if method is unknown + + await self.check_rate_limit(token_count) + + # If wrapping a generator, count the tokens for each + # portion before yielding + if isasyncgenfunction(func): + + async def rate_limited_generator() -> AsyncGenerator[LLMModelOrChild, None]: + async for item in func(self, *args, **kwargs): + token_count = 0 + if isinstance(item, Chunk): + token_count = int( + len(item.text or "") / CHARACTERS_PER_TOKEN_ASSUMPTION + ) + await self.check_rate_limit(token_count) + yield item + + return rate_limited_generator() + + result = await func(self, *args, **kwargs) # type: ignore[misc] + + if func.__name__ in {"acomplete", "achat"} and isinstance(result, Chunk): + await self.check_rate_limit(result.completion_tokens) + return result + + return wrapper + + +class PassThroughRouter(litellm.Router): # TODO: add rate_limited + """Router that is just a wrapper on LiteLLM's normal free functions.""" + + def __init__(self, **kwargs): + self._default_kwargs = kwargs + + async def atext_completion(self, *args, **kwargs): + return await litellm.atext_completion(*args, **(self._default_kwargs | kwargs)) + + async def acompletion(self, *args, **kwargs): + return await litellm.acompletion(*args, **(self._default_kwargs | kwargs)) + + +class LiteLLMModel(LLMModel): + """A wrapper around the litellm library.""" + + config: dict = Field( + default_factory=dict, + description=( + "Configuration of this model containing several important keys. The" + " optional `model_list` key stores a list of all model configurations" + " (SEE: https://docs.litellm.ai/docs/routing). The optional" + " `router_kwargs` key is keyword arguments to pass to the Router class." + " Inclusion of a key `pass_through_router` with a truthy value will lead" + " to using not using LiteLLM's Router, instead just LiteLLM's free" + f" functions (see {PassThroughRouter.__name__}). Rate limiting applies" + " regardless of `pass_through_router` being present. The optional" + " `rate_limit` key is a dictionary keyed by model group name with values" + " of type limits.RateLimitItem (in tokens / minute) or valid" + " limits.RateLimitItem string for parsing." 
+ ), + ) + name: str = "gpt-4o-mini" + _router: litellm.Router | None = None + + @model_validator(mode="before") + @classmethod + def maybe_set_config_attribute(cls, data: dict[str, Any]) -> dict[str, Any]: + """If a user only gives a name, make a sensible config dict for them.""" + if "config" not in data: + data["config"] = {} + if "name" in data and "model_list" not in data["config"]: + data["config"] = { + "model_list": [ + { + "model_name": data["name"], + "litellm_params": {"model": data["name"]} + | ( + {} + if "gemini" not in data["name"] + else {"safety_settings": DEFAULT_VERTEX_SAFETY_SETTINGS} + ), + } + ], + } | data["config"] + + if "router_kwargs" not in data["config"]: + data["config"]["router_kwargs"] = {} + data["config"]["router_kwargs"] = ( + get_litellm_retrying_config() | data["config"]["router_kwargs"] + ) + if not data["config"].get("pass_through_router"): + data["config"]["router_kwargs"] = {"retry_after": 5} | data["config"][ + "router_kwargs" + ] + + # we only support one "model name" for now, here we validate + model_list = data["config"]["model_list"] + if IS_PYTHON_BELOW_312: + if not isinstance(model_list, list): + # Work around https://github.com/BerriAI/litellm/issues/5664 + raise TypeError(f"model_list must be a list, not a {type(model_list)}.") + else: + # pylint: disable-next=possibly-used-before-assignment + _DeploymentTypedDictValidator.validate_python(model_list) + if len({m["model_name"] for m in model_list}) > 1: + raise ValueError("Only one model name per model list is supported for now.") + return data + + def __getstate__(self): + # Prevent _router from being pickled, SEE: https://stackoverflow.com/a/2345953 + state = super().__getstate__() + state["__dict__"] = state["__dict__"].copy() + state["__dict__"].pop("_router", None) + return state + + @property + def router(self) -> litellm.Router: + if self._router is None: + router_kwargs: dict = self.config.get("router_kwargs", {}) + if self.config.get("pass_through_router"): + self._router = PassThroughRouter(**router_kwargs) + else: + self._router = litellm.Router( + model_list=self.config["model_list"], **router_kwargs + ) + return self._router + + async def check_rate_limit(self, token_count: float, **kwargs) -> None: + if "rate_limit" in self.config: + await GLOBAL_LIMITER.try_acquire( + ("client", self.name), + self.config["rate_limit"].get(self.name, None), + weight=max(int(token_count), 1), + **kwargs, + ) + + @rate_limited + async def acomplete(self, prompt: str) -> Chunk: # type: ignore[override] + response = await self.router.atext_completion(model=self.name, prompt=prompt) + return Chunk( + text=response.choices[0].text, + prompt_tokens=response.usage.prompt_tokens, + completion_tokens=response.usage.completion_tokens, + ) + + @rate_limited + async def acomplete_iter( # type: ignore[override] + self, prompt: str + ) -> AsyncIterable[Chunk]: + completion = await self.router.atext_completion( + model=self.name, + prompt=prompt, + stream=True, + stream_options={"include_usage": True}, + ) + async for chunk in completion: + yield Chunk( + text=chunk.choices[0].text, prompt_tokens=0, completion_tokens=0 + ) + if hasattr(chunk, "usage") and hasattr(chunk.usage, "prompt_tokens"): + yield Chunk( + text=chunk.choices[0].text, prompt_tokens=0, completion_tokens=0 + ) + + @rate_limited + async def achat( # type: ignore[override] + self, messages: Iterable[dict[str, str]] + ) -> Chunk: + response = await self.router.acompletion(self.name, list(messages)) + return Chunk( + text=cast(litellm.Choices, 
response.choices[0]).message.content, + prompt_tokens=response.usage.prompt_tokens, # type: ignore[attr-defined] + completion_tokens=response.usage.completion_tokens, # type: ignore[attr-defined] + ) + + @rate_limited + async def achat_iter( # type: ignore[override] + self, messages: Iterable[dict[str, str]] + ) -> AsyncIterable[Chunk]: + completion = await self.router.acompletion( + self.name, + list(messages), + stream=True, + stream_options={"include_usage": True}, + ) + async for chunk in completion: + yield Chunk( + text=chunk.choices[0].delta.content, + prompt_tokens=0, + completion_tokens=0, + ) + if hasattr(chunk, "usage") and hasattr(chunk.usage, "prompt_tokens"): + yield Chunk( + text=None, + prompt_tokens=chunk.usage.prompt_tokens, + completion_tokens=chunk.usage.completion_tokens, + ) + + def infer_llm_type(self) -> str: + if all( + "text-completion" in m.get("litellm_params", {}).get("model", "") + for m in self.config["model_list"] + ): + return "completion" + return "chat" + + def count_tokens(self, text: str) -> int: + return litellm.token_counter(model=self.name, text=text) + + async def select_tool( + self, *selection_args, **selection_kwargs + ) -> ToolRequestMessage: + """Shim to aviary.core.ToolSelector that supports tool schemae.""" + tool_selector = ToolSelector( + model_name=self.name, acompletion=self.router.acompletion + ) + return await tool_selector(*selection_args, **selection_kwargs) diff --git a/src/llmclient/constants.py b/llmclient/prompts.py similarity index 100% rename from src/llmclient/constants.py rename to llmclient/prompts.py diff --git a/llmclient/rate_limiter.py b/llmclient/rate_limiter.py new file mode 100644 index 0000000..8afbe3d --- /dev/null +++ b/llmclient/rate_limiter.py @@ -0,0 +1,397 @@ +import asyncio +import logging +import os +from collections.abc import Collection +from typing import ClassVar, Literal +from urllib.parse import urlparse + +import aiohttp +from coredis import Redis +from limits import ( + RateLimitItem, + RateLimitItemPerMinute, + RateLimitItemPerSecond, +) +from limits import ( + parse as limit_parse, +) +from limits.aio.storage import MemoryStorage, RedisStorage +from limits.aio.strategies import MovingWindowRateLimiter + +logger = logging.getLogger(__name__) + + +SEMANTIC_SCHOLAR_HOST = "api.semanticscholar.org" +SEMANTIC_SCHOLAR_BASE_URL = f"https://{SEMANTIC_SCHOLAR_HOST}" + + +CROSSREF_HOST = "api.crossref.org" +CROSSREF_BASE_URL = f"https://{CROSSREF_HOST}" + +GLOBAL_RATE_LIMITER_TIMEOUT = float(os.environ.get("RATE_LIMITER_TIMEOUT", "60")) + +MATCH_ALL = None +MatchAllInputs = Literal[None] +MATCH_MACHINE_ID = "" + +FALLBACK_RATE_LIMIT = RateLimitItemPerSecond(3, 1) +TOKEN_FALLBACK_RATE_LIMIT = RateLimitItemPerMinute(30_000, 1) + +# RATE_CONFIG keys are tuples, corresponding to a namespace and primary key. +# Anything defined with MATCH_ALL variable, will match all non-matched requests for that namespace. +# For the "get" namespace, all primary key urls will be parsed down to the domain level. +# For example, you're trying to do a get request to "https://google.com", "google.com" will get +# its own limit, and it will use the ("get", MATCH_ALL) for its limits. +# machine_id is a unique identifier for the machine making the request, it's used to limit the +# rate of requests per machine. If the primary_key is in the NO_MACHINE_ID_EXTENSIONS list, then +# the dynamic IP of the machine will be used to limit the rate of requests, otherwise the +# user input machine_id will be used. 
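+# For example, a hypothetical entry ("get", "https://api.example.org"):
+# RateLimitItemPerSecond(5, 1) would cap GET requests to that host at 5 per
+# second (matched at the domain level as described above), while unmatched
+# GET requests fall back to the (f"get|{MATCH_MACHINE_ID}", MATCH_ALL) entry below.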
+ +RATE_CONFIG: dict[tuple[str, str | MatchAllInputs], RateLimitItem] = { + ("get", CROSSREF_BASE_URL): RateLimitItemPerSecond(30, 1), + ("get", SEMANTIC_SCHOLAR_BASE_URL): RateLimitItemPerSecond(15, 1), + ("client", MATCH_ALL): TOKEN_FALLBACK_RATE_LIMIT, + # MATCH_MACHINE_ID is a sentinel for the machine_id passed in by the caller + (f"get|{MATCH_MACHINE_ID}", MATCH_ALL): FALLBACK_RATE_LIMIT, +} + +UNKNOWN_IP: str = "0.0.0.0" # noqa: S104 + + +class GlobalRateLimiter: + """Rate limiter for all requests within or between processes. + + Supports both Redis and in-memory storage. + 'Global' refers to being able to limit the rate + of requests across processes with Redis. + """ + + WAIT_INCREMENT: ClassVar[float] = 0.01 # seconds + # list of public free outbount IP services + # generated initially w. claude, then filtered + IP_CHECK_SERVICES: ClassVar[Collection[str]] = { + "https://api.ipify.org", + "https://ifconfig.me", + "http://icanhazip.com", + "https://ipecho.net/plain", + } + # the following will use IP scope for limiting, rather + # than user input machine ID + NO_MACHINE_ID_EXTENSIONS: ClassVar[Collection[str]] = {"crossref.org"} + + def __init__( + self, + rate_config: ( + None | dict[tuple[str, str | MatchAllInputs], RateLimitItem] + ) = None, + use_in_memory: bool = False, + ): + self.rate_config = RATE_CONFIG if rate_config is None else rate_config + self.use_in_memory = use_in_memory + self._storage: RedisStorage | MemoryStorage | None = None + self._rate_limiter: MovingWindowRateLimiter | None = None + self._current_ip: str | None = None + + @staticmethod + async def get_outbound_ip(session: aiohttp.ClientSession, url: str) -> str | None: + try: + async with session.get(url, timeout=aiohttp.ClientTimeout(5)) as response: + if response.ok: + return await response.text() + except TimeoutError: + logger.warning(f"Timeout occurred while connecting to {url}") + except aiohttp.ClientError: + logger.warning(f"Error occurred while connecting to {url}.", exc_info=True) + return None + + async def outbount_ip(self) -> str: + if self._current_ip is None: + async with aiohttp.ClientSession() as session: + for service in self.IP_CHECK_SERVICES: + ip = await self.get_outbound_ip(session, service) + if ip: + logger.info(f"Successfully retrieved IP from {service}") + self._current_ip = ip.strip() + break + if self._current_ip is None: + logger.error("Failed to retrieve IP from all services") + self._current_ip = UNKNOWN_IP + return self._current_ip + + @property + def storage(self) -> RedisStorage | MemoryStorage: + if self._storage is None: + if os.environ.get("REDIS_URL") and not self.use_in_memory: + self._storage = RedisStorage(f"async+redis://{os.environ['REDIS_URL']}") + logger.info("Connected to redis instance for rate limiting.") + else: + self._storage = MemoryStorage() + logger.info("Using in-memory rate limiter.") + + return self._storage + + @property + def rate_limiter(self) -> MovingWindowRateLimiter: + if self._rate_limiter is None: + self._rate_limiter = MovingWindowRateLimiter(self.storage) + return self._rate_limiter + + async def parse_namespace_and_primary_key( + self, namespace_and_key: tuple[str, str], machine_id: int = 0 + ) -> tuple[str, str]: + """Turn namespace_and_key tuple into a namespace and primary-key. + + If using a namespace starting with "get", then the primary key will be url parsed. 
+ "get" namespaces will also have their machine_ids appended to the namespace here, + unless the primary key is in the NO_MACHINE_ID_EXTENSIONS list, in which case + the outbound IP will be used. + """ + namespace, primary_key = namespace_and_key + + if namespace.startswith("get") and primary_key is not None: + # for URLs to be parsed correctly, they need a protocol + if not primary_key.startswith(("http://", "https://")): + primary_key = "https://" + primary_key + + primary_key = urlparse(primary_key).netloc or urlparse(primary_key).path + + if any(ext in primary_key for ext in self.NO_MACHINE_ID_EXTENSIONS): + namespace = f"{namespace}|{await self.outbount_ip()}" + else: + namespace = f"{namespace}|{machine_id}" + + return namespace, primary_key + + def parse_rate_limits_and_namespace( + self, + namespace: str, + primary_key: str | MatchAllInputs, + ) -> tuple[RateLimitItem, str]: + """Get rate limit and new namespace for a given namespace and primary_key. + + This parsing logic finds the correct rate limits for a namespace/primary_key. + It's a bit complex due to the and placeholders. + These allow users to match + + """ + # the namespace may have a machine_id in it -- we replace if that's the case + namespace_w_stub_machine_id = namespace + namespace_w_machine_id_stripped = namespace + + # strip off the machine_id, and replace it with the MATCH_MACHINE_ID placeholder + if namespace.startswith("get"): + machine_id = namespace.split("|")[-1] + if machine_id != "get": + namespace_w_stub_machine_id = namespace.replace( + machine_id, MATCH_MACHINE_ID, 1 + ) + # try stripping the machine id for the namespace for shared limits + # i.e. matching to one rate limit across ALL machines + # these limits are in RATE_CONFIG WITHOUT a MATCH_MACHINE_ID placeholder + namespace_w_machine_id_stripped = "|".join(namespace.split("|")[:-1]) + + # here we want to use namespace_w_machine_id_stripped -- the rate should be shared + # this needs to be checked first, since it's more specific than the stub machine id + if (namespace_w_machine_id_stripped, primary_key) in self.rate_config: + return ( + self.rate_config[(namespace_w_machine_id_stripped, primary_key)], + namespace_w_machine_id_stripped, + ) + # we keep the old namespace if we match on the namespace_w_stub_machine_id + if (namespace_w_stub_machine_id, primary_key) in self.rate_config: + return ( + self.rate_config[(namespace_w_stub_machine_id, primary_key)], + namespace, + ) + # again we only want the original namespace, keep the old namespace + if (namespace_w_stub_machine_id, MATCH_ALL) in self.rate_config: + return ( + self.rate_config[(namespace_w_stub_machine_id, MATCH_ALL)], + namespace, + ) + # again we want to use the stripped namespace if it matches + if (namespace_w_machine_id_stripped, MATCH_ALL) in self.rate_config: + return ( + self.rate_config[(namespace_w_machine_id_stripped, MATCH_ALL)], + namespace_w_machine_id_stripped, + ) + return FALLBACK_RATE_LIMIT, namespace + + def parse_key( + self, key: str + ) -> tuple[RateLimitItem, tuple[str, str | MatchAllInputs]]: + """Parse the rate limit item from a redis/in-memory key. + + Args: + key (str): is created with RateLimitItem.key_for(*identifiers), + the first key is the namespace, then the next two will be our identifiers. 
+ + """ + namespace, primary_key = key.split("/")[1:3] + rate_limit, new_namespace = self.parse_rate_limits_and_namespace( + namespace, primary_key + ) + return ( + rate_limit, + (new_namespace, primary_key), + ) + + async def get_rate_limit_keys( + self, cursor_scan_count: int = 100 + ) -> list[tuple[RateLimitItem, tuple[str, str | MatchAllInputs]]]: + """Returns a list of current RateLimitItems with tuples of namespace and primary key.""" + host, port = os.environ.get("REDIS_URL", ":").split(":", maxsplit=2) + + if not (host and port): + raise ValueError(f'Invalid REDIS_URL: {os.environ.get("REDIS_URL")}.') + + if not isinstance(self.storage, RedisStorage): + raise NotImplementedError( + "get_rate_limit_keys only works with RedisStorage." + ) + + client = Redis(host=host, port=int(port)) + + try: + cursor: int | bytes = b"0" + matching_keys: list[bytes] = [] + while cursor: + cursor, keys = await client.scan( + int(cursor), + match=f"{self.storage.PREFIX}*", + count=cursor_scan_count, + ) + matching_keys.extend(list(keys)) + finally: + await client.quit() + + return [self.parse_key(key.decode()) for key in matching_keys] + + def get_in_memory_limit_keys( + self, + ) -> list[tuple[RateLimitItem, tuple[str, str | MatchAllInputs]]]: + """Returns a list of current RateLimitItems with tuples of namespace and primary key.""" + if not isinstance(self.storage, MemoryStorage): + raise NotImplementedError( + "get_in_memory_limit_keys only works with MemoryStorage." + ) + return [self.parse_key(key) for key in self.storage.events] + + async def get_limit_keys( + self, + ) -> list[tuple[RateLimitItem, tuple[str, str | MatchAllInputs]]]: + if os.environ.get("REDIS_URL") and not self.use_in_memory: + return await self.get_rate_limit_keys() + return self.get_in_memory_limit_keys() + + async def rate_limit_status(self): + + limit_status = {} + + for rate_limit, (namespace, primary_key) in await self.get_limit_keys(): + period_start, n_items_in_period = await self.storage.get_moving_window( + rate_limit.key_for(*(namespace, primary_key or "")), + rate_limit.amount, + rate_limit.get_expiry(), + ) + limit_status[(namespace, primary_key)] = { + "period_start": period_start, + "n_items_in_period": n_items_in_period, + "period_seconds": rate_limit.GRANULARITY.seconds, + "period_name": rate_limit.GRANULARITY.name, + "period_cap": rate_limit.amount, + } + return limit_status + + async def try_acquire( + self, + namespace_and_key: tuple[str, str], + rate_limit: RateLimitItem | str | None = None, + machine_id: int = 0, + acquire_timeout: float = GLOBAL_RATE_LIMITER_TIMEOUT, + weight: int = 1, + raise_impossible_limits: bool = False, + ) -> None: + """Returns when the limit is satisfied for the namespace_and_key. + + Args: + namespace_and_key (:obj:`tuple[str, str]`): is + composed of a tuple with namespace (e.g. "get") and a primary-key + (e.g. "arxiv.org"). namespaces can be nested with multiple '|', + primary-keys in the "get" namespace will be stripped to the domain. + rate_limit (:obj:`RateLimitItem | str | None`, optional): Optional + RateLimitItem to be used for the namespace and primary-key. + If not provided, RATE_CONFIG will be used to find the rate limit. + Can also use a string of the form: + [count] [per|/] [n (optional)] [second|minute|hour|day|month|year] + machine_id (:obj:`int`, optional): will be used to modify the namespace + of GET requests if the primary key is not in the + NO_MACHINE_ID_EXTENSIONS list. In that case, the outbound IP will be + used to modify the namespace. 
+ acquire_timeout (:obj:`float`, optional): is the maximum time (in seconds) to + wait for the rate limit to be satisfied. + weight (:obj:`int`, optional): is the cost of the request, + default is 1. (could be tokens for example) + raise_impossible_limits (:obj:`bool`, optional): flag will raise a + ValueError for weights that exceed the rate. + + Raises: + TimeoutError: if the acquire_timeout is exceeded. + ValueError: if the weight exceeds the rate limit and raise_impossible_limits is True. + """ + namespace, primary_key = await self.parse_namespace_and_primary_key( + namespace_and_key, machine_id=machine_id + ) + + _rate_limit, new_namespace = self.parse_rate_limits_and_namespace( + namespace, primary_key + ) + + if isinstance(rate_limit, str): + rate_limit = limit_parse(rate_limit) + + rate_limit = rate_limit or _rate_limit + + if rate_limit.amount < weight and raise_impossible_limits: + raise ValueError( + f"Weight ({weight}) > RateLimit ({rate_limit}), cannot satisfy rate" + " limit." + ) + while True: + elapsed = 0.0 + while ( + not ( + await self.rate_limiter.test( + rate_limit, + new_namespace, + primary_key, + cost=min(weight, rate_limit.amount), + ) + ) + and elapsed < acquire_timeout + ): + await asyncio.sleep(self.WAIT_INCREMENT) + elapsed += self.WAIT_INCREMENT + if elapsed >= acquire_timeout: + raise TimeoutError( + f"Timeout ({elapsed} secs): rate limit for key: {namespace_and_key}" + ) + + # If the rate limit hit is False, then we're violating the limit, so we + # need to wait again. This can happen in race conditions. + if await self.rate_limiter.hit( + rate_limit, + new_namespace, + primary_key, + cost=min(weight, rate_limit.amount), + ): + # we need to keep trying when we have an "impossible" limit + if rate_limit.amount < weight: + weight -= rate_limit.amount + acquire_timeout = max(acquire_timeout - elapsed, 1.0) + continue + break + acquire_timeout = max(acquire_timeout - elapsed, 1.0) + + +GLOBAL_LIMITER = GlobalRateLimiter() diff --git a/src/llmclient/result.py b/llmclient/types.py similarity index 50% rename from src/llmclient/result.py rename to llmclient/types.py index c5571cc..d4eeea1 100644 --- a/src/llmclient/result.py +++ b/llmclient/types.py @@ -1,19 +1,16 @@ +import contextvars +import logging +from contextlib import contextmanager +from datetime import datetime +from uuid import UUID, uuid4 + +import litellm from pydantic import ( BaseModel, - Field, ConfigDict, + Field, computed_field, ) -from typing import Union, List, Optional -from uuid import UUID, uuid4 -from datetime import datetime -from contextlib import contextmanager - -import contextvars -import litellm -import logging - -from aviary.core import Message logger = logging.getLogger(__name__) @@ -30,61 +27,64 @@ def set_llm_session_ids(session_id: UUID): cvar_session_id.reset(token) +class Embeddable(BaseModel): + embedding: list[float] | None = Field(default=None, repr=False) + + +class Chunk(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + text: str | None + prompt_tokens: int + completion_tokens: int + + def __str__(self): + return self.text + + class LLMResult(BaseModel): - """A unified class to hold the result of a LLM completion, replacing two prior versions.""" + """A class to hold the result of a LLM completion. 
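# A minimal sketch of reading token counts and cost off an LLMResult; the values are
# hypothetical, and `cost` is the computed field defined later in this class, backed by
# litellm's model cost lookup.
#
#     result = LLMResult(model="gpt-4o-mini", prompt_count=10, completion_count=20)
#     print(result.cost)  # estimated USD; 0.0 if litellm has no pricing for the model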
+ + To associate a group of LLMResults, you can use the `set_llm_session_ids` context manager: + + ```python + my_session_id = uuid4() + with set_llm_session_ids(my_session_id): + # code that generates LLMResults + pass + ``` + + and all the LLMResults generated within the context will have the same `session_id`. + This can be combined with LLMModels `llm_result_callback` to store all LLMResults. + """ + + model_config = ConfigDict(populate_by_name=True) id: UUID = Field(default_factory=uuid4) - model_config: ConfigDict = ConfigDict(populate_by_name=True) - name: Optional[str] = None - model: str = "" - text: str = "" - prompt_count: int = Field(default=0, description="Count of prompt tokens.") - completion_count: int = Field(default=0, description="Count of completion tokens.") - date: str = Field(default_factory=lambda: datetime.now().isoformat()) - seconds_to_first_token: Optional[float] = Field( - default=0.0, description="Delta time (sec) to first response token's arrival." - ) - seconds_to_last_token: float = Field( - default=0.0, description="Delta time (sec) to last response token's arrival." - ) - system_fingerprint: Optional[str] = Field( - default=None, description="System fingerprint received from the LLM." + session_id: UUID | None = Field( + default_factory=cvar_session_id.get, + description="A persistent ID to associate a group of LLMResults", + alias="answer_id", ) - prompt: Union[str, List[dict], List[Message], None] = Field( + name: str | None = None + prompt: str | list[dict] | None = Field( default=None, description="Optional prompt (str) or list of serialized prompts (list[dict]).", ) - config: Optional[dict] = None - messages: Optional[List[Message]] = Field( - default=None, description="Messages received from the LLM." - ) - session_id: Optional[UUID] = Field( - default_factory=cvar_session_id.get, - description="A persistent ID to associate a group of LLMResults", - alias="answer_id", + text: str = "" + prompt_count: int = 0 + completion_count: int = 0 + model: str + date: str = Field(default_factory=datetime.now().isoformat) + seconds_to_first_token: float = Field( + default=0.0, description="Delta time (sec) to first response token's arrival." ) - logprob: Optional[float] = Field( - default=None, description="Sum of logprobs in the completion." + seconds_to_last_token: float = Field( + default=0.0, description="Delta time (sec) to last response token's arrival." ) - finish_reason: str = "" - @property - def prompt_and_completion_costs(self) -> tuple[float, float]: - """Get a two-tuple of prompt tokens cost and completion tokens cost, in USD.""" - return litellm.cost_per_token( - self.model, - prompt_tokens=self.prompt_count, - completion_tokens=self.completion_count, - ) - - @property - def provider(self) -> str: - """Get the model provider's name (e.g. 
'openai', 'mistral').""" - return litellm.get_llm_provider(self.model)[1] - - def get_supported_openai_params(self) -> Optional[List[str]]: - """Get the supported OpenAI parameters for the model.""" - return litellm.get_supported_openai_params(self.model) + def __str__(self) -> str: + return self.text @computed_field # type: ignore[prop-decorator] @property @@ -98,6 +98,3 @@ def cost(self) -> float: except KeyError: logger.warning(f"Could not find cost for model {self.model}.") return 0.0 - - def __str__(self) -> str: - return self.text diff --git a/src/llmclient/util.py b/llmclient/utils.py similarity index 54% rename from src/llmclient/util.py rename to llmclient/utils.py index 41665e0..304bf95 100644 --- a/src/llmclient/util.py +++ b/llmclient/utils.py @@ -1,10 +1,15 @@ import base64 -import io import contextlib - -from collections.abc import Callable, Iterable -from typing import Any +import io +import logging +import logging.config +from collections.abc import Callable from inspect import iscoroutinefunction, isfunction, signature +from typing import Any + +import litellm +import numpy as np +import pymupdf def encode_image_to_base64(img: "np.ndarray") -> str: @@ -25,20 +30,9 @@ def encode_image_to_base64(img: "np.ndarray") -> str: ) -async def do_callbacks( - callbacks: Iterable[Callable[..., Any]], - chunk: str, - name: str = None, -) -> None: - for f in callbacks: - args, kwargs = prepare_args(f, chunk, name) - if iscoroutinefunction(f): - await f(*args, **kwargs) - else: - f(*args, **kwargs) - - -def prepare_args(func: Callable, chunk: str, name: str = None) -> tuple[tuple, dict]: +def prepare_args( + func: Callable, chunk: str, name: str | None = None +) -> tuple[tuple, dict]: with contextlib.suppress(TypeError): if "name" in signature(func).parameters: return (chunk,), {"name": name} @@ -59,3 +53,30 @@ def partial_format(value: str, **formats: dict[str, Any]) -> str: with contextlib.suppress(KeyError): value = value.format(**{template_key: template_value}) return value + + +def setup_default_logs() -> None: + """Configure logs to reasonable defaults.""" + # Trigger PyMuPDF to use Python logging + # SEE: https://pymupdf.readthedocs.io/en/latest/app3.html#diagnostics + pymupdf.set_messages(pylogging=True) + + # Set sane default LiteLLM logging configuration + # SEE: https://docs.litellm.ai/docs/observability/telemetry + litellm.telemetry = False + + logging.config.dictConfig( + { + "version": 1, + "disable_existing_loggers": False, + # Lower level for verbose logs + "loggers": { + "httpcore": {"level": "WARNING"}, + "httpx": {"level": "WARNING"}, + # SEE: https://github.com/BerriAI/litellm/issues/2256 + "LiteLLM": {"level": "WARNING"}, + "LiteLLM Router": {"level": "WARNING"}, + "LiteLLM Proxy": {"level": "WARNING"}, + }, + } + ) diff --git a/pyproject.toml b/pyproject.toml index 3fa1c51..74d95a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,9 +5,9 @@ requires = ["setuptools>=64", "setuptools_scm>=8"] [dependency-groups] codeflash = [ "codeflash>=0.7", # Pin to keep recent - "llm-client[dev]", + "fh-llm-client[dev]", ] -dev = ["llm-client[dev]"] +dev = ["fh-llm-client[dev]"] [project] authors = [ @@ -22,42 +22,42 @@ classifiers = [ "Programming Language :: Python", ] dependencies = [ + "PyMuPDF>=1.24.12", # For pymupdf.set_messages addition "aiofiles", + "coredis", "dm-tree", "fhaviary>=0.8.2", # For core namespace "httpx", - "litellm>=1.40.15", # For LITELLM_LOG addition + "limits", + "litellm>=1.44", # For LITELLM_LOG addition "networkx[default]~=3.4", # Pin for pydot 
fix - "numpy>=1.20", # For numpy.typing - "openai>=1", + "numpy", "pydantic~=2.0", "tenacity", - "tiktoken", + "tiktoken>=0.4.0", "tqdm", "typing-extensions; python_version <= '3.11'", # for typing.override "usearch>=2.13", # For py.typed ] -description = "Agent framework for constructing language model agents and training on constructive tasks." +description = "A client to provide LLM responses for FutureHouse applications." dynamic = ["version"] license = {file = "LICENSE"} -name = "llm-client" +name = "fh-llm-client" readme = "README.md" requires-python = ">=3.11" [project.optional-dependencies] dev = [ + "fh-llm-client[monitor,nn,rich,server,typing,visualization,local]", "fhaviary[xml]", "ipython>=8", # Pin to keep recent - "llm-client[monitor,nn,rich,server,typing,visualization]", - "litellm!=1.49.4,!=1.49.5,!=1.49.6", # For https://github.com/BerriAI/litellm/issues/6216 "mypy>=1.8", # Pin for mutable-override - "openai<1.47", # Pin for https://github.com/BerriAI/litellm/issues/5854 "pre-commit>=3.4", # Pin to keep recent - "pydantic~=2.9", # Pydantic 2.9 changed JSON schema exports 'allOf', so ensure tests match + "pydantic~=2.0", "pylint-pydantic", - "pylint>=3.2", # Pin to keep recent "pytest-asyncio", "pytest-mock", + "pytest-mock", "pytest-recording", "pytest-rerunfailures", "pytest-subtests", @@ -65,8 +65,12 @@ dev = [ "pytest-timer[colorama]", "pytest-xdist", "pytest>=8", # Pin to keep recent + "python-dotenv", "refurb>=2", # Pin to keep recent ] +local = [ + "sentence-transformers", +] monitor = [ "wandb", ] @@ -232,6 +236,7 @@ disable = [ "too-many-positional-arguments", # Don't care to enforce this "too-many-return-statements", # Rely on ruff PLR0911 for this "too-many-statements", # Rely on ruff PLR0915 for this + "undefined-loop-variable", # Don't care to enforce this "ungrouped-imports", # Rely on ruff I001 for this "unidiomatic-typecheck", # Rely on ruff E721 for this "unreachable", # Rely on mypy unreachable for this @@ -258,9 +263,6 @@ score = false min-similarity-lines = 12 [tool.pytest.ini_options] -# Add the specified OPTS to the set of command line arguments as if they had been -# specified by the user. -addopts = "--doctest-modules" # Sets a list of filters and actions that should be taken for matched warnings. # By default all warnings emitted during the test session will be displayed in # a summary at the end of the test session. @@ -268,13 +270,15 @@ filterwarnings = [ "ignore:Support for class-based `config` is deprecated, use ConfigDict instead", # SEE: https://github.com/BerriAI/litellm/issues/5648 "ignore:The `dict` method is deprecated; use `model_dump` instead", # SEE: https://github.com/BerriAI/litellm/issues/5987 "ignore:Use 'content=<...>' to upload raw bytes/text content:DeprecationWarning", # SEE: https://github.com/BerriAI/litellm/issues/5986 + "ignore:builtin type (SwigPyPacked|SwigPyObject|swigvarlink) has no __module__:DeprecationWarning:importlib._bootstrap", # SEE: https://github.com/pymupdf/PyMuPDF/issues/3931 --> https://github.com/swig/swig/issues/2881#issuecomment-2332652634 'ignore:open_text is deprecated. 
Use files\(\) instead:DeprecationWarning', # SEE: https://github.com/BerriAI/litellm/issues/5647 + 'ignore:pkg_resources is deprecated as an API.:DeprecationWarning:pybtex', # SEE: https://bitbucket.org/pybtex-devs/pybtex/issues/169/replace-pkg_resources-with ] # List of directories that should be searched for tests when no specific directories, # files or test ids are given in the command line when executing pytest from the rootdir # directory. File system paths may use shell-style wildcards, including the recursive ** # pattern. -testpaths = ["src", "tests"] +testpaths = ["tests"] [tool.refurb] enable_all = true @@ -292,11 +296,7 @@ ignore = [ [tool.ruff] # Line length to use when enforcing long-lines violations (like `E501`). -line-length = 88 -# The minimum Python version to target, e.g., when considering automatic code -# upgrades, like rewriting type annotations. Ruff will not propose changes -# using features that are not available in the given version. -target-version = "py311" +line-length = 120 # Enable application of unsafe fixes. unsafe-fixes = true @@ -386,9 +386,9 @@ ignore = [ "S311", # Ok to use python random "SLF001", # Overly pedantic "T201", # Overly pedantic - "TCH001", # TCH001, TCH002, TCH003: don't care to enforce type checking blocks - "TCH002", - "TCH003", + "TC001", # TCH001, TCH002, TCH003: don't care to enforce type checking blocks + "TC002", + "TC003", "TD002", # Don't care for TODO author "TD003", # Don't care for TODO links "TRY003", # Overly pedantic @@ -412,6 +412,7 @@ mypy-init-return = true "F841", # Tests can have unused locals "N802", # Tests function names can match class names "PLR2004", # Tests can have magic values + "S301", # can test pickle ] "docs/**.ipynb" = [ "PLE1142", # allow async @@ -432,14 +433,16 @@ max-line-length = 120 convention = "google" [tool.setuptools.packages.find] -where = ["src"] +exclude = ["tests"] +include = ["llmclient"] +where = ["."] [tool.setuptools_scm] -version_file = "src/llmclient/version.py" +version_file = "llmclient/version.py" [tool.tomlsort] all = true in_place = true spaces_before_inline_comment = 2 # Match Python PEP 8 spaces_indent_inline_array = 4 # Match Python PEP 8 -trailing_comma_inline_array = true \ No newline at end of file +trailing_comma_inline_array = true diff --git a/src/llmclient/__init__.py b/src/llmclient/__init__.py deleted file mode 100644 index df3cc4a..0000000 --- a/src/llmclient/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from llmclient.model import LLMModel -from llmclient.result import LLMResult - -__all__ = [ - "LLMModel", - "LLMResult", -] diff --git a/src/llmclient/model.py b/src/llmclient/model.py deleted file mode 100644 index c2c433a..0000000 --- a/src/llmclient/model.py +++ /dev/null @@ -1,504 +0,0 @@ -import asyncio -import json -from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable -from typing import Any, AsyncIterable, ClassVar, Self, cast - -import litellm -from aviary.core import ( - Message, - Tool, - ToolRequestMessage, - ToolsAdapter, -) -from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator - -from llmclient.constants import default_system_prompt -from llmclient.result import LLMResult -from llmclient.util import do_callbacks, is_coroutine_callable - - -class Chunk(BaseModel): - model_config = ConfigDict(extra="forbid", frozen=True) - - text: str | None - prompt_tokens: int - completion_tokens: int - - def __str__(self): - return self.text - - -class JSONSchemaValidationError(ValueError): - """Raised when the completion 
does not match the specified schema.""" - - -def sum_logprobs(choice: litellm.utils.Choices) -> float | None: - """Calculate the sum of the log probabilities of an LLM completion (a Choices object). - - Args: - choice: A sequence of choices from the completion. - - Returns: - The sum of the log probabilities of the choice. - """ - try: - logprob_obj = choice.logprobs - except AttributeError: - return None - if isinstance(logprob_obj, dict): - if logprob_obj.get("content"): - return sum( - logprob_info["logprob"] for logprob_info in logprob_obj["content"] - ) - elif choice.logprobs.content: - return sum(logprob_info.logprob for logprob_info in choice.logprobs.content) - return None - - -def validate_json_completion( - completion: litellm.ModelResponse, output_type: type[BaseModel] -) -> None: - """Validate a completion against a JSON schema. - - Args: - completion: The completion to validate. - output_type: The Pydantic model to validate the completion against. - """ - try: - for choice in completion.choices: - if not hasattr(choice, "message") or not choice.message.content: - continue - # make sure it is a JSON completion, even if None - # We do want to modify the underlying message - # so that users of it can just parse it as expected - choice.message.content = ( - choice.message.content.split("```json")[-1].split("```")[0] or "" - ) - output_type.model_validate_json(choice.message.content) - except ValidationError as err: - raise JSONSchemaValidationError( - "The completion does not match the specified schema." - ) from err - - -class LLMModel(BaseModel): - """Run n completions at once, all starting from the same messages.""" - - model_config = ConfigDict(extra="forbid") - - # this should keep the original model - # if fine-tuned, this should still refer to the base model - name: str = "unknown" - llm_type: str | None = None - llm_result_callback: ( - Callable[[LLMResult], None] | Callable[[LLMResult], Awaitable[None]] | None - ) = Field( - default=None, - description=( - "An async callback that will be executed on each" - " LLMResult (different than callbacks that execute on each chunk)" - ), - exclude=True, - ) - config: dict = Field( - default={ - "model": "gpt-3.5-turbo", # Default model should have cheap input/output for testing - "temperature": 0.1, - } - ) - encoding: Any | None = None - - def __str__(self) -> str: - return f"{type(self).__name__} {self.name}" - - def infer_llm_type(self) -> str: - return "completion" - - def count_tokens(self, text: str) -> int: - return len(text) // 4 # gross approximation - - async def run_prompt( - self, - prompt: str, - data: dict, - callbacks: list[Callable] | None = None, - name: str | None = None, - skip_system: bool = False, - system_prompt: str = default_system_prompt, - ) -> LLMResult: - if not self.llm_type: - self.llm_type = self.infer_llm_type() - - run = getattr(self, "_run_" + self.llm_type) - if not run: - raise ValueError(f"Unknown llm_type {self.llm_type!r}.") - - return await run(prompt, data, callbacks, name, skip_system, system_prompt) - - async def get_result(self, usage, result, output, start_clock): - if sum(usage) > 0: - result.prompt_count, result.completion_count = usage - elif output: - result.completion_count = self.count_tokens(output) - - result.text = output - result.seconds_to_last_token = asyncio.get_running_loop().time() - start_clock - - if self.llm_result_callback: - if is_coroutine_callable(self.llm_result_callback): - await self.llm_result_callback(result) # type: ignore[misc] - else: - 
self.llm_result_callback(result) - return result - - async def add_chunk_text( - self, result, callbacks, chunk, text_result, start_clock, name - ): - if not chunk.text: - return - - if result.seconds_to_first_token == 0: - result.seconds_to_first_token = ( - asyncio.get_running_loop().time() - start_clock - ) - - text_result.append(chunk.text) - await do_callbacks(callbacks, chunk.text, name) - - async def _run_chat( - self, - prompt: str, - data: dict, - callbacks: list[Callable] | None = None, - name: str | None = None, - skip_system: bool = False, - system_prompt: str = default_system_prompt, - ) -> LLMResult: - """Run a chat prompt. - - Args: - prompt: Prompt to use. - data: Keys for the input variables that will be formatted into prompt. - callbacks: Optional functions to call with each chunk of the completion. - name: Optional name for the result. - skip_system: Set True to skip the system prompt. - system_prompt: System prompt to use. - - Returns: - Result of the chat. - """ - system_message_prompt = {"role": "system", "content": system_prompt} - human_message_prompt = {"role": "user", "content": prompt} - messages = [ - {"role": m["role"], "content": m["content"].format(**data)} - for m in ( - [human_message_prompt] - if skip_system - else [system_message_prompt, human_message_prompt] - ) - ] - result = LLMResult( - model=self.name, - name=name, - prompt=messages, - prompt_count=( - sum(self.count_tokens(m["content"]) for m in messages) - + sum(self.count_tokens(m["role"]) for m in messages) - ), - ) - - start_clock = asyncio.get_running_loop().time() - if not callbacks: - chunk = await self.achat(messages) - output = chunk.text - else: - completion = await self.achat_iter(messages) # type: ignore[misc] - text_result = [] - async for chunk in completion: - await self.add_chunk_text( - result, callbacks, chunk, text_result, start_clock, name - ) - output = "".join(text_result) - - usage = chunk.prompt_tokens, chunk.completion_tokens - return await self.get_result(usage, result, output, start_clock) - - async def _run_completion( - self, - prompt: str, - data: dict, - callbacks: Iterable[Callable] | None = None, - name: str | None = None, - skip_system: bool = False, - system_prompt: str = default_system_prompt, - ) -> LLMResult: - """Run a completion prompt. - - Args: - prompt: Prompt to use. - data: Keys for the input variables that will be formatted into prompt. - callbacks: Optional functions to call with each chunk of the completion. - name: Optional name for the result. - skip_system: Set True to skip the system prompt. - system_prompt: System prompt to use. - - Returns: - Result of the completion. 
- """ - formatted_prompt: str = ( - prompt if skip_system else system_prompt + "\n\n" + prompt - ).format(**data) - result = LLMResult( - model=self.name, - name=name, - prompt=formatted_prompt, - prompt_count=self.count_tokens(formatted_prompt), - ) - - start_clock = asyncio.get_running_loop().time() - if not callbacks: - chunk = await self.acomplete(formatted_prompt) - output = chunk.text - else: - completion = self.acomplete_iter(formatted_prompt) - text_result = [] - async for chunk in completion: - await self.add_chunk_text( - result, callbacks, chunk, text_result, start_clock, name - ) - output = "".join(text_result) - - usage = chunk.prompt_tokens, chunk.completion_tokens - return await self.get_result(usage, result, output, start_clock) - - @model_validator(mode="after") - def set_model_name(self) -> Self: - if self.name != "unknown" and self.config.get("model", "unknown") in ( - "gpt-3.5-turbo", - None, - ): - self.config["model"] = self.name - elif "model" in self.config and self.name == "unknown": - self.name = self.config["model"] - # note we do not consider case where both are set - # because that could be true if the model is fine-tuned - return self - - async def acomplete(self, prompt: str) -> Chunk: - """Return the completion as string and the number of tokens in the prompt and completion.""" - raise NotImplementedError - - async def acomplete_iter(self, prompt: str) -> AsyncIterable[Chunk]: # noqa: ARG002 - """Return an async generator that yields chunks of the completion. - - Only the last tuple will be non-zero. - """ - raise NotImplementedError - - async def achat( - self, messages: Iterable[Message], **kwargs - ) -> litellm.ModelResponse: - return await litellm.acompletion( - messages=[m.model_dump(by_alias=True) for m in messages], - **(self.config | kwargs), - ) - - async def achat_iter(self, messages: Iterable[Message], **kwargs) -> AsyncGenerator: - return cast( - AsyncGenerator, - await litellm.acompletion( - messages=[m.model_dump(by_alias=True) for m in messages], - stream=True, - stream_options={ - "include_usage": True, # Included to get prompt token counts - }, - **(self.config | kwargs), - ), - ) - - # SEE: https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice - # > `required` means the model must call one or more tools. 
- TOOL_CHOICE_REQUIRED: ClassVar[str] = "required" - - async def handle_callbacks( - self, tools, n, chat_kwargs, prompt, callbacks, messages, start_clock, results - ): - if tools: - raise NotImplementedError("Using tools with callbacks is not supported") - if n > 1: - raise NotImplementedError( - "Multiple completions with callbacks is not supported" - ) - result = LLMResult(model=self.name, config=chat_kwargs, prompt=prompt) - stream_completion = await self.achat_iter(messages, **chat_kwargs) - role = "assistant" - text_result = [] - - async for chunk in stream_completion: - delta = chunk.choices[0].delta - role = delta.role or role - if hasattr(chunk, "usage"): - result.prompt_count = chunk.usage.prompt_tokens - - if not delta.content: - continue - - if result.seconds_to_first_token == 0: - result.seconds_to_first_token = ( - asyncio.get_running_loop().time() - start_clock - ) - - text_result.append(delta.content) - await do_callbacks(callbacks, delta.content) - - output = "".join(text_result) - result.completion_count = litellm.token_counter( - model=self.name, - text=output, - ) - # TODO: figure out how tools stream, and log probs - result.messages = [Message(role=role, content=output)] - results.append(result) - - async def handle_no_callbacks( - self, tools, chat_kwargs, prompt, results, output_type - ): - completion: litellm.ModelResponse = await self.achat(prompt, **chat_kwargs) - if output_type: - validate_json_completion(completion, output_type) - - for choice in completion.choices: - if isinstance(choice, litellm.utils.StreamingChoices): - raise NotImplementedError("Streaming is not yet supported.") - - if ( - tools is not None # Allows for empty tools list - or choice.finish_reason == "tool_calls" - or (getattr(choice.message, "tool_calls", None) is not None) - ): - serialized_choice_message = choice.message.model_dump() - serialized_choice_message["tool_calls"] = ( - serialized_choice_message.get("tool_calls") or [] - ) - output_messages: list[Message | ToolRequestMessage] = [ - ToolRequestMessage(**serialized_choice_message) - ] - else: - output_messages = [Message(**choice.message.model_dump())] - - results.append( - LLMResult( - model=self.name, - config=chat_kwargs, - prompt=prompt, - messages=output_messages, - logprob=sum_logprobs(choice), - system_fingerprint=completion.system_fingerprint, - # Note that these counts are aggregated over all choices - completion_count=completion.usage.completion_tokens, # type: ignore[attr-defined,unused-ignore] - prompt_count=completion.usage.prompt_tokens, # type: ignore[attr-defined,unused-ignore] - ) - ) - - async def call( # noqa: C901, PLR0915 - self, - messages: list[Message], - callbacks: list[Callable] | None = None, - output_type: type[BaseModel] | None = None, - tools: list[Tool] | None = None, - tool_choice: Tool | str | None = TOOL_CHOICE_REQUIRED, - **chat_kwargs, - ) -> list[LLMResult]: - start_clock = asyncio.get_running_loop().time() - - # Deal with tools. 
OpenAI throws an error if tool list is empty, - # so skip this block if tools in (None, []) - if tools: - chat_kwargs["tools"] = ToolsAdapter.dump_python( - tools, exclude_none=True, by_alias=True - ) - if tool_choice is not None: - chat_kwargs["tool_choice"] = ( - { - "type": "function", - "function": {"name": tool_choice.info.name}, - } - if isinstance(tool_choice, Tool) - else tool_choice - ) - - # deal with specifying output type - if output_type: - schema = json.dumps(output_type.model_json_schema(mode="serialization")) - schema_msg = f"Respond following this JSON schema:\n\n{schema}" - # Get the system prompt and its index, or the index to add it - i, system_prompt = next( - ((i, m) for i, m in enumerate(messages) if m.role == "system"), - (0, None), - ) - messages = [ - *messages[:i], - ( - system_prompt.append_text(schema_msg, inplace=False) - if system_prompt - else Message(role="system", content=schema_msg) - ), - *messages[i + 1 if system_prompt else i :], - ] - chat_kwargs["response_format"] = {"type": "json_object"} - - # add static configuration to kwargs - chat_kwargs = self.config | chat_kwargs - n = chat_kwargs.get("n", 1) # number of completions - if n < 1: - raise ValueError("Number of completions (n) must be >= 1.") - - prompt = [ - ( - m - if not isinstance(m, ToolRequestMessage) or m.tool_calls - # OpenAI doesn't allow for empty tool_calls lists, so downcast empty - # ToolRequestMessage to Message here - else Message(role=m.role, content=m.content) - ) - for m in messages - ] - results: list[LLMResult] = [] - - if callbacks: - await self.handle_callbacks( - tools, n, chat_kwargs, prompt, callbacks, messages, start_clock, results - ) - else: - await self.handle_no_callbacks( - tools, chat_kwargs, prompt, results, output_type - ) - - if not results: - # This happens in unit tests. We should probably not keep this block around - # long-term. Previously, we would emit an empty ToolRequestMessage if - # completion.choices were empty, so I am replicating that here. 
- results.append( - LLMResult( - model=self.name, - config=chat_kwargs, - prompt=prompt, - messages=[ToolRequestMessage(tool_calls=[])], - ) - ) - - end_clock = asyncio.get_running_loop().time() - - for result in results: - # Manually update prompt count if not set, which can - # happen if the target model doesn't support 'include_usage' - if not result.prompt_count: - result.prompt_count = litellm.token_counter( - model=self.name, - messages=[m.model_dump() for m in result.messages], # type: ignore[union-attr] - ) - - # update with server-side counts - result.seconds_to_last_token = end_clock - start_clock - - return results diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/cassettes/TestLiteLLMModel.test_max_token_truncation[with-router].yaml b/tests/cassettes/TestLiteLLMModel.test_max_token_truncation[with-router].yaml new file mode 100644 index 0000000..6a2c0d8 --- /dev/null +++ b/tests/cassettes/TestLiteLLMModel.test_max_token_truncation[with-router].yaml @@ -0,0 +1,103 @@ +interactions: + - request: + body: + '{"messages": [{"role": "user", "content": "Please tell me a story"}], "model": + "gpt-4o-mini", "max_tokens": 3}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "110" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4ySUUvDMBSF3/srQp5b6bp13fomE3wRRBEUREqW3LaZaRKTFBxj/13SdmuHE3zJ + w/3uOTn3JocAIcwZzhGmNXG00SK6fSNfyw1/TWdsV64fnjYZZC/ts7in+/oOh16htjug7qS6oarR + AhxXssfUAHHgXWfZPEmX62y96kCjGAgvq7SLFipquORREieLKM6i2WpQ14pTsDhH7wFCCB260+eU + DL5xjuLwVGnAWlIBzs9NCGGjhK9gYi23jkiHwxFSJR3ILvqjpIBarSQi0w4DZWuJTylbIYb68Xyl + UJU2amsHfq6XXHJbFwaIVdLbC5CVq3HHjwFCH91w7UVerI1qtCuc+gTpLWdJb4jHlY5wPjCnHBET + TRpeMSsYOMKFnewGU0JrYKNyXCRpGVcTEEyG/p3lmnc/OJfVf+xHQCloB6zQBhinl/OObQb8f/ur + 7bziLjC2e+ugKUouKzDa8P61S13EWZxuy1VGYxwcgx8AAAD//wMAIAzc+vsCAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e84b2f90c81230e-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Mon, 25 Nov 2024 21:23:18 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=KqKw89zgaG32GNn3lg4IvjG2X2zLmPKRiY1oedcDUVM-1732569798-1.0.1.1-y_oblt_Jp3n1T.HtHFHrxbRegDqoC8gojQPBSV52IMBH.bx8c0QNAUrWotLzzQGqqbIDjdhl0AUutzvWk20psg; + path=/; expires=Mon, 25-Nov-24 21:53:18 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=F_vLQWJJbY8GvEB4YIomOCy2NMswE7Ex8TL0Z4OIxgg-1732569798934-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "256" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29997" + 
x-ratelimit-remaining-tokens: + - "149998170" + x-ratelimit-reset-requests: + - 4ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_77977a66fa96e40ffe5c3bc7840c0948 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLiteLLMModel.test_max_token_truncation[without-router].yaml b/tests/cassettes/TestLiteLLMModel.test_max_token_truncation[without-router].yaml new file mode 100644 index 0000000..1b1690c --- /dev/null +++ b/tests/cassettes/TestLiteLLMModel.test_max_token_truncation[without-router].yaml @@ -0,0 +1,103 @@ +interactions: + - request: + body: + '{"messages": [{"role": "user", "content": "Please tell me a story"}], "model": + "gpt-4o-mini", "max_tokens": 3}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "110" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4ySMW+DMBCFd36F5RkqQkJI2DpnqBSpUtqqQo59gFtju7aRkkb575UhAaKmUhcP + 9917fnf2KUAIc4ZzhGlNHG20iB535GvewmZ53MR8szhu29ftLnmx3/T5UOHQK9T+A6i7qh6oarQA + x5XsMTVAHHjXWTZP0uU6W6cdaBQD4WWVdtFCRQ2XPEriZBHFWTRbXdS14hQsztFbgBBCp+70OSWD + A85RHF4rDVhLKsD50IQQNkr4CibWcuuIdDgcIVXSgeyiP0kKqNVKIjLtMFC2lviUshXiUj8PVwpV + aaP29sKHesklt3VhgFglvb0AWbkad/wcIPTeDdfe5MXaqEa7wqlPkN5ylvSGeFzpCOcX5pQjYqJJ + wztmBQNHuLCT3WBKaA1sVI6LJC3jagKCydC/s9zz7gfnsvqP/QgoBe2AFdoA4/R23rHNgP9vf7UN + K+4CY3u0Dpqi5LICow3vX7vURZzF6b5cZTTGwTn4AQAA//8DAGJIF8n7AgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e84b2e46a3e5c18-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Mon, 25 Nov 2024 21:23:15 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=pmbS4O0SdzCjzvZjensXpq5w1I1GUEOUOh_2ExJ8_Rc-1732569795-1.0.1.1-RsW0ExCXu..OFPcHXSvL3vh7_PqZu9gX0DgJI0BGjr2oborEPzdC6ZSsqZTfP9zf3YOigH1hcfDePksbYyIO8A; + path=/; expires=Mon, 25-Nov-24 21:53:15 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=v_wT.VKOzzmIot1JtDgHglmHPmgOB.YvZxQznjpUEiA-1732569795663-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "150" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999990" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_817b41328aed9a1e4a84bd7ec79e22f7 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLiteLLMModel.test_run_prompt[with-router].yaml b/tests/cassettes/TestLiteLLMModel.test_run_prompt[with-router].yaml new file mode 100644 index 0000000..e234281 --- /dev/null +++ b/tests/cassettes/TestLiteLLMModel.test_run_prompt[with-router].yaml @@ 
-0,0 +1,495 @@ +interactions: + - request: + body: + '{"messages": [{"role": "user", "content": "The duck says"}], "model": "gpt-4o-mini", + "max_tokens": 56, "stream": true, "stream_options": {"include_usage": true}, + "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "179" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: + 'data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"The"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + duck"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + says"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + \""},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"qu"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"ack"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"!\""},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + What"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: 
{"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + else"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + would"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + you"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + like"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + to"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + know"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + about"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + ducks"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + or"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + their"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + sounds"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"?"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: 
{"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3wHeRRIuOofrbBi9d44mS2uhs","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[],"usage":{"prompt_tokens":10,"completion_tokens":20,"total_tokens":30,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}} + + + data: [DONE] + + + ' + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e84b2e5a826fb34-SJC + Connection: + - keep-alive + Content-Type: + - text/event-stream; charset=utf-8 + Date: + - Mon, 25 Nov 2024 21:23:15 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=EPEcKxQAwr.FHIAtJH7QaNJ8DYP5ttMHPLWWG9mlovI-1732569795-1.0.1.1-Zi_lzLa.4UpZcO4ApjlwTQCvgvKFV5K08QpJsarCNOrGVdpVg732lx_eRJlTSq3F0xrwgvKi4S.YOGn1drcoqQ; + path=/; expires=Mon, 25-Nov-24 21:53:15 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=na9.ja6VE.k8hCQXCpou8cLN_mMFPF5vEW9J4lpkdCI-1732569795943-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "374" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999938" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_43f164b53abd2b51e63195007476701e + status: + code: 200 + message: OK + - request: + body: + '{"messages": [{"role": "user", "content": "The duck says"}], "model": "gpt-4o-mini", + "max_tokens": 56, "stream": false, "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "137" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4xSTY/TMBS851c8fG5Q2m03214QEuLICcFqWRS59ktj4vi5fvaWsup/R0k/0gqQ + uPgw82Y88+zXDEAYLVYgVCOj6rzN3z/K7bz79FKUH7b7Rfu05OnT7teX9PL4ca3FpFfQ+geqeFa9 + VdR5i9GQO9IqoIzYu07Lu9niflku7weiI422l218zOeUd8aZfFbM5nlR5tOHk7oho5DFCr5lAACv + w9nndBp/ihUUkzPSIbPcoFhdhgBEINsjQjIbjtJFMRlJRS6iG6J/bhB0Ui2w3DM8i22Sqn3zLOBr + IyOgZYQdJathTwmsaREiQetoB3JNKQ5aBgoQGzQBmJLT/O76soB1YtkXdsnaE364pLe08YHWfOIv + eG2c4aYKKJlcn5QjeTGwhwzg+7CldFNc+ECdj1WkFl1vOD0tSYxvM5KzMxkpSjvid2f8xq3SGKWx + fLVloaRqUI/K8Ulk0oauiOyq859h/uZ97G3c5n/sR0Ip9BF15QNqo24Lj2MB+5/7r7HLjofAgvcc + satq4zYYfDDHf1P7qiiLxbp+KFUhskP2GwAA//8DALtwqC9FAwAA + headers: + CF-Cache-Status: + - DYNAMIC 
+ CF-RAY: + - 8e84b2ea2b04172a-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Mon, 25 Nov 2024 21:23:16 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=VzsSiHm0Z_BUTeSN65gluQiQrzN30OY3FbFYc2f2GGI-1732569796-1.0.1.1-qLA_SlZNrcg_rchFZXQL8x1i44Xf0JSNTu5cVR2qroTv40NhPne58JGlR6an_biXT6kILUb7UkRQuFktt2.CbA; + path=/; expires=Mon, 25-Nov-24 21:53:16 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=EBPncJv5hpEuGElVMHoZQPGVCQJ0dmgebSzO3h1NrW4-1732569796811-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "510" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999938" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_12734547c5251069ac4d6765baab7c39 + status: + code: 200 + message: OK + - request: + body: + '{"messages": [{"role": "user", "content": "The duck says"}], "model": "gpt-4o-mini", + "max_tokens": 56, "stream": true, "stream_options": {"include_usage": true}, + "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "179" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: + 'data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"The"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + duck"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + says"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: 
{"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + \""},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"qu"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"ack"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"!\""},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + It''s"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + a"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + classic"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + sound"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + associated"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + with"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + ducks"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"."},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: 
{"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + Is"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + there"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + something"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + specific"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + you''d"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + like"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + to"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + know"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + about"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + ducks"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + or"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + their"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: 
{"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + sounds"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"?"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}],"usage":null} + + + data: {"id":"chatcmpl-AXaq4vpiIBmDV1ZDl16ItHXocRR4R","object":"chat.completion.chunk","created":1732569796,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[],"usage":{"prompt_tokens":10,"completion_tokens":29,"total_tokens":39,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}} + + + data: [DONE] + + + ' + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e84b2ee2ea2fb34-SJC + Connection: + - keep-alive + Content-Type: + - text/event-stream; charset=utf-8 + Date: + - Mon, 25 Nov 2024 21:23:17 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "223" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999938" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_718a08157d6246d33e3155fab603aa96 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLiteLLMModel.test_run_prompt[without-router].yaml b/tests/cassettes/TestLiteLLMModel.test_run_prompt[without-router].yaml new file mode 100644 index 0000000..082fa72 --- /dev/null +++ b/tests/cassettes/TestLiteLLMModel.test_run_prompt[without-router].yaml @@ -0,0 +1,501 @@ +interactions: + - request: + body: + '{"messages": [{"role": "user", "content": "The duck says"}], "model": "gpt-4o-mini", + "max_tokens": 56, "stream": true, "stream_options": {"include_usage": true}, + "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "179" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: + 'data: 
{"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"The"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + duck"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + says"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + \""},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"qu"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"ack"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"!\""},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + Ducks"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + are"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + known"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + for"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: 
{"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + their"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + distinctive"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + qu"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"acking"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + sound"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"."},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + Is"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + there"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + something"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + specific"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + you"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + would"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: 
{"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + like"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + to"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + know"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + about"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + ducks"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + or"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + their"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + sounds"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"?"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}],"usage":null} + + + data: {"id":"chatcmpl-AXaq3IR0z8uEvoVJE0vyqiQTgtOWX","object":"chat.completion.chunk","created":1732569795,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[],"usage":{"prompt_tokens":10,"completion_tokens":32,"total_tokens":42,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}} + + + data: [DONE] + + + ' + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e84b2e55beacf0a-SJC + Connection: + - keep-alive + Content-Type: + - text/event-stream; charset=utf-8 + Date: + - Mon, 25 Nov 2024 21:23:15 GMT + Server: + - 
cloudflare + Set-Cookie: + - __cf_bm=yEIUr9nquP_ccNfJrvwHg8LrvAJbcRrseCICyMhmTOU-1732569795-1.0.1.1-wJWO00pGrQLCiAnCpi3CgxNTF6QY1KT8.LAbPFNGzHYuumm_kDTw6l5BnZz4DkH0_XPdFgVv2jgZmWma2GNF2Q; + path=/; expires=Mon, 25-Nov-24 21:53:15 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=JXYFaWG3HJZsfGDbb62wzCtTr6fxOAAubmPRa36h4Wk-1732569795729-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "212" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999938" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_2e462744dab3bbcaa3bacabf4a8941f4 + status: + code: 200 + message: OK + - request: + body: + '{"messages": [{"role": "user", "content": "The duck says"}], "model": "gpt-4o-mini", + "max_tokens": 56, "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "120" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4xSTYvbMBC9+1dMddlLvDjZTZ3kUhbaQrq9FFpI6RajyGNbjaxRNDLbsOS/L3I+ + nNAWevHhvXnP783oJQEQuhQLEKqRQbXOpA8ruZ1+284y+vxl9V7Wqw+fvqtJ9dE8Pj4sxSgqaP0L + VTipbhW1zmDQZA+08igDRtdxfjeZvp3n87wnWirRRFntQnpPaautTifZ5D7N8nQ8O6ob0gpZLOBH + AgDw0n9jTlvib7GAbHRCWmSWNYrFeQhAeDIREZJZc5A2iNFAKrIBbR/9a4NQdmoDLHcMT2LbSbV5 + 8yRgGW4YJCgTHRQwdbYEyUxKx1rwrEPTK/kWlgyhQY/A1GJotK2BHSpdaQU76m5KMHqDEAg2lp5B + rqkLBy2Qj1LtDz/gd5cxPVYdy7gq2xlzxPfn3oZq52nNR/6MV9pqbgqPksnGjhzIiZ7dJwA/+/12 + VysTzlPrQhFogzYajo/rFcNVB3IyP5KBgjQDfnfCr9yKEoPUhi/uI5RUDZaDcjim7EpNF0Ry0fnP + MH/zPvTWtv4f+4FQCl3AsnAeS62uCw9jHuOb/9fYecd9YME7DtgWlbY1euf14cVVrsjybLquZrnK + RLJPXgEAAP//AwCKfB1wfwMAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e84b2ef7f0dcf0a-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Mon, 25 Nov 2024 21:23:17 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "692" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999938" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_b7fc03e0f47f932ab987eb23b736f3f1 + status: + code: 200 + message: OK + - request: + body: + 
'{"messages": [{"role": "user", "content": "The duck says"}], "model": "gpt-4o-mini", + "max_tokens": 56, "stream": true, "stream_options": {"include_usage": true}, + "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "179" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.46.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.46.1 + x-stainless-raw-response: + - "true" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: + 'data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"The"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + duck"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + says"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + \""},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"qu"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"ack"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"!\""},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + What"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: 
{"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + else"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + would"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + you"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + like"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + to"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + know"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + about"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + ducks"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + or"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + their"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":" + sounds"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{"content":"?"},"logprobs":null,"finish_reason":null}],"usage":null} + + + data: 
{"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}],"usage":null} + + + data: {"id":"chatcmpl-AXaq6Oo60rPu2e3GjHLbqHmfCibSr","object":"chat.completion.chunk","created":1732569798,"model":"gpt-4o-mini-2024-07-18","system_fingerprint":"fp_0705bf87c0","choices":[],"usage":{"prompt_tokens":10,"completion_tokens":20,"total_tokens":30,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}} + + + data: [DONE] + + + ' + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8e84b2f52d9ccf0a-SJC + Connection: + - keep-alive + Content-Type: + - text/event-stream; charset=utf-8 + Date: + - Mon, 25 Nov 2024 21:23:18 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "106" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999938" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_f253a75b18af17320fdaa40a9449ee60 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..b0f5177 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import logging +import shutil +from collections.abc import Iterator +from pathlib import Path +from typing import Any + +# from unittest.mock import patch +import pytest +from dotenv import load_dotenv + +TESTS_DIR = Path(__file__).parent +CASSETTES_DIR = TESTS_DIR / "cassettes" + + +@pytest.fixture(autouse=True, scope="session") +def _load_env() -> None: + load_dotenv() + + +OPENAI_API_KEY_HEADER = "authorization" +ANTHROPIC_API_KEY_HEADER = "x-api-key" +CROSSREF_HEADER_KEY = "Crossref-Plus-API-Token" +SEMANTIC_SCHOLAR_HEADER_KEY = "x-api-key" +# SEE: https://github.com/kevin1024/vcrpy/blob/v6.0.1/vcr/config.py#L43 +VCR_DEFAULT_MATCH_ON = "method", "scheme", "host", "port", "path", "query" + + +@pytest.fixture(scope="session", name="vcr_config") +def fixture_vcr_config() -> dict[str, Any]: + return { + "filter_headers": [ + CROSSREF_HEADER_KEY, + SEMANTIC_SCHOLAR_HEADER_KEY, + OPENAI_API_KEY_HEADER, + ANTHROPIC_API_KEY_HEADER, + "cookie", + ], + "record_mode": "once", + "allow_playback_repeats": True, + "cassette_library_dir": str(CASSETTES_DIR), + } + + +@pytest.fixture +def tmp_path_cleanup(tmp_path: Path) -> Iterator[Path]: + yield tmp_path + # Cleanup after the test + if tmp_path.exists(): + shutil.rmtree(tmp_path, ignore_errors=True) + + +@pytest.fixture(scope="session", name="stub_data_dir") +def fixture_stub_data_dir() -> Path: + return Path(__file__).parent / "stub_data" + + +@pytest.fixture(name="reset_log_levels") +def fixture_reset_log_levels(caplog) -> Iterator[None]: + logging.getLogger().setLevel(logging.DEBUG) + + for name in logging.root.manager.loggerDict: + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + logger.propagate 
= True + + caplog.set_level(logging.DEBUG) + + yield + + for name in logging.root.manager.loggerDict: + logger = logging.getLogger(name) + logger.setLevel(logging.NOTSET) + logger.propagate = True diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py new file mode 100644 index 0000000..c91cc5a --- /dev/null +++ b/tests/test_embeddings.py @@ -0,0 +1,67 @@ +import pytest + +from llmclient.embeddings import MODEL_COST_MAP, LiteLLMEmbeddingModel + + +class TestLiteLLMEmbeddingModel: + @pytest.fixture + def embedding_model(self): + return LiteLLMEmbeddingModel() + + def test_default_config_injection(self, embedding_model): + # field_validator is only triggered if the attribute is passed + embedding_model = LiteLLMEmbeddingModel(config={}) + + config = embedding_model.config + assert "kwargs" in config + assert config["kwargs"]["timeout"] == 120 + + def test_truncate_if_large_no_truncation(self, embedding_model): + texts = ["short text", "another short text"] + truncated_texts = embedding_model._truncate_if_large(texts) + assert truncated_texts == texts + + def test_truncate_if_large_with_truncation(self, embedding_model, mocker): + texts = ["a" * 10000, "b" * 10000] + mocker.patch.dict( + MODEL_COST_MAP, {embedding_model.name: {"max_input_tokens": 100}} + ) + mocker.patch( + "tiktoken.encoding_for_model", + return_value=mocker.Mock( + encode_ordinary_batch=lambda texts: [[1] * 1000 for _ in texts], + decode=lambda text: "truncated text", # noqa: ARG005 + ), + ) + truncated_texts = embedding_model._truncate_if_large(texts) + assert truncated_texts == ["truncated text", "truncated text"] + + def test_truncate_if_large_key_error(self, embedding_model, mocker): + texts = ["a" * 10000, "b" * 10000] + mocker.patch.dict( + MODEL_COST_MAP, {embedding_model.name: {"max_input_tokens": 100}} + ) + mocker.patch("tiktoken.encoding_for_model", side_effect=KeyError) + truncated_texts = embedding_model._truncate_if_large(texts) + assert truncated_texts == ["a" * 300, "b" * 300] + + @pytest.mark.asyncio + async def test_embed_documents(self, embedding_model, mocker): + texts = ["short text", "another short text"] + mocker.patch( + "llmclient.embeddings.LiteLLMEmbeddingModel._truncate_if_large", + return_value=texts, + ) + mocker.patch( + "llmclient.embeddings.LiteLLMEmbeddingModel.check_rate_limit", + return_value=None, + ) + mock_response = mocker.Mock() + mock_response.data = [ + {"embedding": [0.1, 0.2, 0.3]}, + {"embedding": [0.4, 0.5, 0.6]}, + ] + mocker.patch("litellm.aembedding", return_value=mock_response) + + embeddings = await embedding_model.embed_documents(texts) + assert embeddings == [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] diff --git a/tests/test_llms.py b/tests/test_llms.py new file mode 100644 index 0000000..7ad7358 --- /dev/null +++ b/tests/test_llms.py @@ -0,0 +1,261 @@ +import pathlib +import pickle +from typing import Any +from unittest.mock import patch + +import litellm +import pytest + +from llmclient.embeddings import ( + HybridEmbeddingModel, + LiteLLMEmbeddingModel, + SentenceTransformerEmbeddingModel, + SparseEmbeddingModel, + embedding_model_factory, +) +from llmclient.llms import Chunk, LiteLLMModel +from tests.conftest import VCR_DEFAULT_MATCH_ON + + +class TestLiteLLMModel: + @pytest.mark.vcr(match_on=[*VCR_DEFAULT_MATCH_ON, "body"]) + @pytest.mark.parametrize( + "config", + [ + pytest.param( + { + "model_list": [ + { + "model_name": "gpt-4o-mini", + "litellm_params": { + "model": "gpt-4o-mini", + "temperature": 0, + "max_tokens": 56, + }, + } + ] + }, + id="with-router", + 
), + pytest.param( + { + "pass_through_router": True, + "router_kwargs": {"temperature": 0, "max_tokens": 56}, + }, + id="without-router", + ), + ], + ) + @pytest.mark.asyncio + async def test_run_prompt(self, config: dict[str, Any]) -> None: + llm = LiteLLMModel(name="gpt-4o-mini", config=config) + + outputs = [] + + def accum(x) -> None: + outputs.append(x) + + completion = await llm.run_prompt( + prompt="The {animal} says", + data={"animal": "duck"}, + system_prompt=None, + callbacks=[accum], + ) + assert completion.model == "gpt-4o-mini" + assert completion.seconds_to_first_token > 0 + assert completion.prompt_count > 0 + assert completion.completion_count > 0 + assert str(completion) == "".join(outputs) + assert completion.cost > 0 + + completion = await llm.run_prompt( + prompt="The {animal} says", + data={"animal": "duck"}, + system_prompt=None, + ) + assert completion.seconds_to_first_token == 0 + assert completion.seconds_to_last_token > 0 + assert completion.cost > 0 + + # check with mixed callbacks + async def ac(x) -> None: + pass + + completion = await llm.run_prompt( + prompt="The {animal} says", + data={"animal": "duck"}, + system_prompt=None, + callbacks=[accum, ac], + ) + assert completion.cost > 0 + + @pytest.mark.vcr + @pytest.mark.parametrize( + ("config", "bypassed_router"), + [ + pytest.param( + { + "model_list": [ + { + "model_name": "gpt-4o-mini", + "litellm_params": {"model": "gpt-4o-mini", "max_tokens": 3}, + } + ] + }, + False, + id="with-router", + ), + pytest.param( + {"pass_through_router": True, "router_kwargs": {"max_tokens": 3}}, + True, + id="without-router", + ), + ], + ) + @pytest.mark.asyncio + async def test_max_token_truncation( + self, config: dict[str, Any], bypassed_router: bool + ) -> None: + llm = LiteLLMModel(name="gpt-4o-mini", config=config) + with patch( + "litellm.Router.atext_completion", + side_effect=litellm.Router.atext_completion, + autospec=True, + ) as mock_atext_completion: + chunk = await llm.acomplete("Please tell me a story") # type: ignore[call-arg] + if bypassed_router: + mock_atext_completion.assert_not_awaited() + else: + mock_atext_completion.assert_awaited_once() + assert isinstance(chunk, Chunk) + assert chunk.completion_tokens == 3 + assert chunk.text + assert len(chunk.text) < 20 + + def test_pickling(self, tmp_path: pathlib.Path) -> None: + pickle_path = tmp_path / "llm_model.pickle" + llm = LiteLLMModel( + name="gpt-4o-mini", + config={ + "model_list": [ + { + "model_name": "gpt-4o-mini", + "litellm_params": { + "model": "gpt-4o-mini", + "temperature": 0, + "max_tokens": 56, + }, + } + ] + }, + ) + with pickle_path.open("wb") as f: + pickle.dump(llm, f) + with pickle_path.open("rb") as f: + rehydrated_llm = pickle.load(f) + assert llm.name == rehydrated_llm.name + assert llm.config == rehydrated_llm.config + assert llm.router.deployment_names == rehydrated_llm.router.deployment_names + + +@pytest.mark.asyncio +async def test_embedding_model_factory_sentence_transformer() -> None: + """Test that the factory creates a SentenceTransformerEmbeddingModel when given an 'st-' prefix.""" + embedding = "st-multi-qa-MiniLM-L6-cos-v1" + model = embedding_model_factory(embedding) + assert isinstance( + model, SentenceTransformerEmbeddingModel + ), "Factory did not create SentenceTransformerEmbeddingModel" + assert model.name == "multi-qa-MiniLM-L6-cos-v1", "Incorrect model name assigned" + + # Test embedding functionality + texts = ["Hello world", "Test sentence"] + embeddings = await model.embed_documents(texts) + assert 
len(embeddings) == 2, "Incorrect number of embeddings returned" + assert all( + isinstance(embed, list) for embed in embeddings + ), "Embeddings are not in list format" + assert all(len(embed) > 0 for embed in embeddings), "Embeddings should not be empty" + + +@pytest.mark.asyncio +async def test_embedding_model_factory_hybrid_with_sentence_transformer() -> None: + """Test that the factory creates a HybridEmbeddingModel containing a SentenceTransformerEmbeddingModel.""" + embedding = "hybrid-st-multi-qa-MiniLM-L6-cos-v1" + model = embedding_model_factory(embedding) + assert isinstance( + model, HybridEmbeddingModel + ), "Factory did not create HybridEmbeddingModel" + assert len(model.models) == 2, "Hybrid model should contain two component models" + assert isinstance( + model.models[0], SentenceTransformerEmbeddingModel + ), "First component should be SentenceTransformerEmbeddingModel" + assert isinstance( + model.models[1], SparseEmbeddingModel + ), "Second component should be SparseEmbeddingModel" + + # Test embedding functionality + texts = ["Hello world", "Test sentence"] + embeddings = await model.embed_documents(texts) + assert len(embeddings) == 2, "Incorrect number of embeddings returned" + expected_length = len((await model.models[0].embed_documents(texts))[0]) + len( + (await model.models[1].embed_documents(texts))[0] + ) + assert all( + len(embed) == expected_length for embed in embeddings + ), "Embeddings do not match expected combined length" + + +@pytest.mark.asyncio +async def test_embedding_model_factory_invalid_st_prefix() -> None: + """Test that the factory raises a ValueError when 'st-' prefix is provided without a model name.""" + embedding = "st-" + with pytest.raises( + ValueError, + match="SentenceTransformer model name must be specified after 'st-'.", + ): + embedding_model_factory(embedding) + + +@pytest.mark.asyncio +async def test_embedding_model_factory_unknown_prefix() -> None: + """Test that the factory defaults to LiteLLMEmbeddingModel when an unknown prefix is provided.""" + embedding = "unknown-prefix-model" + model = embedding_model_factory(embedding) + assert isinstance( + model, LiteLLMEmbeddingModel + ), "Factory did not default to LiteLLMEmbeddingModel for unknown prefix" + assert model.name == "unknown-prefix-model", "Incorrect model name assigned" + + +@pytest.mark.asyncio +async def test_embedding_model_factory_sparse() -> None: + """Test that the factory creates a SparseEmbeddingModel when 'sparse' is provided.""" + embedding = "sparse" + model = embedding_model_factory(embedding) + assert isinstance( + model, SparseEmbeddingModel + ), "Factory did not create SparseEmbeddingModel" + assert model.name == "sparse", "Incorrect model name assigned" + + +@pytest.mark.asyncio +async def test_embedding_model_factory_litellm() -> None: + """Test that the factory creates a LiteLLMEmbeddingModel when 'litellm-' prefix is provided.""" + embedding = "litellm-text-embedding-3-small" + model = embedding_model_factory(embedding) + assert isinstance( + model, LiteLLMEmbeddingModel + ), "Factory did not create LiteLLMEmbeddingModel" + assert model.name == "text-embedding-3-small", "Incorrect model name assigned" + + +@pytest.mark.asyncio +async def test_embedding_model_factory_default() -> None: + """Test that the factory defaults to LiteLLMEmbeddingModel when no known prefix is provided.""" + embedding = "default-model" + model = embedding_model_factory(embedding) + assert isinstance( + model, LiteLLMEmbeddingModel + ), "Factory did not default to 
LiteLLMEmbeddingModel" + assert model.name == "default-model", "Incorrect model name assigned" diff --git a/tests/test_rate_limiter.py b/tests/test_rate_limiter.py new file mode 100644 index 0000000..4da6b9a --- /dev/null +++ b/tests/test_rate_limiter.py @@ -0,0 +1,297 @@ +import asyncio +import time +from itertools import product +from typing import Any + +import pytest +from limits import RateLimitItemPerSecond + +from llmclient.constants import CHARACTERS_PER_TOKEN_ASSUMPTION +from llmclient.embeddings import LiteLLMEmbeddingModel +from llmclient.llms import ( + Chunk, + LiteLLMModel, +) +from llmclient.types import LLMResult + +LLM_CONFIG_W_RATE_LIMITS = [ + # following ensures that "short-form" rate limits are also supported + # where the user doesn't specify the model_list + { + "name": "gpt-4o-mini", + "config": { + "rate_limit": {"gpt-4o-mini": RateLimitItemPerSecond(20, 3)}, + }, + }, + { + "name": "gpt-4o-mini", + "config": { + "model_list": [ + { + "model_name": "gpt-4o-mini", + "litellm_params": { + "model": "gpt-4o-mini", + "temperature": 0, + }, + } + ], + "rate_limit": {"gpt-4o-mini": RateLimitItemPerSecond(20, 1)}, + }, + }, + { + "name": "gpt-4o-mini", + "config": { + "model_list": [ + { + "model_name": "gpt-4o-mini", + "litellm_params": { + "model": "gpt-4o-mini", + "temperature": 0, + }, + } + ], + "rate_limit": {"gpt-4o-mini": RateLimitItemPerSecond(1_000_000, 1)}, + }, + }, + { + "name": "gpt-4o-mini", + "config": { + "model_list": [ + { + "model_name": "gpt-4o-mini", + "litellm_params": { + "model": "gpt-4o-mini", + "temperature": 0, + }, + } + ] + }, + }, +] + +RATE_LIMITER_PROMPT = "Animals make many noises. The duck says" + +LLM_METHOD_AND_INPUTS = [ + { + "method": "acomplete", + "kwargs": {"prompt": RATE_LIMITER_PROMPT}, + }, + { + "method": "acomplete_iter", + "kwargs": {"prompt": RATE_LIMITER_PROMPT}, + }, + { + "method": "achat", + "kwargs": {"messages": [{"role": "user", "content": RATE_LIMITER_PROMPT}]}, + }, + { + "method": "achat_iter", + "kwargs": {"messages": [{"role": "user", "content": RATE_LIMITER_PROMPT}]}, + }, +] + +rate_limit_configurations = list( + product(LLM_CONFIG_W_RATE_LIMITS, LLM_METHOD_AND_INPUTS) +) + +EMBEDDING_CONFIG_W_RATE_LIMITS = [ + {"config": {"rate_limit": RateLimitItemPerSecond(20, 5)}}, + {"config": {"rate_limit": RateLimitItemPerSecond(20, 3)}}, + {"config": {"rate_limit": RateLimitItemPerSecond(1_000_000, 1)}}, + {}, +] + +ACCEPTABLE_RATE_LIMIT_ERROR: float = 0.10 # 10% error margin for token estimate error + + +async def time_n_llm_methods( + llm: LiteLLMModel, method: str, n: int, use_gather: bool = False, *args, **kwargs +) -> float: + """Give the token per second rate of a method call.""" + start_time = time.time() + outputs = [] + + if not use_gather: + for _ in range(n): + if "iter" in method: + outputs.extend( + [ + output + async for output in await getattr(llm, method)(*args, **kwargs) + ] + ) + else: + outputs.append(await getattr(llm, method)(*args, **kwargs)) + + else: + outputs = await asyncio.gather( + *[getattr(llm, method)(*args, **kwargs) for _ in range(n)] + ) + + character_count = 0 + token_count = 0 + + if isinstance(outputs[0], Chunk | LLMResult): + character_count = sum(len(o.text or "") for o in outputs) + else: + character_count = sum(len(o) for o in outputs) + + if hasattr(outputs[0], "prompt_tokens"): + token_count = sum(o.prompt_tokens + o.completion_tokens for o in outputs) + + return ( + (character_count / CHARACTERS_PER_TOKEN_ASSUMPTION) + if token_count == 0 + else token_count + ) / (time.time() 
- start_time) + + +@pytest.mark.parametrize("llm_config_w_rate_limits", LLM_CONFIG_W_RATE_LIMITS) +@pytest.mark.asyncio +async def test_rate_limit_on_run_prompt( + llm_config_w_rate_limits: dict[str, Any], +) -> None: + + llm = LiteLLMModel(**llm_config_w_rate_limits) + + outputs = [] + + def accum(x) -> None: + outputs.append(x) + + estimated_tokens_per_second = await time_n_llm_methods( + llm, + "run_prompt", + 3, + prompt="The {animal} says", + data={"animal": "duck"}, + system_prompt=None, + callbacks=[accum], + ) + + if "rate_limit" in llm.config: + max_tokens_per_second = ( + llm.config["rate_limit"]["gpt-4o-mini"].amount + / llm.config["rate_limit"]["gpt-4o-mini"].multiples + ) + assert estimated_tokens_per_second / max_tokens_per_second < ( + 1.0 + ACCEPTABLE_RATE_LIMIT_ERROR + ) + else: + assert estimated_tokens_per_second > 0 + + outputs = [] + + def accum2(x) -> None: + outputs.append(x) + + estimated_tokens_per_second = await time_n_llm_methods( + llm, + "run_prompt", + 3, + use_gather=True, + prompt="The {animal} says", + data={"animal": "duck"}, + system_prompt=None, + callbacks=[accum2], + ) + + if "rate_limit" in llm.config: + max_tokens_per_second = ( + llm.config["rate_limit"]["gpt-4o-mini"].amount + / llm.config["rate_limit"]["gpt-4o-mini"].multiples + ) + assert estimated_tokens_per_second / max_tokens_per_second < ( + 1.0 + ACCEPTABLE_RATE_LIMIT_ERROR + ) + else: + assert estimated_tokens_per_second > 0 + + +@pytest.mark.parametrize( + ("llm_config_w_rate_limits", "llm_method_kwargs"), rate_limit_configurations +) +@pytest.mark.asyncio +async def test_rate_limit_on_sequential_completion_litellm_methods( + llm_config_w_rate_limits: dict[str, Any], + llm_method_kwargs: dict[str, Any], +) -> None: + + llm = LiteLLMModel(**llm_config_w_rate_limits) + + estimated_tokens_per_second = await time_n_llm_methods( + llm, + llm_method_kwargs["method"], + 3, + use_gather=False, + **llm_method_kwargs["kwargs"], + ) + if "rate_limit" in llm.config: + max_tokens_per_second = ( + llm.config["rate_limit"]["gpt-4o-mini"].amount + / llm.config["rate_limit"]["gpt-4o-mini"].multiples + ) + assert estimated_tokens_per_second / max_tokens_per_second < ( + 1.0 + ACCEPTABLE_RATE_LIMIT_ERROR + ) + else: + assert estimated_tokens_per_second > 0 + + +@pytest.mark.parametrize( + ("llm_config_w_rate_limits", "llm_method_kwargs"), rate_limit_configurations +) +@pytest.mark.asyncio +async def test_rate_limit_on_parallel_completion_litellm_methods( + llm_config_w_rate_limits: dict[str, Any], + llm_method_kwargs: dict[str, Any], +) -> None: + + llm = LiteLLMModel(**llm_config_w_rate_limits) + + if "iter" not in llm_method_kwargs["method"]: + estimated_tokens_per_second = await time_n_llm_methods( + llm, + llm_method_kwargs["method"], + 3, + use_gather=True, + **llm_method_kwargs["kwargs"], + ) + if "rate_limit" in llm.config: + max_tokens_per_second = ( + llm.config["rate_limit"]["gpt-4o-mini"].amount + / llm.config["rate_limit"]["gpt-4o-mini"].multiples + ) + assert estimated_tokens_per_second / max_tokens_per_second < ( + 1.0 + ACCEPTABLE_RATE_LIMIT_ERROR + ) + else: + assert estimated_tokens_per_second > 0 + + +@pytest.mark.parametrize( + "embedding_config_w_rate_limits", EMBEDDING_CONFIG_W_RATE_LIMITS +) +@pytest.mark.asyncio +async def test_embedding_rate_limits( + embedding_config_w_rate_limits: dict[str, Any], +) -> None: + + embedding_model = LiteLLMEmbeddingModel(**embedding_config_w_rate_limits) + texts_to_embed = ["the duck says"] * 10 + start = time.time() + await 
embedding_model.embed_documents(texts=texts_to_embed, batch_size=5) + estimated_tokens_per_second = sum( + len(t) / CHARACTERS_PER_TOKEN_ASSUMPTION for t in texts_to_embed + ) / (time.time() - start) + + if "rate_limit" in embedding_config_w_rate_limits: + max_tokens_per_second = ( + embedding_config_w_rate_limits["rate_limit"].amount + / embedding_config_w_rate_limits["rate_limit"].multiples + ) + assert estimated_tokens_per_second / max_tokens_per_second < ( + 1.0 + ACCEPTABLE_RATE_LIMIT_ERROR + ) + else: + assert estimated_tokens_per_second > 0 diff --git a/uv.lock b/uv.lock index 05939e8..b1dcc0f 100644 --- a/uv.lock +++ b/uv.lock @@ -158,6 +158,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl", hash = "sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24", size = 27764 }, ] +[[package]] +name = "async-timeout" +version = "4.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/87/d6/21b30a550dafea84b1b8eee21b5e23fa16d010ae006011221f33dcd8d7f8/async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f", size = 8345 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/fa/e01228c2938de91d47b307831c62ab9e4001e747789d0b05baf779a6488c/async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028", size = 5721 }, +] + [[package]] name = "attrs" version = "24.2.0" @@ -392,6 +401,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2b/1e/1e726ba66eddf21c940821df8cf1a7d15cb165f0682d62161eaa5e93dae1/contourpy-1.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:33c92cdae89ec5135d036e7218e69b0bb2851206077251f04a6c4e0e21f03927", size = 1314829 }, ] +[[package]] +name = "coredis" +version = "4.17.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "async-timeout" }, + { name = "deprecated" }, + { name = "packaging" }, + { name = "pympler" }, + { name = "typing-extensions" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/0c/0f2fb1cedd224666ef08e898447bb9cf4d1e98a86b03119f1c6513093ddc/coredis-4.17.0.tar.gz", hash = "sha256:04e9976e71a42004dfe19a862c648b4047bf813e15184cddfd3cb37eb704b83f", size = 243157 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/2c/2335e476f0c0b33eea53c307169bcafe9c19a4b277738258eb80354ee90c/coredis-4.17.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f3050806b4854a6624e3c2efa013b540265d88e766f815963d447c116240d75d", size = 330690 }, + { url = "https://files.pythonhosted.org/packages/6a/b1/3c24a708b24f8e2566b1b91b64b4dc75f74633b875def19f2ac0fa03a0a0/coredis-4.17.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5f0f1044bdafc93f421e59e711da762c6c741ab76df0c12a42c447c1db1fcd75", size = 328051 }, + { url = "https://files.pythonhosted.org/packages/0f/a6/e5a8add1ae7b31240248528f669127e5fd347c69625a9b423965a5902302/coredis-4.17.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1befa7db121978fd0995151af5d15ce5e37a14847797c3fbd9403882f21b48c", size = 352651 }, + { url = "https://files.pythonhosted.org/packages/b8/d1/0ece1b888547ec26f4d33be30513cd44c77df25c9f943e7d3c20b49cc634/coredis-4.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:52583dcef671c8d3a1cbecbf81cd630b1a72f946cf46601016c4f85d3f12a4a1", size = 355472 }, + { url = "https://files.pythonhosted.org/packages/00/c2/771bafa43c37d8c968804b6bb34063eb631b5d2377db31bca6d784131f48/coredis-4.17.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:845f5c0bb7012609a1f41f8308e5166c01f162599af33cb001bd2b0d6a4386f5", size = 358740 }, + { url = "https://files.pythonhosted.org/packages/fb/d3/90846efc003d692c46f2988ddaffaac47f2c95f378102dad490e911de157/coredis-4.17.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e3638c9a894ac7d0a04fa14515f24d0f717c431266ee0ac612ddb3a142862258", size = 330509 }, + { url = "https://files.pythonhosted.org/packages/4c/2d/1f97441d377b457831bd9327dbdaa29888effa2edf6318cb4138a425538f/coredis-4.17.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73cb260bf96eacb4e455c300b5e41382bc52d9a2125f3f7e55657662a627e0cb", size = 327735 }, + { url = "https://files.pythonhosted.org/packages/3a/3f/1dcd57f6df67b7a20b1c27abcf768cf6789be5f33d173739f482d672e9d1/coredis-4.17.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9421423bb109eb62b7595e1d0c84d8c9399bf160826ee478b6b7771bf6ad831e", size = 353755 }, + { url = "https://files.pythonhosted.org/packages/38/24/de68bdd4b3549a8a05674f0952e646d45afd15453543e0e679dc6899174c/coredis-4.17.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74abdeda89ff5ea40d0da771d2871148b64b2f1c758f11485397adc1928b08e", size = 357309 }, + { url = "https://files.pythonhosted.org/packages/ab/66/2bd9f9e1c10b307caf8f4e77527c620a0320291aa83a9e0e98e8df5a326c/coredis-4.17.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0ddad826c5bc91f05e5fe36435086cdbe51019b2f4f0faf96d40250823548fee", size = 360856 }, + { url = "https://files.pythonhosted.org/packages/08/1c/7249845c0f6105290d70d90c9ad48b550f5bcb989766819d38aa0f784aec/coredis-4.17.0-py3-none-any.whl", hash = "sha256:a8254fcc746efd72990d565d87e5399646ad737b7a61d86ef129df846e86b0d3", size = 239667 }, +] + [[package]] name = "cycler" version = "0.12.1" @@ -410,6 +446,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 }, ] +[[package]] +name = "deprecated" +version = "1.2.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2e/a3/53e7d78a6850ffdd394d7048a31a6f14e44900adedf190f9a165f6b69439/deprecated-1.2.15.tar.gz", hash = "sha256:683e561a90de76239796e6b6feac66b99030d2dd3fcf61ef996330f14bbb9b0d", size = 2977612 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1d/8f/c7f227eb42cfeaddce3eb0c96c60cbca37797fa7b34f8e1aeadf6c5c0983/Deprecated-1.2.15-py2.py3-none-any.whl", hash = "sha256:353bc4a8ac4bfc96800ddab349d89c25dec1079f65fd53acdcc1e0b975b21320", size = 9941 }, +] + [[package]] name = "dicttoxml" version = "1.7.16" @@ -535,6 +583,195 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/f6/af0d1f58f86002be0cf1e2665cdd6f7a4a71cdc8a7a9438cdc9e3b5375fe/fastapi-0.115.4-py3-none-any.whl", hash = "sha256:0b504a063ffb3cf96a5e27dc1bc32c80ca743a2528574f9cdc77daa2d31b4742", size = 94732 }, ] +[[package]] +name = "fh-llm-client" +version = "0.1.dev37+g63f57b5.d20241126" +source = { 
editable = "." } +dependencies = [ + { name = "aiofiles" }, + { name = "coredis" }, + { name = "dm-tree" }, + { name = "fhaviary" }, + { name = "httpx" }, + { name = "limits" }, + { name = "litellm" }, + { name = "networkx", extra = ["default"] }, + { name = "numpy" }, + { name = "pydantic" }, + { name = "pymupdf" }, + { name = "tenacity" }, + { name = "tiktoken" }, + { name = "tqdm" }, + { name = "typing-extensions", marker = "python_full_version < '3.12'" }, + { name = "usearch" }, +] + +[package.optional-dependencies] +dev = [ + { name = "fastapi" }, + { name = "fhaviary", extra = ["xml"] }, + { name = "ipython" }, + { name = "mypy" }, + { name = "pre-commit" }, + { name = "pydantic" }, + { name = "pydot" }, + { name = "pylint-pydantic" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "pytest-mock" }, + { name = "pytest-recording" }, + { name = "pytest-rerunfailures" }, + { name = "pytest-subtests" }, + { name = "pytest-sugar" }, + { name = "pytest-timer", extra = ["colorama"] }, + { name = "pytest-xdist" }, + { name = "python-dotenv" }, + { name = "refurb" }, + { name = "rich" }, + { name = "sentence-transformers" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "types-aiofiles" }, + { name = "types-tqdm" }, + { name = "wandb" }, +] +local = [ + { name = "sentence-transformers" }, +] +monitor = [ + { name = "wandb" }, +] +nn = [ + { name = "torch" }, +] +rich = [ + { name = "rich" }, + { name = "tqdm" }, +] +server = [ + { name = "fastapi" }, +] +typing = [ + { name = "types-aiofiles" }, + { name = "types-tqdm" }, +] +visualization = [ + { name = "pydot" }, +] + +[package.dev-dependencies] +codeflash = [ + { name = "codeflash" }, + { name = "fastapi" }, + { name = "fhaviary", extra = ["xml"] }, + { name = "ipython" }, + { name = "mypy" }, + { name = "pre-commit" }, + { name = "pydantic" }, + { name = "pydot" }, + { name = "pylint-pydantic" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "pytest-mock" }, + { name = "pytest-recording" }, + { name = "pytest-rerunfailures" }, + { name = "pytest-subtests" }, + { name = "pytest-sugar" }, + { name = "pytest-timer", extra = ["colorama"] }, + { name = "pytest-xdist" }, + { name = "python-dotenv" }, + { name = "refurb" }, + { name = "rich" }, + { name = "sentence-transformers" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "types-aiofiles" }, + { name = "types-tqdm" }, + { name = "wandb" }, +] +dev = [ + { name = "fastapi" }, + { name = "fhaviary", extra = ["xml"] }, + { name = "ipython" }, + { name = "mypy" }, + { name = "pre-commit" }, + { name = "pydantic" }, + { name = "pydot" }, + { name = "pylint-pydantic" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "pytest-mock" }, + { name = "pytest-recording" }, + { name = "pytest-rerunfailures" }, + { name = "pytest-subtests" }, + { name = "pytest-sugar" }, + { name = "pytest-timer", extra = ["colorama"] }, + { name = "pytest-xdist" }, + { name = "python-dotenv" }, + { name = "refurb" }, + { name = "rich" }, + { name = "sentence-transformers" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "types-aiofiles" }, + { name = "types-tqdm" }, + { name = "wandb" }, +] + +[package.metadata] +requires-dist = [ + { name = "aiofiles" }, + { name = "coredis" }, + { name = "dm-tree" }, + { name = "fastapi", marker = "extra == 'server'", specifier = ">=0.109" }, + { name = "fh-llm-client", extras = ["local", "monitor", "nn", "rich", "server", "typing", "visualization"], marker = "extra == 'dev'" }, + { name = "fhaviary", 
specifier = ">=0.8.2" }, + { name = "fhaviary", extras = ["xml"], marker = "extra == 'dev'" }, + { name = "httpx" }, + { name = "ipython", marker = "extra == 'dev'", specifier = ">=8" }, + { name = "limits" }, + { name = "litellm", specifier = ">=1.44" }, + { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.8" }, + { name = "networkx", extras = ["default"], specifier = "~=3.4" }, + { name = "numpy" }, + { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=3.4" }, + { name = "pydantic", specifier = "~=2.0" }, + { name = "pydantic", marker = "extra == 'dev'", specifier = "~=2.0" }, + { name = "pydot", marker = "extra == 'visualization'", specifier = ">=3.0.1" }, + { name = "pylint-pydantic", marker = "extra == 'dev'" }, + { name = "pymupdf", specifier = ">=1.24.12" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=8" }, + { name = "pytest-asyncio", marker = "extra == 'dev'" }, + { name = "pytest-mock", marker = "extra == 'dev'" }, + { name = "pytest-recording", marker = "extra == 'dev'" }, + { name = "pytest-rerunfailures", marker = "extra == 'dev'" }, + { name = "pytest-subtests", marker = "extra == 'dev'" }, + { name = "pytest-sugar", marker = "extra == 'dev'" }, + { name = "pytest-timer", extras = ["colorama"], marker = "extra == 'dev'" }, + { name = "pytest-xdist", marker = "extra == 'dev'" }, + { name = "python-dotenv", marker = "extra == 'dev'" }, + { name = "refurb", marker = "extra == 'dev'", specifier = ">=2" }, + { name = "rich", marker = "extra == 'rich'" }, + { name = "sentence-transformers", marker = "extra == 'local'" }, + { name = "tenacity" }, + { name = "tiktoken", specifier = ">=0.4.0" }, + { name = "torch", marker = "extra == 'nn'", specifier = ">=2.2" }, + { name = "tqdm" }, + { name = "tqdm", marker = "extra == 'rich'", specifier = ">=4.56" }, + { name = "types-aiofiles", marker = "extra == 'typing'" }, + { name = "types-tqdm", marker = "extra == 'typing'" }, + { name = "typing-extensions", marker = "python_full_version < '3.12'" }, + { name = "usearch", specifier = ">=2.13" }, + { name = "wandb", marker = "extra == 'monitor'" }, +] + +[package.metadata.requires-dev] +codeflash = [ + { name = "codeflash", specifier = ">=0.7" }, + { name = "fh-llm-client", extras = ["dev"] }, +] +dev = [{ name = "fh-llm-client", extras = ["dev"] }] + [[package]] name = "fhaviary" version = "0.10.0" @@ -776,6 +1013,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/d9/a1e041c5e7caa9a05c925f4bdbdfb7f006d1f74996af53467bc394c97be7/importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b", size = 26514 }, ] +[[package]] +name = "importlib-resources" +version = "6.4.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/98/be/f3e8c6081b684f176b761e6a2fef02a0be939740ed6f54109a2951d806f3/importlib_resources-6.4.5.tar.gz", hash = "sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065", size = 43372 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/6a/4604f9ae2fa62ef47b9de2fa5ad599589d28c9fd1d335f32759813dfa91e/importlib_resources-6.4.5-py3-none-any.whl", hash = "sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717", size = 36115 }, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -909,6 +1155,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ca/96/58b3d260e212add0087563672931b1176e70bef1225839a4470ec66157a5/jiter-0.7.0-cp313-none-win_amd64.whl", hash = "sha256:7417c2b928062c496f381fb0cb50412eee5ad1d8b53dbc0e011ce45bb2de522c", size = 199305 }, ] +[[package]] +name = "joblib" +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/64/33/60135848598c076ce4b231e1b1895170f45fbcaeaa2c9d5e38b04db70c35/joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e", size = 2116621 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/29/df4b9b42f2be0b623cbd5e2140cafcaa2bef0759a00b7b70104dcfe2fb51/joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6", size = 301817 }, +] + [[package]] name = "jsonschema" version = "4.23.0" @@ -1030,6 +1285,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/9f/5b5481d716670ed5fbd8d06dfa94b7108272b645da2f2406eb909cb6a450/libcst-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:4d6acb0bdee1e55b44c6215c59755ec4693ac01e74bb1fde04c37358b378835d", size = 2029600 }, ] +[[package]] +name = "limits" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "importlib-resources" }, + { name = "packaging" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/5f/89fb5405ee37d8b172e48e357438dd79482731b0cd5db2f734ac58f019e4/limits-3.13.0.tar.gz", hash = "sha256:6571b0c567bfa175a35fed9f8a954c0c92f1c3200804282f1b8f1de4ad98a953", size = 70218 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/80/b340bc7c3eb8f5c40e4d38c8e3cd04c127756d8de06b9e54caefb4ae16d5/limits-3.13.0-py3-none-any.whl", hash = "sha256:9767f7233da4255e9904b79908a728e8ec0984c0b086058b4cbbd309aea553f6", size = 45547 }, +] + [[package]] name = "litellm" version = "1.48.10" @@ -1052,192 +1322,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/5b/b6eb2098ed289f99abb55ab966b4f318a467294c218ad846e96ba72949b0/litellm-1.48.10-py3-none-any.whl", hash = "sha256:752efd59747a0895f4695d025c66f0b2258d80a61175f7cfa41dbe4894ef95e1", size = 6238318 }, ] -[[package]] -name = "llm-client" -version = "0.1.dev22+g7d88740.d20241107" -source = { editable = "." 
} -dependencies = [ - { name = "aiofiles" }, - { name = "dm-tree" }, - { name = "fhaviary" }, - { name = "httpx" }, - { name = "litellm" }, - { name = "networkx", extra = ["default"] }, - { name = "numpy" }, - { name = "openai" }, - { name = "pydantic" }, - { name = "tenacity" }, - { name = "tiktoken" }, - { name = "tqdm" }, - { name = "typing-extensions", marker = "python_full_version < '3.12'" }, - { name = "usearch" }, -] - -[package.optional-dependencies] -dev = [ - { name = "fastapi" }, - { name = "fhaviary", extra = ["xml"] }, - { name = "ipython" }, - { name = "litellm" }, - { name = "mypy" }, - { name = "openai" }, - { name = "pre-commit" }, - { name = "pydantic" }, - { name = "pydot" }, - { name = "pylint" }, - { name = "pylint-pydantic" }, - { name = "pytest" }, - { name = "pytest-asyncio" }, - { name = "pytest-mock" }, - { name = "pytest-recording" }, - { name = "pytest-rerunfailures" }, - { name = "pytest-subtests" }, - { name = "pytest-sugar" }, - { name = "pytest-timer", extra = ["colorama"] }, - { name = "pytest-xdist" }, - { name = "refurb" }, - { name = "rich" }, - { name = "torch" }, - { name = "tqdm" }, - { name = "types-aiofiles" }, - { name = "types-tqdm" }, - { name = "wandb" }, -] -monitor = [ - { name = "wandb" }, -] -nn = [ - { name = "torch" }, -] -rich = [ - { name = "rich" }, - { name = "tqdm" }, -] -server = [ - { name = "fastapi" }, -] -typing = [ - { name = "types-aiofiles" }, - { name = "types-tqdm" }, -] -visualization = [ - { name = "pydot" }, -] - -[package.dev-dependencies] -codeflash = [ - { name = "codeflash" }, - { name = "fastapi" }, - { name = "fhaviary", extra = ["xml"] }, - { name = "ipython" }, - { name = "litellm" }, - { name = "mypy" }, - { name = "openai" }, - { name = "pre-commit" }, - { name = "pydantic" }, - { name = "pydot" }, - { name = "pylint" }, - { name = "pylint-pydantic" }, - { name = "pytest" }, - { name = "pytest-asyncio" }, - { name = "pytest-mock" }, - { name = "pytest-recording" }, - { name = "pytest-rerunfailures" }, - { name = "pytest-subtests" }, - { name = "pytest-sugar" }, - { name = "pytest-timer", extra = ["colorama"] }, - { name = "pytest-xdist" }, - { name = "refurb" }, - { name = "rich" }, - { name = "torch" }, - { name = "tqdm" }, - { name = "types-aiofiles" }, - { name = "types-tqdm" }, - { name = "wandb" }, -] -dev = [ - { name = "fastapi" }, - { name = "fhaviary", extra = ["xml"] }, - { name = "ipython" }, - { name = "litellm" }, - { name = "mypy" }, - { name = "openai" }, - { name = "pre-commit" }, - { name = "pydantic" }, - { name = "pydot" }, - { name = "pylint" }, - { name = "pylint-pydantic" }, - { name = "pytest" }, - { name = "pytest-asyncio" }, - { name = "pytest-mock" }, - { name = "pytest-recording" }, - { name = "pytest-rerunfailures" }, - { name = "pytest-subtests" }, - { name = "pytest-sugar" }, - { name = "pytest-timer", extra = ["colorama"] }, - { name = "pytest-xdist" }, - { name = "refurb" }, - { name = "rich" }, - { name = "torch" }, - { name = "tqdm" }, - { name = "types-aiofiles" }, - { name = "types-tqdm" }, - { name = "wandb" }, -] - -[package.metadata] -requires-dist = [ - { name = "aiofiles" }, - { name = "dm-tree" }, - { name = "fastapi", marker = "extra == 'server'", specifier = ">=0.109" }, - { name = "fhaviary", specifier = ">=0.8.2" }, - { name = "fhaviary", extras = ["xml"], marker = "extra == 'dev'" }, - { name = "httpx" }, - { name = "ipython", marker = "extra == 'dev'", specifier = ">=8" }, - { name = "litellm", specifier = ">=1.40.15" }, - { name = "litellm", marker = "extra == 
'dev'", specifier = "!=1.49.4,!=1.49.5,!=1.49.6" }, - { name = "llm-client", extras = ["monitor", "nn", "rich", "server", "typing", "visualization"], marker = "extra == 'dev'" }, - { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.8" }, - { name = "networkx", extras = ["default"], specifier = "~=3.4" }, - { name = "numpy", specifier = ">=1.20" }, - { name = "openai", specifier = ">=1" }, - { name = "openai", marker = "extra == 'dev'", specifier = "<1.47" }, - { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=3.4" }, - { name = "pydantic", specifier = "~=2.0" }, - { name = "pydantic", marker = "extra == 'dev'", specifier = "~=2.9" }, - { name = "pydot", marker = "extra == 'visualization'", specifier = ">=3.0.1" }, - { name = "pylint", marker = "extra == 'dev'", specifier = ">=3.2" }, - { name = "pylint-pydantic", marker = "extra == 'dev'" }, - { name = "pytest", marker = "extra == 'dev'", specifier = ">=8" }, - { name = "pytest-asyncio", marker = "extra == 'dev'" }, - { name = "pytest-mock", marker = "extra == 'dev'" }, - { name = "pytest-recording", marker = "extra == 'dev'" }, - { name = "pytest-rerunfailures", marker = "extra == 'dev'" }, - { name = "pytest-subtests", marker = "extra == 'dev'" }, - { name = "pytest-sugar", marker = "extra == 'dev'" }, - { name = "pytest-timer", extras = ["colorama"], marker = "extra == 'dev'" }, - { name = "pytest-xdist", marker = "extra == 'dev'" }, - { name = "refurb", marker = "extra == 'dev'", specifier = ">=2" }, - { name = "rich", marker = "extra == 'rich'" }, - { name = "tenacity" }, - { name = "tiktoken" }, - { name = "torch", marker = "extra == 'nn'", specifier = ">=2.2" }, - { name = "tqdm" }, - { name = "tqdm", marker = "extra == 'rich'", specifier = ">=4.56" }, - { name = "types-aiofiles", marker = "extra == 'typing'" }, - { name = "types-tqdm", marker = "extra == 'typing'" }, - { name = "typing-extensions", marker = "python_full_version < '3.12'" }, - { name = "usearch", specifier = ">=2.13" }, - { name = "wandb", marker = "extra == 'monitor'" }, -] - -[package.metadata.requires-dev] -codeflash = [ - { name = "codeflash", specifier = ">=0.7" }, - { name = "llm-client", extras = ["dev"] }, -] -dev = [{ name = "llm-client", extras = ["dev"] }] - [[package]] name = "lxml" version = "5.3.0" @@ -2180,6 +2264,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/11/80/34b429c6534be99ef3d6d20bd794b26fda0682d38e2d57f85df258beaac2/pylint_pydantic-0.3.2-py3-none-any.whl", hash = "sha256:e5cec02370aa68ac8eff138e5d573b0ac049bab864e9a6c3a9057cf043440aa1", size = 15951 }, ] +[[package]] +name = "pympler" +version = "1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "platform_system == 'Windows'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dd/37/c384631908029676d8e7213dd956bb686af303a80db7afbc9be36bc49495/pympler-1.1.tar.gz", hash = "sha256:1eaa867cb8992c218430f1708fdaccda53df064144d1c5656b1e6f1ee6000424", size = 179954 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/79/4f/a6a2e2b202d7fd97eadfe90979845b8706676b41cbd3b42ba75adf329d1f/Pympler-1.1-py3-none-any.whl", hash = "sha256:5b223d6027d0619584116a0cbc28e8d2e378f7a79c1e5e024f9ff3b673c58506", size = 165766 }, +] + +[[package]] +name = "pymupdf" +version = "1.24.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/6b/6bd735144a190d26dcc23f98b4aae0e09b259cc4c87bba266a39b7b91f56/PyMuPDF-1.24.14.tar.gz", hash = 
"sha256:0eed9f998525eaf39706dbf2d0cf3162150f0f526e4a36b1748ffa50bde581ae", size = 56242747 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/ce/972b080c526af80577ffaa49676c05361ba152de94de3af339a2f3ac07c2/PyMuPDF-1.24.14-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b3ad7a4f4b607ff97f2e1b8111823dd3797dbb381ec851c3ae4695fea6f68478", size = 19167365 }, + { url = "https://files.pythonhosted.org/packages/2c/11/8d6f4c8fca86b93759e430c4b0b7b66f8067d58893d6fe0a193420d14453/PyMuPDF-1.24.14-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:755906af4b4d693552ae5469ba682075853f4dc8a70639affd1bd6c049c5d900", size = 18417324 }, + { url = "https://files.pythonhosted.org/packages/51/69/518e6c088e20a5ded1fc658d4aec1e54c0f98f2d62d91362bd4231df9ecf/PyMuPDF-1.24.14-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:37f24108e2e18150fb8d512dcccdfa1e3d9b9dd203ffaa7ffb959bb20aea40b4", size = 19303826 }, + { url = "https://files.pythonhosted.org/packages/27/bf/203d06c68660d5535db65b6c54cacd35b950945c11c1c4546d674f270892/PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0de4f5ed903c2be6d0abcccdc796368939b51ce03916eb53292916e3b6ea65d6", size = 19833056 }, + { url = "https://files.pythonhosted.org/packages/77/ed/40eb23cf5e91de0510dfedb7d9feedeab5ce9691544ad09599e124a0a333/PyMuPDF-1.24.14-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:2d1b5c47df2f8055de5dedfbd3189c742188261a8c257f406378382adac94cff", size = 20963535 }, + { url = "https://files.pythonhosted.org/packages/87/2b/46af7461bd299c3f52bc5455332cc82608cea1667cd692652505fdf9308e/PyMuPDF-1.24.14-cp39-abi3-win32.whl", hash = "sha256:60a7ee7db3e0d3a4dcbe6df2781ba4487acb7e515c64ea9c857504f44effcb25", size = 14965671 }, + { url = "https://files.pythonhosted.org/packages/25/b2/82d70d9f5aea5a33e770f37e6db43ed08b5dc71b3526c5d7051689d1031e/PyMuPDF-1.24.14-cp39-abi3-win_amd64.whl", hash = "sha256:3d1f1ec2fe0249484afde7a0fc02589f19aaeb47c42939d23ae1d012aa1bc59b", size = 16257645 }, +] + [[package]] name = "pyparsing" version = "3.2.0" @@ -2353,6 +2464,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/11/c3/005fcca25ce078d2cc29fd559379817424e94885510568bc1bc53d7d5846/pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725", size = 508002 }, ] +[[package]] +name = "pywin32" +version = "308" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/e2/02652007469263fe1466e98439831d65d4ca80ea1a2df29abecedf7e47b7/pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a", size = 5928156 }, + { url = "https://files.pythonhosted.org/packages/48/ef/f4fb45e2196bc7ffe09cad0542d9aff66b0e33f6c0954b43e49c33cad7bd/pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b", size = 6559559 }, + { url = "https://files.pythonhosted.org/packages/79/ef/68bb6aa865c5c9b11a35771329e95917b5559845bd75b65549407f9fc6b4/pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6", size = 7972495 }, + { url = "https://files.pythonhosted.org/packages/00/7c/d00d6bdd96de4344e06c4afbf218bc86b54436a94c01c71a8701f613aa56/pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897", size = 5939729 }, + { url = 
"https://files.pythonhosted.org/packages/21/27/0c8811fbc3ca188f93b5354e7c286eb91f80a53afa4e11007ef661afa746/pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47", size = 6543015 }, + { url = "https://files.pythonhosted.org/packages/9d/0f/d40f8373608caed2255781a3ad9a51d03a594a1248cd632d6a298daca693/pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091", size = 7976033 }, + { url = "https://files.pythonhosted.org/packages/a9/a4/aa562d8935e3df5e49c161b427a3a2efad2ed4e9cf81c3de636f1fdddfd0/pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed", size = 5938579 }, + { url = "https://files.pythonhosted.org/packages/c7/50/b0efb8bb66210da67a53ab95fd7a98826a97ee21f1d22949863e6d588b22/pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4", size = 6542056 }, + { url = "https://files.pythonhosted.org/packages/26/df/2b63e3e4f2df0224f8aaf6d131f54fe4e8c96400eb9df563e2aae2e1a1f9/pywin32-308-cp313-cp313-win_arm64.whl", hash = "sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd", size = 7974986 }, +] + [[package]] name = "pyyaml" version = "6.0.2" @@ -2574,6 +2701,77 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/d6/17caf2e4af1dec288477a0cbbe4a96fbc9b8a28457dce3f1f452630ce216/runs-1.2.2-py3-none-any.whl", hash = "sha256:0980dcbc25aba1505f307ac4f0e9e92cbd0be2a15a1e983ee86c24c87b839dfd", size = 7033 }, ] +[[package]] +name = "safetensors" +version = "0.4.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cb/46/a1c56ed856c6ac3b1a8b37abe5be0cac53219367af1331e721b04d122577/safetensors-0.4.5.tar.gz", hash = "sha256:d73de19682deabb02524b3d5d1f8b3aaba94c72f1bbfc7911b9b9d5d391c0310", size = 65702 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/a5/25bcf75e373412daf1fd88045ab3aa8140a0d804ef0e70712c4f2c5b94d8/safetensors-0.4.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:21f848d7aebd5954f92538552d6d75f7c1b4500f51664078b5b49720d180e47c", size = 392256 }, + { url = "https://files.pythonhosted.org/packages/08/8c/ece3bf8756506a890bd980eca02f47f9d98dfbf5ce16eda1368f53560f67/safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bb07000b19d41e35eecef9a454f31a8b4718a185293f0d0b1c4b61d6e4487971", size = 381490 }, + { url = "https://files.pythonhosted.org/packages/39/83/c4a7ce01d626e46ea2b45887f2e59b16441408031e2ce2f9fe01860c6946/safetensors-0.4.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09dedf7c2fda934ee68143202acff6e9e8eb0ddeeb4cfc24182bef999efa9f42", size = 441093 }, + { url = "https://files.pythonhosted.org/packages/47/26/cc52de647e71bd9a0b0d78ead0d31d9c462b35550a817aa9e0cab51d6db4/safetensors-0.4.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:59b77e4b7a708988d84f26de3ebead61ef1659c73dcbc9946c18f3b1786d2688", size = 438960 }, + { url = "https://files.pythonhosted.org/packages/06/78/332538546775ee97e749867df2d58f2282d9c48a1681e4891eed8b94ec94/safetensors-0.4.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d3bc83e14d67adc2e9387e511097f254bd1b43c3020440e708858c684cbac68", size = 478031 }, + { url = 
"https://files.pythonhosted.org/packages/d9/03/a3c8663f1ddda54e624ecf43fce651659b49e8e1603c52c3e464b442acfa/safetensors-0.4.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39371fc551c1072976073ab258c3119395294cf49cdc1f8476794627de3130df", size = 494754 }, + { url = "https://files.pythonhosted.org/packages/e6/ee/69e498a892f208bd1da4104d4b9be887f8611bf4942144718b6738482250/safetensors-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6c19feda32b931cae0acd42748a670bdf56bee6476a046af20181ad3fee4090", size = 435013 }, + { url = "https://files.pythonhosted.org/packages/a2/61/f0cfce984515b86d1260f556ba3b782158e2855e6a318446ac2613786fa9/safetensors-0.4.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a659467495de201e2f282063808a41170448c78bada1e62707b07a27b05e6943", size = 455984 }, + { url = "https://files.pythonhosted.org/packages/e7/a9/3e3b48fcaade3eb4e347d39ebf0bd44291db21a3e4507854b42a7cb910ac/safetensors-0.4.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bad5e4b2476949bcd638a89f71b6916fa9a5cae5c1ae7eede337aca2100435c0", size = 619513 }, + { url = "https://files.pythonhosted.org/packages/80/23/2a7a1be24258c0e44c1d356896fd63dc0545a98d2d0184925fa09cd3ec76/safetensors-0.4.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a3a315a6d0054bc6889a17f5668a73f94f7fe55121ff59e0a199e3519c08565f", size = 604841 }, + { url = "https://files.pythonhosted.org/packages/b4/5c/34d082ff1fffffd8545fb22cbae3285ab4236f1f0cfc64b7e58261c2363b/safetensors-0.4.5-cp311-none-win32.whl", hash = "sha256:a01e232e6d3d5cf8b1667bc3b657a77bdab73f0743c26c1d3c5dd7ce86bd3a92", size = 272602 }, + { url = "https://files.pythonhosted.org/packages/6d/41/948c96c8a7e9fef57c2e051f1871c108a6dbbc6d285598bdb1d89b98617c/safetensors-0.4.5-cp311-none-win_amd64.whl", hash = "sha256:cbd39cae1ad3e3ef6f63a6f07296b080c951f24cec60188378e43d3713000c04", size = 285973 }, + { url = "https://files.pythonhosted.org/packages/bf/ac/5a63082f931e99200db95fd46fb6734f050bb6e96bf02521904c6518b7aa/safetensors-0.4.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:473300314e026bd1043cef391bb16a8689453363381561b8a3e443870937cc1e", size = 392015 }, + { url = "https://files.pythonhosted.org/packages/73/95/ab32aa6e9bdc832ff87784cdf9da26192b93de3ef82b8d1ada8f345c5044/safetensors-0.4.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:801183a0f76dc647f51a2d9141ad341f9665602a7899a693207a82fb102cc53e", size = 381774 }, + { url = "https://files.pythonhosted.org/packages/d6/6c/7e04b7626809fc63f3698f4c50e43aff2864b40089aa4506c918a75b8eed/safetensors-0.4.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1524b54246e422ad6fb6aea1ac71edeeb77666efa67230e1faf6999df9b2e27f", size = 441134 }, + { url = "https://files.pythonhosted.org/packages/58/2b/ffe7c86a277e6c1595fbdf415cfe2903f253f574a5405e93fda8baaa582c/safetensors-0.4.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b3139098e3e8b2ad7afbca96d30ad29157b50c90861084e69fcb80dec7430461", size = 438467 }, + { url = "https://files.pythonhosted.org/packages/67/9c/f271bd804e08c7fda954d17b70ff281228a88077337a9e70feace4f4cc93/safetensors-0.4.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65573dc35be9059770808e276b017256fa30058802c29e1038eb1c00028502ea", size = 476566 }, + { url = 
"https://files.pythonhosted.org/packages/4c/ad/4cf76a3e430a8a26108407fa6cb93e6f80d996a5cb75d9540c8fe3862990/safetensors-0.4.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd33da8e9407559f8779c82a0448e2133737f922d71f884da27184549416bfed", size = 492253 }, + { url = "https://files.pythonhosted.org/packages/d9/40/a6f75ea449a9647423ec8b6f72c16998d35aa4b43cb38536ac060c5c7bf5/safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3685ce7ed036f916316b567152482b7e959dc754fcc4a8342333d222e05f407c", size = 434769 }, + { url = "https://files.pythonhosted.org/packages/52/47/d4b49b1231abf3131f7bb0bc60ebb94b27ee33e0a1f9569da05f8ac65dee/safetensors-0.4.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dde2bf390d25f67908278d6f5d59e46211ef98e44108727084d4637ee70ab4f1", size = 457166 }, + { url = "https://files.pythonhosted.org/packages/c3/cd/006468b03b0fa42ff82d795d47c4193e99001e96c3f08bd62ef1b5cab586/safetensors-0.4.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7469d70d3de970b1698d47c11ebbf296a308702cbaae7fcb993944751cf985f4", size = 619280 }, + { url = "https://files.pythonhosted.org/packages/22/4d/b6208d918e83daa84b424c0ac3191ae61b44b3191613a3a5a7b38f94b8ad/safetensors-0.4.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3a6ba28118636a130ccbb968bc33d4684c48678695dba2590169d5ab03a45646", size = 605390 }, + { url = "https://files.pythonhosted.org/packages/e8/20/bf0e01825dc01ed75538021a98b9a046e60ead63c6c6700764c821a8c873/safetensors-0.4.5-cp312-none-win32.whl", hash = "sha256:c859c7ed90b0047f58ee27751c8e56951452ed36a67afee1b0a87847d065eec6", size = 273250 }, + { url = "https://files.pythonhosted.org/packages/f1/5f/ab6b6cec85b40789801f35b7d2fb579ae242d8193929974a106d5ff5c835/safetensors-0.4.5-cp312-none-win_amd64.whl", hash = "sha256:b5a8810ad6a6f933fff6c276eae92c1da217b39b4d8b1bc1c0b8af2d270dc532", size = 286307 }, + { url = "https://files.pythonhosted.org/packages/90/61/0e27b1403e311cba0be20026bee4ee822d90eda7dad372179e7f18bb99f3/safetensors-0.4.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:25e5f8e2e92a74f05b4ca55686234c32aac19927903792b30ee6d7bd5653d54e", size = 392062 }, + { url = "https://files.pythonhosted.org/packages/b1/9f/cc31fafc9f5d79da10a83a820ca37f069bab0717895ad8cbcacf629dd1c5/safetensors-0.4.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:81efb124b58af39fcd684254c645e35692fea81c51627259cdf6d67ff4458916", size = 382517 }, + { url = "https://files.pythonhosted.org/packages/a4/c7/4fda8a0ebb96662550433378f4a74c677fa5fc4d0a43a7ec287d1df254a9/safetensors-0.4.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:585f1703a518b437f5103aa9cf70e9bd437cb78eea9c51024329e4fb8a3e3679", size = 441378 }, + { url = "https://files.pythonhosted.org/packages/14/31/9abb431f6209de9c80dab83e1112ebd769f1e32e7ab7ab228a02424a4693/safetensors-0.4.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4b99fbf72e3faf0b2f5f16e5e3458b93b7d0a83984fe8d5364c60aa169f2da89", size = 438831 }, + { url = "https://files.pythonhosted.org/packages/37/37/99bfb195578a808b8d045159ee9264f8da58d017ac0701853dcacda14d4e/safetensors-0.4.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b17b299ca9966ca983ecda1c0791a3f07f9ca6ab5ded8ef3d283fff45f6bcd5f", size = 477112 }, + { url = 
"https://files.pythonhosted.org/packages/7d/05/fac3ef107e60d2a78532bed171a91669d4bb259e1236f5ea8c67a6976c75/safetensors-0.4.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76ded72f69209c9780fdb23ea89e56d35c54ae6abcdec67ccb22af8e696e449a", size = 493373 }, + { url = "https://files.pythonhosted.org/packages/cf/7a/825800ee8c68214b4fd3506d5e19209338c69b41e01c6e14dd13969cc8b9/safetensors-0.4.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2783956926303dcfeb1de91a4d1204cd4089ab441e622e7caee0642281109db3", size = 435422 }, + { url = "https://files.pythonhosted.org/packages/5e/6c/7a3233c08bde558d6c33a41219119866cb596139a4673cc6c24024710ffd/safetensors-0.4.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d94581aab8c6b204def4d7320f07534d6ee34cd4855688004a4354e63b639a35", size = 457382 }, + { url = "https://files.pythonhosted.org/packages/a0/58/0b7bcba3788ff503990cf9278d611b56c029400612ba93e772c987b5aa03/safetensors-0.4.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:67e1e7cb8678bb1b37ac48ec0df04faf689e2f4e9e81e566b5c63d9f23748523", size = 619301 }, + { url = "https://files.pythonhosted.org/packages/82/cc/9c2cf58611daf1c83ce5d37f9de66353e23fcda36008b13fd3409a760aa3/safetensors-0.4.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:dbd280b07e6054ea68b0cb4b16ad9703e7d63cd6890f577cb98acc5354780142", size = 605580 }, +] + +[[package]] +name = "scikit-learn" +version = "1.5.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/37/59/44985a2bdc95c74e34fef3d10cb5d93ce13b0e2a7baefffe1b53853b502d/scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d", size = 7001680 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/91/609961972f694cb9520c4c3d201e377a26583e1eb83bc5a334c893729214/scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445", size = 12088580 }, + { url = "https://files.pythonhosted.org/packages/cd/7a/19fe32c810c5ceddafcfda16276d98df299c8649e24e84d4f00df4a91e01/scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de", size = 10975994 }, + { url = "https://files.pythonhosted.org/packages/4c/75/62e49f8a62bf3c60b0e64d0fce540578ee4f0e752765beb2e1dc7c6d6098/scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675", size = 12465782 }, + { url = "https://files.pythonhosted.org/packages/49/21/3723de321531c9745e40f1badafd821e029d346155b6c79704e0b7197552/scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1", size = 13322034 }, + { url = "https://files.pythonhosted.org/packages/17/1c/ccdd103cfcc9435a18819856fbbe0c20b8fa60bfc3343580de4be13f0668/scikit_learn-1.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6", size = 11015224 }, + { url = "https://files.pythonhosted.org/packages/a4/db/b485c1ac54ff3bd9e7e6b39d3cc6609c4c76a65f52ab0a7b22b6c3ab0e9d/scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a", size = 12110344 }, + { url = "https://files.pythonhosted.org/packages/54/1a/7deb52fa23aebb855431ad659b3c6a2e1709ece582cb3a63d66905e735fe/scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1", size = 11033502 }, + { url = "https://files.pythonhosted.org/packages/a1/32/4a7a205b14c11225609b75b28402c196e4396ac754dab6a81971b811781c/scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd", size = 12085794 }, + { url = "https://files.pythonhosted.org/packages/c6/29/044048c5e911373827c0e1d3051321b9183b2a4f8d4e2f11c08fcff83f13/scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6", size = 12945797 }, + { url = "https://files.pythonhosted.org/packages/aa/ce/c0b912f2f31aeb1b756a6ba56bcd84dd1f8a148470526a48515a3f4d48cd/scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1", size = 10985467 }, + { url = "https://files.pythonhosted.org/packages/a4/50/8891028437858cc510e13578fe7046574a60c2aaaa92b02d64aac5b1b412/scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5", size = 12025584 }, + { url = "https://files.pythonhosted.org/packages/d2/79/17feef8a1c14149436083bec0e61d7befb4812e272d5b20f9d79ea3e9ab1/scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908", size = 10959795 }, + { url = "https://files.pythonhosted.org/packages/b1/c8/f08313f9e2e656bd0905930ae8bf99a573ea21c34666a813b749c338202f/scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3", size = 12077302 }, + { url = "https://files.pythonhosted.org/packages/a7/48/fbfb4dc72bed0fe31fe045fb30e924909ad03f717c36694351612973b1a9/scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12", size = 13002811 }, + { url = "https://files.pythonhosted.org/packages/a5/e7/0c869f9e60d225a77af90d2aefa7a4a4c0e745b149325d1450f0f0ce5399/scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f", size = 10951354 }, +] + [[package]] name = "scipy" version = "1.14.1" @@ -2609,6 +2807,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f5/1b/6ee032251bf4cdb0cc50059374e86a9f076308c1512b61c4e003e241efb7/scipy-1.14.1-cp313-cp313-win_amd64.whl", hash = "sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84", size = 44469524 }, ] +[[package]] +name = "sentence-transformers" +version = "3.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "pillow" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "transformers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/79/0a/c677efe908b20e7e8d4ed6cce3a3447eebc7dc5e348e458f5f9a44a72b00/sentence_transformers-3.3.1.tar.gz", hash = "sha256:9635dbfb11c6b01d036b9cfcee29f7716ab64cf2407ad9f403a2e607da2ac48b", 
size = 217914 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/c8/990e22a465e4771338da434d799578865d6d7ef1fdb50bd844b7ecdcfa19/sentence_transformers-3.3.1-py3-none-any.whl", hash = "sha256:abffcc79dab37b7d18d21a26d5914223dd42239cfe18cb5e111c66c54b658ae7", size = 268797 }, +] + [[package]] name = "sentry-sdk" version = "2.18.0" @@ -2788,6 +3004,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7f/be/df630c387a0a054815d60be6a97eb4e8f17385d5d6fe660e1c02750062b4/termcolor-2.5.0-py3-none-any.whl", hash = "sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8", size = 7755 }, ] +[[package]] +name = "threadpoolctl" +version = "3.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/55/b5148dcbf72f5cde221f8bfe3b6a540da7aa1842f6b491ad979a6c8b84af/threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107", size = 41936 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/2c/ffbf7a134b9ab11a67b0cf0726453cedd9c5043a4fe7a35d1cefa9a1bcfb/threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467", size = 18414 }, +] + [[package]] name = "tiktoken" version = "0.8.0" @@ -2939,6 +3164,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359 }, ] +[[package]] +name = "transformers" +version = "4.46.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/37/5a/58f96c83e566f907ae39f16d4401bbefd8bb85c60bd1e6a95c419752ab90/transformers-4.46.3.tar.gz", hash = "sha256:8ee4b3ae943fe33e82afff8e837f4b052058b07ca9be3cb5b729ed31295f72cc", size = 8627944 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl", hash = "sha256:a12ef6f52841fd190a3e5602145b542d03507222f2c64ebb7ee92e8788093aef", size = 10034536 }, +] + [[package]] name = "triton" version = "3.1.0"