Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Overloaded MultipleCompletionLLMModel.call type #13

Merged
merged 23 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7146c91
Overloaded typing in MultipleCompletionLLMModel.call. It returns eit…
maykcaldas Dec 5, 2024
1dcda13
Improved logging for call_multiple
maykcaldas Dec 5, 2024
2847af7
removed deprecated check of n in kwargs
maykcaldas Dec 5, 2024
2eac4a6
Merge branch 'main' into over-mult
maykcaldas Dec 6, 2024
6fbf2f2
Added cassettes for TestMultipleCompletionLLMModel
maykcaldas Dec 6, 2024
5d3a3c9
Fix lint
maykcaldas Dec 6, 2024
3f650fc
Implemented tests to check kwarg priority when calling
maykcaldas Dec 9, 2024
7edd613
Exposed missing classes
maykcaldas Dec 9, 2024
bae8765
added embedding_model_factory
maykcaldas Dec 9, 2024
1e6eb78
Added documentation to call functions
maykcaldas Dec 9, 2024
cb16d19
skip lint checking for argument with default value in test_llms
maykcaldas Dec 9, 2024
7966f9a
Fixed pre-commit errors
maykcaldas Dec 9, 2024
9e91858
Reverted changes in uv.lock
maykcaldas Dec 9, 2024
29e4d91
Fixed line wrap in docstrings
maykcaldas Dec 9, 2024
f8090bb
reverting uv.lock
maykcaldas Dec 9, 2024
418fa3b
removed the dependency on numpy. It is now a conditional dependency f…
maykcaldas Dec 9, 2024
ba974e5
Merge branch 'main' into remove_numpy
maykcaldas Dec 9, 2024
c34b02c
Removed image group dependency
maykcaldas Dec 9, 2024
270948e
Merge branch 'remove_numpy' of github.com:Future-House/llm-client int…
maykcaldas Dec 9, 2024
86d455d
Fixed typos
maykcaldas Dec 9, 2024
7ef8f49
Removed overload from the multiple completion llm call
maykcaldas Dec 9, 2024
03ede77
Merge branch 'remove_numpy' into over-mult
maykcaldas Dec 9, 2024
7d196df
Merge branch 'update_init' into over-mult
maykcaldas Dec 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion llmclient/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
from .constants import (
CHARACTERS_PER_TOKEN_ASSUMPTION,
EXTRA_TOKENS_FROM_USER_ROLE,
MODEL_COST_MAP,
)
from .embeddings import (
EmbeddingModel,
EmbeddingModes,
HybridEmbeddingModel,
LiteLLMEmbeddingModel,
SentenceTransformerEmbeddingModel,
SparseEmbeddingModel,
embedding_model_factory,
)
from .exceptions import (
JSONSchemaValidationError,
Expand All @@ -13,17 +20,28 @@
LLMModel,
MultipleCompletionLLMModel,
)
from .types import LLMResult
from .types import (
Chunk,
Embeddable,
LLMResult,
)

__all__ = [
"CHARACTERS_PER_TOKEN_ASSUMPTION",
"EXTRA_TOKENS_FROM_USER_ROLE",
"MODEL_COST_MAP",
"Chunk",
"Embeddable",
"EmbeddingModel",
"EmbeddingModes",
"HybridEmbeddingModel",
"JSONSchemaValidationError",
"LLMModel",
"LLMResult",
"LiteLLMEmbeddingModel",
"LiteLLMModel",
"MultipleCompletionLLMModel",
"SentenceTransformerEmbeddingModel",
"SparseEmbeddingModel",
"embedding_model_factory",
]
20 changes: 12 additions & 8 deletions llmclient/embeddings.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import asyncio
from abc import ABC, abstractmethod
from collections import Counter
from enum import StrEnum
from itertools import chain
from typing import Any

import litellm
import numpy as np
import tiktoken
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

Expand Down Expand Up @@ -171,13 +172,9 @@ async def embed_documents(self, texts: list[str]) -> list[list[float]]:
enc_batch = self.enc.encode_ordinary_batch(texts)
# now get frequency of each token rel to length
return [
(
np.bincount([xi % self.ndim for xi in x], minlength=self.ndim).astype(
float
)
/ len(x)
).tolist()
[token_counts.get(xi, 0) / len(x) for xi in range(self.ndim)]
for x in enc_batch
if (token_counts := Counter(xi % self.ndim for xi in x))
]


Expand All @@ -199,7 +196,11 @@ async def embed_documents(self, texts: list[str]) -> list[list[float]]:
all_embeds = await asyncio.gather(
*[m.embed_documents(texts) for m in self.models]
)
return np.concatenate(all_embeds, axis=1).tolist()

return [
list(chain.from_iterable(embed_group))
for embed_group in zip(*all_embeds, strict=True)
]

def set_mode(self, mode: EmbeddingModes) -> None:
# Set mode for all component models
Expand All @@ -217,6 +218,7 @@ class SentenceTransformerEmbeddingModel(EmbeddingModel):
def __init__(self, **kwargs):
super().__init__(**kwargs)
try:
import numpy as np # noqa: F401
from sentence_transformers import SentenceTransformer
except ImportError as exc:
raise ImportError(
Expand All @@ -240,6 +242,8 @@ async def embed_documents(self, texts: list[str]) -> list[list[float]]:
Returns:
A list of embedding vectors.
"""
import numpy as np

# Extract additional configurations if needed
batch_size = self.config.get("batch_size", 32)
device = self.config.get("device", "cpu")
Expand Down
34 changes: 33 additions & 1 deletion llmclient/llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,7 @@ class MultipleCompletionLLMModel(BaseModel):
"Configuration of the model:"
"model is the name of the llm model to use,"
"temperature is the sampling temperature, and"
"n is the number of completions to generate."
"n is the number of completions to generate by default."
),
)
encoding: Any | None = None
Expand Down Expand Up @@ -667,6 +667,23 @@ async def call( # noqa: C901, PLR0915
tool_choice: Tool | str | None = TOOL_CHOICE_REQUIRED,
**chat_kwargs,
) -> list[LLMResult]:
"""
Call the LLM model with the given messages and configuration.

Args:
messages: A list of messages to send to the language model.
callbacks: A list of callback functions to execute after receiving the response.
output_type: The type of the output model.
tools: A list of tools to use during the call.
tool_choice: The tool or tool identifier to use.
**chat_kwargs: Additional keyword arguments to pass to the chat function.

Returns:
A list of LLMResult objects containing the results of the call.

Raises:
ValueError: If the number of completions (n) is invalid.
"""
start_clock = asyncio.get_running_loop().time()

# Deal with tools. Note OpenAI throws a 400 response if tools is empty:
Expand Down Expand Up @@ -829,3 +846,18 @@ async def call( # noqa: C901, PLR0915
result.seconds_to_last_token = end_clock - start_clock

return results

async def call_single(
    self,
    messages: list[Message],
    callbacks: list[Callable] | None = None,
    output_type: type[BaseModel] | None = None,
    tools: list[Tool] | None = None,
    tool_choice: Tool | str | None = TOOL_CHOICE_REQUIRED,
    **chat_kwargs,
) -> LLMResult:
    """Request exactly one completion and return it directly.

    Thin convenience wrapper around `call` that forces `n=1` (overriding any
    configured completion count) and unwraps the single-element result list.

    Args:
        messages: Messages to send to the language model.
        callbacks: Optional callbacks executed after the response is received.
        output_type: Optional model type for structured output.
        tools: Optional tools available during the call.
        tool_choice: Tool or tool identifier to use.
        **chat_kwargs: Extra keyword arguments forwarded to `call`.

    Returns:
        The single LLMResult produced by the call.
    """
    results = await self.call(
        messages,
        callbacks=callbacks,
        output_type=output_type,
        tools=tools,
        tool_choice=tool_choice,
        n=1,
        **chat_kwargs,
    )
    return results[0]
7 changes: 2 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ dependencies = [
"fhaviary>=0.8.2", # For core namespace
"limits",
"litellm>=1.44", # For LITELLM_LOG addition
"numpy",
"pydantic~=2.0,>=2.10.1,<2.10.2",
"tiktoken>=0.4.0",
"typing-extensions; python_version <= '3.11'", # for typing.override
Expand All @@ -40,7 +39,7 @@ requires-python = ">=3.11"

[project.optional-dependencies]
dev = [
"fh-llm-client[image,local]",
"fh-llm-client[local]",
"fhaviary[xml]",
"ipython>=8", # Pin to keep recent
"mypy>=1.8", # Pin for mutable-override
Expand All @@ -58,10 +57,8 @@ dev = [
"python-dotenv",
"refurb>=2", # Pin to keep recent
]
image = [
"Pillow",
]
local = [
"numpy",
"sentence-transformers",
]

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
interactions:
- request:
body:
'{"messages":[{"role":"system","content":"Respond with single words."},{"role":"user","content":"Hello,
how are you?"}],"model":"gpt-3.5-turbo","n":2}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- "149"
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.57.0
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.57.0
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA9RTy2rDMBC8+yuEzklo3jS3QCCXXNoe+qIYWdrYamStKq1LS8i/FzkPOySFXnvR
YWZnNLsrbRPGuFZ8xrgsBMnSme48W6zuxvBAL5v55/3H06Ra4OOoWhXl82LJO1GB2TtIOqp6Ektn
gDTaPS09CILo2p8Oh6PhYHo7qYkSFZgoyx11h71xlyqfYfemPxgflAVqCYHP2GvCGGPb+owZrYIv
PmM3nSNSQggiBz47FTHGPZqIcBGCDiQs8U5DSrQEto69RFRtysO6CiJGs5UxB3x3ustg7jxm4cCf
8LW2OhSpBxHQRt9A6HjSEl800P83DSSMvdVLqc5icuexdJQSbsBGw8Fgb8ebZ9AiDxwhCdOCR50r
ZqkCEtqE1ki4FLIA1SibByAqpbFFtMd+meWa975tbfO/2DeElOAIVOo8KC3P+23KPMQ/8lvZacR1
YB6+A0GZrrXNwTuv6yXXm9wlPwAAAP//AwAh8pBrpAMAAA==
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8ed70040cbcdf99b-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Thu, 05 Dec 2024 21:06:36 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "134"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "12000"
x-ratelimit-limit-tokens:
- "1000000"
x-ratelimit-remaining-requests:
- "11999"
x-ratelimit-remaining-tokens:
- "999953"
x-ratelimit-reset-requests:
- 5ms
x-ratelimit-reset-tokens:
- 2ms
x-request-id:
- req_1f88664946b9891fbc90796687f144c4
status:
code: 200
message: OK
- request:
body:
'{"messages":[{"role":"system","content":"Respond with single words."},{"role":"user","content":"Hello,
how are you?"}],"model":"gpt-3.5-turbo","n":2}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- "149"
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.57.0
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.57.0
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "0"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA9RTTUsDMRC9768IOW9LP63tzaIIIqgH7UFkSZPZbTSbCcksWEr/u2T7sVtawauX
HN6b9/JmJtkkjHGt+IxxuRIkS2c6N8vbx5fF9Tisy7l5e13clfl0/vQwfl5P5o6nUYHLT5B0UHUl
ls4AabQ7WnoQBNG1PxkOR8PBZHpVEyUqMFFWOOoMu+MOVX6JnV5/MN4rV6glBD5j7wljjG3qM2a0
Cr75jPXSA1JCCKIAPjsWMcY9mohwEYIOJCzxtCElWgJbx75HVG3KQ14FEaPZypg9vj3eZbBwHpdh
zx/xXFsdVpkHEdBG30DoeNISnzXQ/zcNJIx91EupTmJy57F0lBF+gY2Gg8HOjjfPoEXuOUISpgWP
0gtmmQIS2oTWSLgUcgWqUTYPQFRKY4toj/08yyXvXdvaFn+xbwgpwRGozHlQWp7225R5iH/kt7Lj
iOvAPKwDQZnl2hbgndf1kutNbpMfAAAA//8DALEE5HikAwAA
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8ed700428d77f99b-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Thu, 05 Dec 2024 21:06:36 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "114"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "12000"
x-ratelimit-limit-tokens:
- "1000000"
x-ratelimit-remaining-requests:
- "11999"
x-ratelimit-remaining-tokens:
- "999953"
x-ratelimit-reset-requests:
- 5ms
x-ratelimit-reset-tokens:
- 2ms
x-request-id:
- req_e32516fa5bb6ab11dda5155511280ea6
status:
code: 200
message: OK
version: 1
Loading
Loading