feat: make llama-cpp-python an optional dependency
rchretien committed Feb 20, 2025
1 parent 05df664 commit a8dcb67
Showing 8 changed files with 118 additions and 18 deletions.
7 changes: 4 additions & 3 deletions poetry.lock

Some generated files are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -35,7 +35,7 @@ wtpsplit-lite = ">=0.1.0"
# Large Language Models:
huggingface-hub = ">=0.22.0"
litellm = ">=1.48.4,<1.56.10"
-llama-cpp-python = ">=0.3.2"
+llama-cpp-python = { version = ">=0.3.2", optional = true }
pydantic = ">=2.7.0"
# Approximate Nearest Neighbors:
pynndescent = ">=0.5.12"
@@ -61,6 +61,7 @@ packaging = ">=23.0"

[tool.poetry.extras] # https://python-poetry.org/docs/pyproject/#extras
chainlit = ["chainlit"]
+llama-cpp-python = ["llama-cpp-python"]
pandoc = ["pypandoc-binary"]
ragas = ["ragas"]

@@ -130,7 +131,7 @@ target-version = "py310"

[tool.ruff.lint]
select = ["A", "ASYNC", "B", "BLE", "C4", "C90", "D", "DTZ", "E", "EM", "ERA", "F", "FBT", "FLY", "FURB", "G", "I", "ICN", "INP", "INT", "ISC", "LOG", "N", "NPY", "PERF", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "Q", "RET", "RSE", "RUF", "S", "SIM", "SLF", "SLOT", "T10", "T20", "TCH", "TID", "TRY", "UP", "W", "YTT"]
-ignore = ["D203", "D213", "E501", "RET504", "RUF002", "S101", "S307"]
+ignore = ["D203", "D213", "E501", "RET504", "RUF002", "S101", "S307", "TCH004"]
unfixable = ["ERA001", "F401", "F841", "T201", "T203"]

[tool.ruff.lint.flake8-tidy-imports]
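With llama-cpp-python moved behind a Poetry extra, installing it becomes opt-in. As an illustration (not part of the diff), the bracketed form below matches the install hint added in src/raglite/_lazy_llama.py:

pip install raglite                        # core install, no llama-cpp-python
pip install "raglite[llama-cpp-python]"    # opt in to local llama.cpp models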
7 changes: 5 additions & 2 deletions src/raglite/_chatml_function_calling.py
@@ -33,11 +33,14 @@

import jinja2
from jinja2.sandbox import ImmutableSandboxedEnvironment
-from llama_cpp import llama, llama_grammar, llama_types
-from llama_cpp.llama_chat_format import (
+
+from raglite._lazy_llama import (
_convert_completion_to_chat,
_convert_completion_to_chat_function,
_grammar_for_response_format,
+    llama,
+    llama_grammar,
+    llama_types,
)


3 changes: 2 additions & 1 deletion src/raglite/_config.py
@@ -7,10 +7,11 @@
from pathlib import Path
from typing import Literal

-from llama_cpp import llama_supports_gpu_offload
from platformdirs import user_data_dir
from sqlalchemy.engine import URL

+from raglite._lazy_llama import llama_supports_gpu_offload
+
# Suppress rerankers output on import until [1] is fixed.
# [1] https://github.com/AnswerDotAI/rerankers/issues/36
with contextlib.redirect_stdout(StringIO()):
2 changes: 1 addition & 1 deletion src/raglite/_embed.py
@@ -5,10 +5,10 @@

import numpy as np
from litellm import embedding
-from llama_cpp import LLAMA_POOLING_TYPE_NONE, Llama
from tqdm.auto import tqdm, trange

from raglite._config import RAGLiteConfig
+from raglite._lazy_llama import LLAMA_POOLING_TYPE_NONE, Llama
from raglite._litellm import LlamaCppPythonLLM
from raglite._typing import FloatMatrix, IntVector

93 changes: 93 additions & 0 deletions src/raglite/_lazy_llama.py
@@ -0,0 +1,93 @@
"""Import llama_cpp lazily to avoid import errors when it is not installed."""

from importlib import import_module
from typing import TYPE_CHECKING

# When type checking, import everything normally.
if TYPE_CHECKING:
    from llama_cpp import (  # type: ignore[attr-defined]
        LLAMA_POOLING_TYPE_NONE,
        Llama,
        LlamaRAMCache,
        llama,
        llama_grammar,
        llama_supports_gpu_offload,
        llama_types,
    )
    from llama_cpp.llama_chat_format import (
        _convert_completion_to_chat,
        _convert_completion_to_chat_function,
        _grammar_for_response_format,
    )
    from llama_cpp.llama_types import (
        ChatCompletionRequestMessage,
        ChatCompletionTool,
        ChatCompletionToolChoiceOption,
        CreateChatCompletionResponse,
        CreateChatCompletionStreamResponse,
    )

# Explicitly export these names for static analysis.
__all__ = [
    "llama",
    "llama_grammar",
    "llama_types",
    "Llama",
    "LLAMA_POOLING_TYPE_NONE",
    "llama_supports_gpu_offload",
    "LlamaRAMCache",
    "_convert_completion_to_chat",
    "_convert_completion_to_chat_function",
    "_grammar_for_response_format",
    "ChatCompletionRequestMessage",
    "ChatCompletionTool",
    "ChatCompletionToolChoiceOption",
    "CreateChatCompletionResponse",
    "CreateChatCompletionStreamResponse",
]


# Module names for the submodules of llama_cpp.
LLAMA_CPP_MODULE_NAME = "llama_cpp"
CHAT_SUBMODULE_NAME = "llama_chat_format"
TYPES_SUBMODULE_NAME = "llama_types"

# Map attributes that live in submodules to their module names.
_SUBMODULE_ATTRS = {
    # Attributes from llama_cpp.llama_chat_format
    "_convert_completion_to_chat": f"{LLAMA_CPP_MODULE_NAME}.{CHAT_SUBMODULE_NAME}",
    "_convert_completion_to_chat_function": f"{LLAMA_CPP_MODULE_NAME}.{CHAT_SUBMODULE_NAME}",
    "_grammar_for_response_format": f"{LLAMA_CPP_MODULE_NAME}.{CHAT_SUBMODULE_NAME}",
    # Attributes from llama_cpp.llama_types
    "ChatCompletionRequestMessage": f"{LLAMA_CPP_MODULE_NAME}.{TYPES_SUBMODULE_NAME}",
    "ChatCompletionTool": f"{LLAMA_CPP_MODULE_NAME}.{TYPES_SUBMODULE_NAME}",
    "ChatCompletionToolChoiceOption": f"{LLAMA_CPP_MODULE_NAME}.{TYPES_SUBMODULE_NAME}",
    "CreateChatCompletionResponse": f"{LLAMA_CPP_MODULE_NAME}.{TYPES_SUBMODULE_NAME}",
    "CreateChatCompletionStreamResponse": f"{LLAMA_CPP_MODULE_NAME}.{TYPES_SUBMODULE_NAME}",
    # Attributes from the top-level llama_cpp module.
    "llama": LLAMA_CPP_MODULE_NAME,
    "llama_grammar": LLAMA_CPP_MODULE_NAME,
    "llama_types": LLAMA_CPP_MODULE_NAME,
    "Llama": LLAMA_CPP_MODULE_NAME,
    "LLAMA_POOLING_TYPE_NONE": LLAMA_CPP_MODULE_NAME,
}


def __getattr__(name: str) -> object:
    """Import the requested attribute from the llama_cpp module lazily."""
    module_name = _SUBMODULE_ATTRS.get(name, LLAMA_CPP_MODULE_NAME)

    try:
        module = import_module(module_name)
    except ImportError as e:
        import_error_message = (
            "llama-cpp-python is required for local language model support.\n"
            "Install it with `pip install raglite[llama-cpp-python]`."
        )
        raise ImportError(import_error_message) from e

    try:
        return getattr(module, name)
    except AttributeError as e:
        attribute_error_message = f"Module '{module_name}' has no attribute '{name}'"
        raise AttributeError(attribute_error_message) from e
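A minimal behavior sketch (an illustration, not part of the commit): because the module-level __getattr__ above (PEP 562) only imports llama_cpp when a re-exported name is resolved, a missing install surfaces the install hint instead of a bare ModuleNotFoundError.

# Assumes llama-cpp-python is NOT installed.
try:
    from raglite._lazy_llama import Llama  # falls through to __getattr__("Llama")
except ImportError as err:
    print(err)
    # llama-cpp-python is required for local language model support.
    # Install it with `pip install raglite[llama-cpp-python]`.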
8 changes: 4 additions & 4 deletions src/raglite/_litellm.py
@@ -23,7 +23,10 @@
)
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import custom_llm_setup
-from llama_cpp import (  # type: ignore[attr-defined]
+
+from raglite._chatml_function_calling import chatml_function_calling_with_streaming
+from raglite._config import RAGLiteConfig
+from raglite._lazy_llama import (
ChatCompletionRequestMessage,
CreateChatCompletionResponse,
CreateChatCompletionStreamResponse,
@@ -32,9 +35,6 @@
llama_supports_gpu_offload,
)

-from raglite._chatml_function_calling import chatml_function_calling_with_streaming
-from raglite._config import RAGLiteConfig
-
# Reduce the logging level for LiteLLM, flashrank, and httpx.
litellm.suppress_debug_info = True
os.environ["LITELLM_LOG"] = "WARNING"
11 changes: 6 additions & 5 deletions tests/test_chatml_function_calling.py
@@ -5,17 +5,18 @@
from typing import cast

import pytest
-from llama_cpp import Llama, llama_supports_gpu_offload
-from llama_cpp.llama_types import (
+from typeguard import ForwardRefPolicy, check_type
+
+from raglite._chatml_function_calling import chatml_function_calling_with_streaming
+from raglite._lazy_llama import (
ChatCompletionRequestMessage,
ChatCompletionTool,
ChatCompletionToolChoiceOption,
CreateChatCompletionResponse,
CreateChatCompletionStreamResponse,
+    Llama,
+    llama_supports_gpu_offload,
)
-from typeguard import ForwardRefPolicy, check_type

-from raglite._chatml_function_calling import chatml_function_calling_with_streaming
-

def is_accelerator_available() -> bool:
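Conversely, a quick sanity check (an assumed illustration, not from this commit) when llama-cpp-python is installed: the lazy re-exports resolve to the same objects a direct llama_cpp import would give, so test and library call sites can switch imports without behavior changes.

# Assumes llama-cpp-python IS installed.
import llama_cpp

from raglite._lazy_llama import Llama, llama_supports_gpu_offload

assert Llama is llama_cpp.Llama
assert llama_supports_gpu_offload is llama_cpp.llama_supports_gpu_offload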
