feat: make llama-cpp-python an optional dependency
rchretien committed Feb 20, 2025
1 parent 05df664 commit a8dcb67
Showing 8 changed files with 118 additions and 18 deletions.
7 changes: 4 additions & 3 deletions poetry.lock

Some generated files are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -35,7 +35,7 @@ wtpsplit-lite = ">=0.1.0"
# Large Language Models:
huggingface-hub = ">=0.22.0"
litellm = ">=1.48.4,<1.56.10"
-llama-cpp-python = ">=0.3.2"
+llama-cpp-python = { version = ">=0.3.2", optional = true }
pydantic = ">=2.7.0"
# Approximate Nearest Neighbors:
pynndescent = ">=0.5.12"
@@ -61,6 +61,7 @@ packaging = ">=23.0"

[tool.poetry.extras] # https://python-poetry.org/docs/pyproject/#extras
chainlit = ["chainlit"]
+llama-cpp-python = ["llama-cpp-python"]
pandoc = ["pypandoc-binary"]
ragas = ["ragas"]

@@ -130,7 +131,7 @@ target-version = "py310"

[tool.ruff.lint]
select = ["A", "ASYNC", "B", "BLE", "C4", "C90", "D", "DTZ", "E", "EM", "ERA", "F", "FBT", "FLY", "FURB", "G", "I", "ICN", "INP", "INT", "ISC", "LOG", "N", "NPY", "PERF", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "Q", "RET", "RSE", "RUF", "S", "SIM", "SLF", "SLOT", "T10", "T20", "TCH", "TID", "TRY", "UP", "W", "YTT"]
-ignore = ["D203", "D213", "E501", "RET504", "RUF002", "S101", "S307"]
+ignore = ["D203", "D213", "E501", "RET504", "RUF002", "S101", "S307", "TCH004"]
unfixable = ["ERA001", "F401", "F841", "T201", "T203"]

[tool.ruff.lint.flake8-tidy-imports]
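With llama-cpp-python moved behind a Poetry extra, installing it becomes opt-in. As an illustration (not part of the diff), the bracketed form below matches the install hint added in src/raglite/_lazy_llama.py:

pip install raglite                        # core install, no llama-cpp-python
pip install "raglite[llama-cpp-python]"    # opt in to local llama.cpp models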
7 changes: 5 additions & 2 deletions src/raglite/_chatml_function_calling.py
@@ -33,11 +33,14 @@

import jinja2
from jinja2.sandbox import ImmutableSandboxedEnvironment
-from llama_cpp import llama, llama_grammar, llama_types
-from llama_cpp.llama_chat_format import (
+
+from raglite._lazy_llama import (
_convert_completion_to_chat,
_convert_completion_to_chat_function,
_grammar_for_response_format,
+    llama,
+    llama_grammar,
+    llama_types,
)


3 changes: 2 additions & 1 deletion src/raglite/_config.py
@@ -7,10 +7,11 @@
from pathlib import Path
from typing import Literal

-from llama_cpp import llama_supports_gpu_offload
from platformdirs import user_data_dir
from sqlalchemy.engine import URL

+from raglite._lazy_llama import llama_supports_gpu_offload
+
# Suppress rerankers output on import until [1] is fixed.
# [1] https://github.com/AnswerDotAI/rerankers/issues/36
with contextlib.redirect_stdout(StringIO()):
2 changes: 1 addition & 1 deletion src/raglite/_embed.py
@@ -5,10 +5,10 @@

import numpy as np
from litellm import embedding
-from llama_cpp import LLAMA_POOLING_TYPE_NONE, Llama
from tqdm.auto import tqdm, trange

from raglite._config import RAGLiteConfig
+from raglite._lazy_llama import LLAMA_POOLING_TYPE_NONE, Llama
from raglite._litellm import LlamaCppPythonLLM
from raglite._typing import FloatMatrix, IntVector

93 changes: 93 additions & 0 deletions src/raglite/_lazy_llama.py
@@ -0,0 +1,93 @@
"""Import llama_cpp lazily to avoid import errors when it is not installed."""

from importlib import import_module
from typing import TYPE_CHECKING

# When type checking, import everything normally.
if TYPE_CHECKING:
    from llama_cpp import (  # type: ignore[attr-defined]
        LLAMA_POOLING_TYPE_NONE,
        Llama,
        LlamaRAMCache,
        llama,
        llama_grammar,
        llama_supports_gpu_offload,
        llama_types,
    )
    from llama_cpp.llama_chat_format import (
        _convert_completion_to_chat,
        _convert_completion_to_chat_function,
        _grammar_for_response_format,
    )
    from llama_cpp.llama_types import (
        ChatCompletionRequestMessage,
        ChatCompletionTool,
        ChatCompletionToolChoiceOption,
        CreateChatCompletionResponse,
        CreateChatCompletionStreamResponse,
    )

# Explicitly export these names for static analysis.
__all__ = [
    "llama",
    "llama_grammar",
    "llama_types",
    "Llama",
    "LLAMA_POOLING_TYPE_NONE",
    "llama_supports_gpu_offload",
    "LlamaRAMCache",
    "_convert_completion_to_chat",
    "_convert_completion_to_chat_function",
    "_grammar_for_response_format",
    "ChatCompletionRequestMessage",
    "ChatCompletionTool",
    "ChatCompletionToolChoiceOption",
    "CreateChatCompletionResponse",
    "CreateChatCompletionStreamResponse",
]


# Module names for the submodules of llama_cpp.
LLAMA_CPP_MODULE_NAME = "llama_cpp"
CHAT_SUBMODULE_NAME = "llama_chat_format"
TYPES_SUBMODULE_NAME = "llama_types"

# Map attributes that live in submodules to their module names.
_SUBMODULE_ATTRS = {
    # Attributes from llama_cpp.llama_chat_format
    "_convert_completion_to_chat": f"{LLAMA_CPP_MODULE_NAME}.{CHAT_SUBMODULE_NAME}",
    "_convert_completion_to_chat_function": f"{LLAMA_CPP_MODULE_NAME}.{CHAT_SUBMODULE_NAME}",
    "_grammar_for_response_format": f"{LLAMA_CPP_MODULE_NAME}.{CHAT_SUBMODULE_NAME}",
    # Attributes from llama_cpp.llama_types
    "ChatCompletionRequestMessage": f"{LLAMA_CPP_MODULE_NAME}.{TYPES_SUBMODULE_NAME}",
    "ChatCompletionTool": f"{LLAMA_CPP_MODULE_NAME}.{TYPES_SUBMODULE_NAME}",
    "ChatCompletionToolChoiceOption": f"{LLAMA_CPP_MODULE_NAME}.{TYPES_SUBMODULE_NAME}",
    "CreateChatCompletionResponse": f"{LLAMA_CPP_MODULE_NAME}.{TYPES_SUBMODULE_NAME}",
    "CreateChatCompletionStreamResponse": f"{LLAMA_CPP_MODULE_NAME}.{TYPES_SUBMODULE_NAME}",
    # Attributes from the top-level llama_cpp module.
    "llama": LLAMA_CPP_MODULE_NAME,
    "llama_grammar": LLAMA_CPP_MODULE_NAME,
    "llama_types": LLAMA_CPP_MODULE_NAME,
    "Llama": LLAMA_CPP_MODULE_NAME,
    "LLAMA_POOLING_TYPE_NONE": LLAMA_CPP_MODULE_NAME,
}


def __getattr__(name: str) -> object:
    """Import the requested attribute from the llama_cpp module lazily."""
    module_name = _SUBMODULE_ATTRS.get(name, LLAMA_CPP_MODULE_NAME)

    try:
        module = import_module(module_name)
    except ImportError as e:
        import_error_message = (
            "llama-cpp-python is required for local language model support.\n"
            "Install it with `pip install raglite[llama-cpp-python]`."
        )
        raise ImportError(import_error_message) from e

    try:
        return getattr(module, name)
    except AttributeError as e:
        attribute_error_message = f"Module '{module_name}' has no attribute '{name}'"
        raise AttributeError(attribute_error_message) from e
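A minimal behavior sketch (an illustration, not part of the commit): because the module-level __getattr__ above (PEP 562) only imports llama_cpp when a re-exported name is resolved, a missing install surfaces the install hint instead of a bare ModuleNotFoundError.

# Assumes llama-cpp-python is NOT installed.
try:
    from raglite._lazy_llama import Llama  # falls through to __getattr__("Llama")
except ImportError as err:
    print(err)
    # llama-cpp-python is required for local language model support.
    # Install it with `pip install raglite[llama-cpp-python]`.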
8 changes: 4 additions & 4 deletions src/raglite/_litellm.py
@@ -23,7 +23,10 @@
)
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import custom_llm_setup
-from llama_cpp import (  # type: ignore[attr-defined]
+
+from raglite._chatml_function_calling import chatml_function_calling_with_streaming
+from raglite._config import RAGLiteConfig
+from raglite._lazy_llama import (
ChatCompletionRequestMessage,
CreateChatCompletionResponse,
CreateChatCompletionStreamResponse,
@@ -32,9 +35,6 @@
llama_supports_gpu_offload,
)

-from raglite._chatml_function_calling import chatml_function_calling_with_streaming
-from raglite._config import RAGLiteConfig
-
# Reduce the logging level for LiteLLM, flashrank, and httpx.
litellm.suppress_debug_info = True
os.environ["LITELLM_LOG"] = "WARNING"
11 changes: 6 additions & 5 deletions tests/test_chatml_function_calling.py
@@ -5,17 +5,18 @@
from typing import cast

import pytest
-from llama_cpp import Llama, llama_supports_gpu_offload
-from llama_cpp.llama_types import (
+from typeguard import ForwardRefPolicy, check_type
+
+from raglite._chatml_function_calling import chatml_function_calling_with_streaming
+from raglite._lazy_llama import (
ChatCompletionRequestMessage,
ChatCompletionTool,
ChatCompletionToolChoiceOption,
CreateChatCompletionResponse,
CreateChatCompletionStreamResponse,
+    Llama,
+    llama_supports_gpu_offload,
)
-from typeguard import ForwardRefPolicy, check_type

-from raglite._chatml_function_calling import chatml_function_calling_with_streaming
-

def is_accelerator_available() -> bool:
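Conversely, a quick sanity check (an assumed illustration, not from this commit) when llama-cpp-python is installed: the lazy re-exports resolve to the same objects a direct llama_cpp import would give, so test and library call sites can switch imports without behavior changes.

# Assumes llama-cpp-python IS installed.
import llama_cpp

from raglite._lazy_llama import Llama, llama_supports_gpu_offload

assert Llama is llama_cpp.Llama
assert llama_supports_gpu_offload is llama_cpp.llama_supports_gpu_offload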
