Skip to content

Commit

Permalink
Merge pull request #48 from NexaAI/david/bugfix
Browse files — browse the repository at this point in the history
David/bugfix
  • Loading branch information
zhiyuan8 authored Aug 25, 2024
2 parents da5fc70 + 93c763e commit c00ce6c
Show file tree
Hide file tree
Showing 17 changed files with 906 additions and 1,134 deletions.
1 change: 0 additions & 1 deletion nexa/cli/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ def main():
image_group = run_parser.add_argument_group('Image generation options')
image_group.add_argument("-i2i", "--img2img", action="store_true", help="Whether to run image-to-image generation")
image_group.add_argument("-ns", "--num_inference_steps", type=int, help="Number of inference steps")
image_group.add_argument("-np", "--num_images_per_prompt", type=int, default=1, help="Number of images to generate per prompt")
image_group.add_argument("-H", "--height", type=int, help="Height of the output image")
image_group.add_argument("-W", "--width", type=int, help="Width of the output image")
image_group.add_argument("-g", "--guidance_scale", type=float, help="Guidance scale for diffusion")
Expand Down
3 changes: 0 additions & 3 deletions nexa/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,6 @@

DEFAULT_IMG_GEN_PARAMS = {
"num_inference_steps": 20,
"num_images_per_prompt": 1,
"height": 512,
"width": 512,
"guidance_scale": 7.5,
Expand All @@ -191,7 +190,6 @@

DEFAULT_IMG_GEN_PARAMS_LCM = {
"num_inference_steps": 4,
"num_images_per_prompt": 1,
"height": 512,
"width": 512,
"guidance_scale": 1.0,
Expand All @@ -201,7 +199,6 @@

DEFAULT_IMG_GEN_PARAMS_TURBO = {
"num_inference_steps": 5,
"num_images_per_prompt": 1,
"height": 512,
"width": 512,
"guidance_scale": 5.0,
Expand Down
27 changes: 18 additions & 9 deletions nexa/gguf/lib_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def is_gpu_available():
def load_library(lib_base_name: str):
# Construct the paths to the possible shared library names
_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
logging.debug(f"Base path for libraries: {_base_path}")
# Searching for the library in the current directory under the name "libllama" (default name
# for llamacpp) and "llama" (default name for this repo)
_lib_paths: List[pathlib.Path] = []
Expand All @@ -29,18 +28,16 @@ def load_library(lib_base_name: str):
]
elif sys.platform == "darwin":
_lib_paths += [
_base_path / f"lib{lib_base_name}.dylib",
_base_path / f"lib{lib_base_name}.so",
_base_path / f"lib{lib_base_name}.dylib",
]
elif sys.platform == "win32":
_lib_paths += [
_base_path / f"{lib_base_name}.dll",
_base_path / f"lib{lib_base_name}.dll",
]
_add_windows_dll_directories(_base_path)
else:
raise RuntimeError("Unsupported platform")
logging.debug(f"Possible shared library paths: {_lib_paths}")

if "LLAMA_CPP_LIB" in os.environ:
lib_base_name = os.environ["LLAMA_CPP_LIB"]
Expand All @@ -50,19 +47,31 @@ def load_library(lib_base_name: str):

cdll_args = dict() # type: ignore

# Add the library directory to the DLL search path on Windows (if needed)
if sys.platform == "win32":
os.add_dll_directory(str(_base_path))
os.environ["PATH"] = str(_base_path) + os.pathsep + os.environ["PATH"]

if sys.platform == "win32" and sys.version_info >= (3, 8):
os.add_dll_directory(str(_base_path))
if "CUDA_PATH" in os.environ:
os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
if "HIP_PATH" in os.environ:
os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib"))
cdll_args["winmode"] = ctypes.RTLD_GLOBAL

# Try to load the shared library, handling potential errors
for _lib_path in _lib_paths:
logging.debug(f"Trying to load shared library from: {_lib_path}")
if _lib_path.exists():
try:
loaded_lib = ctypes.CDLL(str(_lib_path), **cdll_args) # type: ignore
logging.debug(f"Successfully loaded shared library: {_lib_path}")
return loaded_lib
return ctypes.CDLL(str(_lib_path), **cdll_args) # type: ignore
except Exception as e:
raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")

raise FileNotFoundError(
f"Shared library with base name '{lib_base_name}' not found in paths: {_lib_paths}"
f"Shared library with base name '{lib_base_name}' not found"
)


Expand Down
18 changes: 5 additions & 13 deletions nexa/gguf/llama/_internals_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,11 @@ def token_eot(self) -> int:
assert self.model is not None
return llama_cpp.llama_token_eot(self.model)

def add_bos_token(self) -> int:
def add_bos_token(self) -> bool:
assert self.model is not None
return llama_cpp.llama_add_bos_token(self.model)

def add_eos_token(self) -> int:
def add_eos_token(self) -> bool:
assert self.model is not None
return llama_cpp.llama_add_eos_token(self.model)

Expand Down Expand Up @@ -343,14 +343,6 @@ def get_state_size(self) -> int:
assert self.ctx is not None
return llama_cpp.llama_get_state_size(self.ctx)

# TODO: copy_state_data

# TODO: set_state_data

# TODO: llama_load_session_file

# TODO: llama_save_session_file

def decode(self, batch: "_LlamaBatch"):
assert self.ctx is not None
assert batch.batch is not None
Expand Down Expand Up @@ -511,7 +503,7 @@ def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
assert self.ctx is not None
assert grammar.grammar is not None
llama_cpp.llama_grammar_accept_token(self.ctx, grammar.grammar, token)
llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token)

def reset_timings(self):
assert self.ctx is not None
Expand Down Expand Up @@ -691,8 +683,8 @@ def _detokenize_bpe(model: _LlamaModel, tokens: List[int]) -> str:
def _should_add_bos(model: _LlamaModel) -> bool:
assert model.model is not None
add_bos = llama_cpp.llama_add_bos_token(model.model)
if add_bos != -1:
return add_bos != 0
if add_bos:
return add_bos
else:
return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM

Expand Down
2 changes: 1 addition & 1 deletion nexa/gguf/llama/_utils_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class suppress_stdout_stderr(object):
sys = sys
os = os

def __init__(self, disable: bool = False):
def __init__(self, disable: bool = True):
self.disable = disable

# Oddly enough this works better than the contextlib version
Expand Down
104 changes: 57 additions & 47 deletions nexa/gguf/llama/llama.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,33 @@
from __future__ import annotations

import contextlib
import ctypes
import multiprocessing
import os
import sys
import uuid
import time
import json
import ctypes
import typing
import uuid
import fnmatch
import warnings
from collections import deque
import contextlib
import multiprocessing

from typing import (
Any,
Callable,
Deque,
Dict,
Generator,
Iterator,
List,
Literal,
Optional,
Sequence,
Union,
Generator,
Sequence,
Iterator,
Deque,
Callable,
Dict,
)
from collections import deque
from pathlib import Path


import numpy as np
import numpy.typing as npt
Expand All @@ -37,10 +43,9 @@
from nexa.gguf.llama._internals_transformers import _normalize_embedding # type: ignore
from nexa.gguf.llama._logger_transformers import set_verbose
from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr

# from nexa.gguf.llama.llama_cache import LlamaCache # type: ignore
# from nexa.gguf.llama.llama_cache import LlamaDiskCache # type: ignore
# from nexa.gguf.llama.llama_cache import LlamaRAMCache # type: ignore
from nexa.gguf.llama.llama_cache import LlamaCache # type: ignore
from nexa.gguf.llama.llama_cache import LlamaDiskCache # type: ignore
from nexa.gguf.llama.llama_cache import LlamaRAMCache # type: ignore
from nexa.gguf.llama.llama_cache import BaseLlamaCache
from nexa.gguf.llama.llama_grammar import LlamaGrammar
from nexa.gguf.llama.llama_speculative import LlamaDraftModel
Expand Down Expand Up @@ -187,6 +192,7 @@ def __init__(
A Llama instance.
"""
self.verbose = verbose
self._stack = contextlib.ExitStack()

set_verbose(verbose)

Expand Down Expand Up @@ -251,28 +257,28 @@ def __init__(
for i, (k, v) in enumerate(kv_overrides.items()):
self._kv_overrides_array[i].key = k.encode("utf-8")
if isinstance(v, bool):
self._kv_overrides_array[
i
].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
self._kv_overrides_array[i].tag = (
llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
)
self._kv_overrides_array[i].value.val_bool = v
elif isinstance(v, int):
self._kv_overrides_array[
i
].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
self._kv_overrides_array[i].tag = (
llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
)
self._kv_overrides_array[i].value.val_i64 = v
elif isinstance(v, float):
self._kv_overrides_array[
i
].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
self._kv_overrides_array[i].tag = (
llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
)
self._kv_overrides_array[i].value.val_f64 = v
elif isinstance(v, str): # type: ignore
v_bytes = v.encode("utf-8")
if len(v_bytes) > 128: # TODO: Make this a constant
raise ValueError(f"Value for {k} is too long: {v}")
v_bytes = v_bytes.ljust(128, b"\0")
self._kv_overrides_array[
i
].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
self._kv_overrides_array[i].tag = (
llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
)
# copy min(v_bytes, 128) to str_value
address = typing.cast(
int,
Expand All @@ -288,9 +294,9 @@ def __init__(
else:
raise ValueError(f"Unknown value type for {k}: {v}")

self._kv_overrides_array[
-1
].key = b"\0" # ensure sentinel element is zeroed
self._kv_overrides_array[-1].key = (
b"\0" # ensure sentinel element is zeroed
)
self.model_params.kv_overrides = self._kv_overrides_array

self.n_batch = min(n_ctx, n_batch) # ???
Expand Down Expand Up @@ -354,8 +360,6 @@ def __init__(
if not os.path.exists(model_path):
raise ValueError(f"Model path does not exist: {model_path}")

self._stack = contextlib.ExitStack()

self._model = self._stack.enter_context(
contextlib.closing(
_LlamaModel(
Expand Down Expand Up @@ -409,6 +413,15 @@ def __init__(
raise RuntimeError(
f"Failed to initialize LoRA adapter from lora path: {self.lora_path}"
)

def free_lora_adapter():
if self._lora_adapter is None:
return
llama_cpp.llama_lora_adapter_free(self._lora_adapter)
self._lora_adapter = None

self._stack.callback(free_lora_adapter)

assert self._ctx.ctx is not None
if llama_cpp.llama_lora_adapter_set(
self._ctx.ctx, self._lora_adapter, self.lora_scale
Expand All @@ -422,9 +435,9 @@ def __init__(

self.chat_format = chat_format
self.chat_handler = chat_handler
self._chat_handlers: Dict[
str, llama_chat_format.LlamaChatCompletionHandler
] = {}
self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = (
{}
)

self.draft_model = draft_model

Expand Down Expand Up @@ -766,11 +779,12 @@ def generate(
else:
break
if longest_prefix > 0:
if self.verbose:
print("Llama.generate: prefix-match hit", file=sys.stderr)
reset = False
tokens = tokens[longest_prefix:]
self.n_tokens = longest_prefix
if self.verbose:
print(f"Llama.generate: {longest_prefix} prefix-match hit, "
f"remaining {len(tokens)} prompt tokens to eval", file=sys.stderr)

# Reset the model state
if reset:
Expand Down Expand Up @@ -1046,13 +1060,13 @@ def _create_completion(

if (
(isinstance(prompt, list) and suffix is None)
or self._model.add_bos_token() == 0
or not self._model.add_bos_token()
or bos_tokens[:1] == [-1]
):
bos_tokens = []

if (isinstance(prompt, list) and suffix is None) or (
self._model.add_eos_token() != 1 and sep_token_id == -1
not self._model.add_eos_token() and sep_token_id == -1
):
eos_tokens = []

Expand Down Expand Up @@ -1511,7 +1525,8 @@ def logit_bias_processor(
if self.verbose:
print("Llama._create_completion: cache save", file=sys.stderr)
self.cache[prompt_tokens + completion_tokens] = self.save_state()
print("Llama._create_completion: cache saved", file=sys.stderr)
if self.verbose:
print("Llama._create_completion: cache saved", file=sys.stderr)
return

if self.cache:
Expand Down Expand Up @@ -1930,10 +1945,7 @@ def create_chat_completion_openai_v1(
stream = kwargs.get("stream", False) # type: ignore
assert isinstance(stream, bool)
if stream:
return (
ChatCompletionChunk(**chunk)
for chunk in self.create_chat_completion(*args, **kwargs)
) # type: ignore
return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore
else:
return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore
except ImportError:
Expand Down Expand Up @@ -2078,8 +2090,6 @@ def close(self) -> None:
self._stack.close()

def __del__(self) -> None:
if self._lora_adapter is not None:
llama_cpp.llama_lora_adapter_free(self._lora_adapter)
self.close()

@staticmethod
Expand Down Expand Up @@ -2164,4 +2174,4 @@ def __call__(
self.prompt_tokens = len(input_ids)
if len(input_ids) - self.prompt_tokens < self.min_tokens:
scores[self.token_eos] = -np.inf
return scores
return scores
2 changes: 1 addition & 1 deletion nexa/gguf/llama/llama_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,4 +152,4 @@ def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"):
while self.cache_size > self.capacity_bytes and len(self.cache) > 0:
key_to_remove = next(iter(self.cache))
del self.cache[key_to_remove]
print("LlamaDiskCache.__setitem__: trim", file=sys.stderr)
print("LlamaDiskCache.__setitem__: trim", file=sys.stderr)
2 changes: 1 addition & 1 deletion nexa/gguf/llama/llama_chat_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -3776,4 +3776,4 @@ def chatml_function_calling(
},
}

raise ValueError("Automatic streaming tool choice is not supported")
raise ValueError("Automatic streaming tool choice is not supported")
Loading

0 comments on commit c00ce6c

Please sign in to comment.