From 4b82d23b56f7691db2565cdaaf5dbfdeaea84d3e Mon Sep 17 00:00:00 2001
From: Davidqian123
Date: Sun, 25 Aug 2024 03:01:05 +0000
Subject: [PATCH] bugfix: update llama_cpp_python

---
 README.md                                   |    2 +-
 nexa/cli/entry.py                           |    1 -
 nexa/constants.py                           |    3 -
 nexa/gguf/lib_utils.py                      |   27 +-
 nexa/gguf/llama/_internals_transformers.py  |   18 +-
 nexa/gguf/llama/_utils_transformers.py      |    2 +-
 nexa/gguf/llama/llama.py                    |  104 +-
 nexa/gguf/llama/llama_cache.py              |    2 +-
 nexa/gguf/llama/llama_chat_format.py        |    2 +-
 nexa/gguf/llama/llama_cpp.py                |  154 +-
 nexa/gguf/llama/llama_grammar.py            | 1671 ++++++++-----------
 nexa/gguf/llama/llama_speculative.py        |    2 +-
 nexa/gguf/llama/llama_tokenizer.py          |   11 +-
 nexa/gguf/llama/llama_types.py              |    3 +-
 nexa/gguf/llama/llava_cpp.py                |   24 +-
 nexa/gguf/nexa_inference_image.py           |    4 +-
 nexa/gguf/nexa_inference_vlm.py             |    4 +-
 nexa/gguf/streamlit/streamlit_image_chat.py |    8 -
 18 files changed, 907 insertions(+), 1135 deletions(-)

diff --git a/README.md b/README.md
index ef909b69..cbbba23b 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ If pre-built wheels cannot meet your requirements, you can install Nexa SDK from
 pip install nexaai
 ```

-####FAQ
+#### FAQ
 If you met following issue while building
 ![](docs/.media/error.jpeg)

diff --git a/nexa/cli/entry.py b/nexa/cli/entry.py
index e1995f28..422696eb 100644
--- a/nexa/cli/entry.py
+++ b/nexa/cli/entry.py
@@ -102,7 +102,6 @@ def main():
     image_group = run_parser.add_argument_group('Image generation options')
     image_group.add_argument("-i2i", "--img2img", action="store_true", help="Whether to run image-to-image generation")
     image_group.add_argument("-ns", "--num_inference_steps", type=int, help="Number of inference steps")
-    image_group.add_argument("-np", "--num_images_per_prompt", type=int, default=1, help="Number of images to generate per prompt")
     image_group.add_argument("-H", "--height", type=int, help="Height of the output image")
     image_group.add_argument("-W", "--width", type=int, help="Width of the output image")
     image_group.add_argument("-g", "--guidance_scale", type=float, help="Guidance scale for diffusion")

diff --git a/nexa/constants.py b/nexa/constants.py
index 17812f88..753e612c 100644
--- a/nexa/constants.py
+++ b/nexa/constants.py
@@ -181,7 +181,6 @@
 DEFAULT_IMG_GEN_PARAMS = {
     "num_inference_steps": 20,
-    "num_images_per_prompt": 1,
     "height": 512,
     "width": 512,
     "guidance_scale": 7.5,
@@ -191,7 +190,6 @@
 DEFAULT_IMG_GEN_PARAMS_LCM = {
     "num_inference_steps": 4,
-    "num_images_per_prompt": 1,
     "height": 512,
     "width": 512,
     "guidance_scale": 1.0,
@@ -201,7 +199,6 @@
 DEFAULT_IMG_GEN_PARAMS_TURBO = {
     "num_inference_steps": 5,
-    "num_images_per_prompt": 1,
     "height": 512,
     "width": 512,
     "guidance_scale": 5.0,

diff --git a/nexa/gguf/lib_utils.py b/nexa/gguf/lib_utils.py
index 462ebcfe..ec030b9d 100644
--- a/nexa/gguf/lib_utils.py
+++ b/nexa/gguf/lib_utils.py
@@ -18,7 +18,6 @@ def is_gpu_available():
 def load_library(lib_base_name: str):
     # Construct the paths to the possible shared library names
     _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
-    logging.debug(f"Base path for libraries: {_base_path}")
     # Searching for the library in the current directory under the name "libllama" (default name
     # for llamacpp) and "llama" (default name for this repo)
     _lib_paths: List[pathlib.Path] = []
@@ -29,18 +28,16 @@ def load_library(lib_base_name: str):
         ]
     elif sys.platform == "darwin":
         _lib_paths += [
-            _base_path / f"lib{lib_base_name}.dylib",
             _base_path / f"lib{lib_base_name}.so",
+            _base_path / f"lib{lib_base_name}.dylib",
         ]
elif sys.platform == "win32": _lib_paths += [ _base_path / f"{lib_base_name}.dll", _base_path / f"lib{lib_base_name}.dll", ] - _add_windows_dll_directories(_base_path) else: raise RuntimeError("Unsupported platform") - logging.debug(f"Possible shared library paths: {_lib_paths}") if "LLAMA_CPP_LIB" in os.environ: lib_base_name = os.environ["LLAMA_CPP_LIB"] @@ -50,19 +47,31 @@ def load_library(lib_base_name: str): cdll_args = dict() # type: ignore + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32": + os.add_dll_directory(str(_base_path)) + os.environ["PATH"] = str(_base_path) + os.pathsep + os.environ["PATH"] + + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + if "HIP_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib")) + cdll_args["winmode"] = ctypes.RTLD_GLOBAL + # Try to load the shared library, handling potential errors for _lib_path in _lib_paths: - logging.debug(f"Trying to load shared library from: {_lib_path}") if _lib_path.exists(): try: - loaded_lib = ctypes.CDLL(str(_lib_path), **cdll_args) # type: ignore - logging.debug(f"Successfully loaded shared library: {_lib_path}") - return loaded_lib + return ctypes.CDLL(str(_lib_path), **cdll_args) # type: ignore except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found in paths: {_lib_paths}" + f"Shared library with base name '{lib_base_name}' not found" ) diff --git a/nexa/gguf/llama/_internals_transformers.py b/nexa/gguf/llama/_internals_transformers.py index a43d84ee..7646563f 100644 --- a/nexa/gguf/llama/_internals_transformers.py +++ b/nexa/gguf/llama/_internals_transformers.py @@ -179,11 +179,11 @@ def token_eot(self) -> int: assert self.model is not None return llama_cpp.llama_token_eot(self.model) - def add_bos_token(self) -> int: + def add_bos_token(self) -> bool: assert self.model is not None return llama_cpp.llama_add_bos_token(self.model) - def add_eos_token(self) -> int: + def add_eos_token(self) -> bool: assert self.model is not None return llama_cpp.llama_add_eos_token(self.model) @@ -343,14 +343,6 @@ def get_state_size(self) -> int: assert self.ctx is not None return llama_cpp.llama_get_state_size(self.ctx) - # TODO: copy_state_data - - # TODO: set_state_data - - # TODO: llama_load_session_file - - # TODO: llama_save_session_file - def decode(self, batch: "_LlamaBatch"): assert self.ctx is not None assert batch.batch is not None @@ -511,7 +503,7 @@ def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: def grammar_accept_token(self, grammar: LlamaGrammar, token: int): assert self.ctx is not None assert grammar.grammar is not None - llama_cpp.llama_grammar_accept_token(self.ctx, grammar.grammar, token) + llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token) def reset_timings(self): assert self.ctx is not None @@ -691,8 +683,8 @@ def _detokenize_bpe(model: _LlamaModel, tokens: List[int]) -> str: def _should_add_bos(model: _LlamaModel) -> bool: assert model.model is not None add_bos = llama_cpp.llama_add_bos_token(model.model) - if add_bos != -1: - return add_bos != 0 + if add_bos: + return add_bos 
else: return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM diff --git a/nexa/gguf/llama/_utils_transformers.py b/nexa/gguf/llama/_utils_transformers.py index 0049e9cc..945c1478 100644 --- a/nexa/gguf/llama/_utils_transformers.py +++ b/nexa/gguf/llama/_utils_transformers.py @@ -17,7 +17,7 @@ class suppress_stdout_stderr(object): sys = sys os = os - def __init__(self, disable: bool = False): + def __init__(self, disable: bool = True): self.disable = disable # Oddly enough this works better than the contextlib version diff --git a/nexa/gguf/llama/llama.py b/nexa/gguf/llama/llama.py index b914701d..fe3bc1a6 100644 --- a/nexa/gguf/llama/llama.py +++ b/nexa/gguf/llama/llama.py @@ -1,27 +1,33 @@ from __future__ import annotations -import contextlib -import ctypes -import multiprocessing import os import sys +import uuid import time +import json +import ctypes import typing -import uuid +import fnmatch import warnings -from collections import deque +import contextlib +import multiprocessing + from typing import ( Any, - Callable, - Deque, - Dict, - Generator, - Iterator, List, + Literal, Optional, - Sequence, Union, + Generator, + Sequence, + Iterator, + Deque, + Callable, + Dict, ) +from collections import deque +from pathlib import Path + import numpy as np import numpy.typing as npt @@ -37,10 +43,9 @@ from nexa.gguf.llama._internals_transformers import _normalize_embedding # type: ignore from nexa.gguf.llama._logger_transformers import set_verbose from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr - -# from nexa.gguf.llama.llama_cache import LlamaCache # type: ignore -# from nexa.gguf.llama.llama_cache import LlamaDiskCache # type: ignore -# from nexa.gguf.llama.llama_cache import LlamaRAMCache # type: ignore +from nexa.gguf.llama.llama_cache import LlamaCache # type: ignore +from nexa.gguf.llama.llama_cache import LlamaDiskCache # type: ignore +from nexa.gguf.llama.llama_cache import LlamaRAMCache # type: ignore from nexa.gguf.llama.llama_cache import BaseLlamaCache from nexa.gguf.llama.llama_grammar import LlamaGrammar from nexa.gguf.llama.llama_speculative import LlamaDraftModel @@ -187,6 +192,7 @@ def __init__( A Llama instance. 
""" self.verbose = verbose + self._stack = contextlib.ExitStack() set_verbose(verbose) @@ -251,28 +257,28 @@ def __init__( for i, (k, v) in enumerate(kv_overrides.items()): self._kv_overrides_array[i].key = k.encode("utf-8") if isinstance(v, bool): - self._kv_overrides_array[ - i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL + self._kv_overrides_array[i].tag = ( + llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL + ) self._kv_overrides_array[i].value.val_bool = v elif isinstance(v, int): - self._kv_overrides_array[ - i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT + self._kv_overrides_array[i].tag = ( + llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT + ) self._kv_overrides_array[i].value.val_i64 = v elif isinstance(v, float): - self._kv_overrides_array[ - i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT + self._kv_overrides_array[i].tag = ( + llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT + ) self._kv_overrides_array[i].value.val_f64 = v elif isinstance(v, str): # type: ignore v_bytes = v.encode("utf-8") if len(v_bytes) > 128: # TODO: Make this a constant raise ValueError(f"Value for {k} is too long: {v}") v_bytes = v_bytes.ljust(128, b"\0") - self._kv_overrides_array[ - i - ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR + self._kv_overrides_array[i].tag = ( + llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR + ) # copy min(v_bytes, 128) to str_value address = typing.cast( int, @@ -288,9 +294,9 @@ def __init__( else: raise ValueError(f"Unknown value type for {k}: {v}") - self._kv_overrides_array[ - -1 - ].key = b"\0" # ensure sentinel element is zeroed + self._kv_overrides_array[-1].key = ( + b"\0" # ensure sentinel element is zeroed + ) self.model_params.kv_overrides = self._kv_overrides_array self.n_batch = min(n_ctx, n_batch) # ??? @@ -354,8 +360,6 @@ def __init__( if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - self._stack = contextlib.ExitStack() - self._model = self._stack.enter_context( contextlib.closing( _LlamaModel( @@ -409,6 +413,15 @@ def __init__( raise RuntimeError( f"Failed to initialize LoRA adapter from lora path: {self.lora_path}" ) + + def free_lora_adapter(): + if self._lora_adapter is None: + return + llama_cpp.llama_lora_adapter_free(self._lora_adapter) + self._lora_adapter = None + + self._stack.callback(free_lora_adapter) + assert self._ctx.ctx is not None if llama_cpp.llama_lora_adapter_set( self._ctx.ctx, self._lora_adapter, self.lora_scale @@ -422,9 +435,9 @@ def __init__( self.chat_format = chat_format self.chat_handler = chat_handler - self._chat_handlers: Dict[ - str, llama_chat_format.LlamaChatCompletionHandler - ] = {} + self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = ( + {} + ) self.draft_model = draft_model @@ -766,11 +779,12 @@ def generate( else: break if longest_prefix > 0: - if self.verbose: - print("Llama.generate: prefix-match hit", file=sys.stderr) reset = False tokens = tokens[longest_prefix:] self.n_tokens = longest_prefix + if self.verbose: + print(f"Llama.generate: {longest_prefix} prefix-match hit, " + f"remaining {len(tokens)} prompt tokens to eval", file=sys.stderr) # Reset the model state if reset: @@ -1046,13 +1060,13 @@ def _create_completion( if ( (isinstance(prompt, list) and suffix is None) - or self._model.add_bos_token() == 0 + or not self._model.add_bos_token() or bos_tokens[:1] == [-1] ): bos_tokens = [] if (isinstance(prompt, list) and suffix is None) or ( - self._model.add_eos_token() != 1 and sep_token_id == -1 + not self._model.add_eos_token() and sep_token_id == -1 ): eos_tokens = [] @@ 
-1511,7 +1525,8 @@ def logit_bias_processor( if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() - print("Llama._create_completion: cache saved", file=sys.stderr) + if self.verbose: + print("Llama._create_completion: cache saved", file=sys.stderr) return if self.cache: @@ -1930,10 +1945,7 @@ def create_chat_completion_openai_v1( stream = kwargs.get("stream", False) # type: ignore assert isinstance(stream, bool) if stream: - return ( - ChatCompletionChunk(**chunk) - for chunk in self.create_chat_completion(*args, **kwargs) - ) # type: ignore + return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore else: return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore except ImportError: @@ -2078,8 +2090,6 @@ def close(self) -> None: self._stack.close() def __del__(self) -> None: - if self._lora_adapter is not None: - llama_cpp.llama_lora_adapter_free(self._lora_adapter) self.close() @staticmethod @@ -2164,4 +2174,4 @@ def __call__( self.prompt_tokens = len(input_ids) if len(input_ids) - self.prompt_tokens < self.min_tokens: scores[self.token_eos] = -np.inf - return scores + return scores \ No newline at end of file diff --git a/nexa/gguf/llama/llama_cache.py b/nexa/gguf/llama/llama_cache.py index e94fb3c7..54f22eb7 100644 --- a/nexa/gguf/llama/llama_cache.py +++ b/nexa/gguf/llama/llama_cache.py @@ -152,4 +152,4 @@ def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): while self.cache_size > self.capacity_bytes and len(self.cache) > 0: key_to_remove = next(iter(self.cache)) del self.cache[key_to_remove] - print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) + print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) \ No newline at end of file diff --git a/nexa/gguf/llama/llama_chat_format.py b/nexa/gguf/llama/llama_chat_format.py index a7e37980..75ff6833 100644 --- a/nexa/gguf/llama/llama_chat_format.py +++ b/nexa/gguf/llama/llama_chat_format.py @@ -3776,4 +3776,4 @@ def chatml_function_calling( }, } - raise ValueError("Automatic streaming tool choice is not supported") + raise ValueError("Automatic streaming tool choice is not supported") \ No newline at end of file diff --git a/nexa/gguf/llama/llama_cpp.py b/nexa/gguf/llama/llama_cpp.py index f88dd69c..51a15455 100644 --- a/nexa/gguf/llama/llama_cpp.py +++ b/nexa/gguf/llama/llama_cpp.py @@ -1,21 +1,24 @@ from __future__ import annotations +import sys +import os import ctypes import functools +import pathlib + from typing import ( - TYPE_CHECKING, Any, Callable, - Generic, List, + Union, NewType, Optional, + TYPE_CHECKING, TypeVar, - Union, + Generic, ) from typing_extensions import TypeAlias - from nexa.gguf.lib_utils import load_library # Specify the base name of the shared library to load @@ -250,6 +253,9 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, # LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21, # LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22, +# LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, +# LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, +# LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -274,6 +280,9 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20 LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21 LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22 +LLAMA_VOCAB_PRE_TYPE_BLOOM = 23 
+LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24 +LLAMA_VOCAB_PRE_TYPE_EXAONE = 25 # // note: these values should be synchronized with ggml_rope @@ -281,13 +290,11 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # enum llama_rope_type { # LLAMA_ROPE_TYPE_NONE = -1, # LLAMA_ROPE_TYPE_NORM = 0, -# LLAMA_ROPE_TYPE_NEOX = 2, -# LLAMA_ROPE_TYPE_GLM = 4, +# LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX, # }; LLAMA_ROPE_TYPE_NONE = -1 LLAMA_ROPE_TYPE_NORM = 0 -LLAMA_ROPE_TYPE_NEOX = 2 -LLAMA_ROPE_TYPE_GLM = 4 +LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 # enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file @@ -880,7 +887,7 @@ class llama_context_params(ctypes.Structure): # int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() # enum llama_ftype ftype; // quantize to this llama_ftype # enum ggml_type output_tensor_type; // output tensor type -# enum ggml_type token_embedding_type; // itoken embeddings tensor type +# enum ggml_type token_embedding_type; // token embeddings tensor type # bool allow_requantize; // allow quantizing non-f32/f16 tensors # bool quantize_output_tensor; // quantize output.weight # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored @@ -896,7 +903,7 @@ class llama_model_quantize_params(ctypes.Structure): nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() ftype (int): quantize to this llama_ftype output_tensor_type (int): output tensor type - token_embedding_type (int): itoken embeddings tensor type + token_embedding_type (int): token embeddings tensor type allow_requantize (bool): allow quantizing non-f32/f16 tensors quantize_output_tensor (bool): quantize output.weight only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored @@ -1125,7 +1132,8 @@ def llama_backend_init(): [ctypes.c_int], None, ) -def llama_numa_init(numa: int, /): ... +def llama_numa_init(numa: int, /): + ... # // Call once at the end of the program - currently only used for MPI @@ -1150,7 +1158,8 @@ def llama_backend_free(): ) def llama_load_model_from_file( path_model: bytes, params: llama_model_params, / -) -> Optional[llama_model_p]: ... +) -> Optional[llama_model_p]: + ... # LLAMA_API void llama_free_model(struct llama_model * model); @@ -1159,7 +1168,8 @@ def llama_load_model_from_file( [llama_model_p_ctypes], None, ) -def llama_free_model(model: llama_model_p, /): ... +def llama_free_model(model: llama_model_p, /): + ... # LLAMA_API struct llama_context * llama_new_context_with_model( @@ -1172,7 +1182,8 @@ def llama_free_model(model: llama_model_p, /): ... ) def llama_new_context_with_model( model: llama_model_p, params: llama_context_params, / -) -> Optional[llama_context_p]: ... +) -> Optional[llama_context_p]: + ... # // Frees all allocated memory @@ -1193,87 +1204,104 @@ def llama_free(ctx: llama_context_p, /): [], ctypes.c_int64, ) -def llama_time_us() -> int: ... +def llama_time_us() -> int: + ... # LLAMA_API size_t llama_max_devices(void); @ctypes_function("llama_max_devices", [], ctypes.c_size_t) -def llama_max_devices() -> int: ... +def llama_max_devices() -> int: + ... # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) -def llama_supports_mmap() -> bool: ... +def llama_supports_mmap() -> bool: + ... 
# LLAMA_API bool llama_supports_mlock (void); @ctypes_function("llama_supports_mlock", [], ctypes.c_bool) -def llama_supports_mlock() -> bool: ... +def llama_supports_mlock() -> bool: + ... # LLAMA_API bool llama_supports_gpu_offload(void); @ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool) -def llama_supports_gpu_offload() -> bool: ... +def llama_supports_gpu_offload() -> bool: + ... # LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes) -def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... +def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: + ... # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_ctx(ctx: llama_context_p, /) -> int: ... +def llama_n_ctx(ctx: llama_context_p, /) -> int: + ... # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_batch(ctx: llama_context_p, /) -> int: ... +def llama_n_batch(ctx: llama_context_p, /) -> int: + ... # LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); @ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_ubatch(ctx: llama_context_p, /) -> int: ... +def llama_n_ubatch(ctx: llama_context_p, /) -> int: + ... # LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); @ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... +def llama_n_seq_max(ctx: llama_context_p, /) -> int: + ... # LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) -def llama_pooling_type(ctx: llama_context_p, /) -> int: ... +def llama_pooling_type(ctx: llama_context_p, /) -> int: + ... # LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); @ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_vocab_type(model: llama_model_p, /) -> int: ... +def llama_vocab_type(model: llama_model_p, /) -> int: + ... # LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); @ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_rope_type(model: llama_model_p, /) -> int: ... +def llama_rope_type(model: llama_model_p, /) -> int: + ... # LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); @ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_vocab(model: llama_model_p, /) -> int: ... +def llama_n_vocab(model: llama_model_p, /) -> int: + ... # LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_ctx_train(model: llama_model_p, /) -> int: ... +def llama_n_ctx_train(model: llama_model_p, /) -> int: + ... # LLAMA_API int32_t llama_n_embd (const struct llama_model * model); @ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_embd(model: llama_model_p, /) -> int: ... +def llama_n_embd(model: llama_model_p, /) -> int: + ... 
# LLAMA_API int32_t llama_n_layer (const struct llama_model * model); @ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_layer(model: llama_model_p, /) -> int: ... +def llama_n_layer(model: llama_model_p, /) -> int: + ... # // Get the model's RoPE frequency scaling factor @@ -1692,9 +1720,7 @@ def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]", /): # @ctypes_function( "llama_kv_cache_view_update", [llama_context_p_ctypes, llama_kv_cache_view_p], None ) -def llama_kv_cache_view_update( - ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], / -): # type: ignore +def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], /): # type: ignore """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)""" ... @@ -2050,7 +2076,8 @@ def llama_state_load_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> bool: ... +) -> bool: + ... # LLAMA_API DEPRECATED(bool llama_load_session_file( @@ -2078,7 +2105,8 @@ def llama_load_session_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> int: ... +) -> int: + ... # LLAMA_API bool llama_state_save_file( @@ -2102,7 +2130,8 @@ def llama_state_save_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> bool: ... +) -> bool: + ... # LLAMA_API DEPRECATED(bool llama_save_session_file( @@ -2127,7 +2156,8 @@ def llama_save_session_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> int: ... +) -> int: + ... # // Get the exact size needed to copy the KV cache of a single sequence @@ -2225,7 +2255,8 @@ def llama_state_seq_save_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> int: ... +) -> int: + ... # LLAMA_API size_t llama_state_seq_load_file( @@ -2255,7 +2286,8 @@ def llama_state_seq_load_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> int: ... +) -> int: + ... # // @@ -2560,7 +2592,8 @@ def llama_get_embeddings_seq( ) def llama_token_get_text( model: llama_model_p, token: Union[llama_token, int], / -) -> bytes: ... +) -> bytes: + ... # LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); @@ -2569,7 +2602,8 @@ def llama_token_get_text( ) def llama_token_get_score( model: llama_model_p, token: Union[llama_token, int], / -) -> float: ... +) -> float: + ... # LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token); @@ -2578,7 +2612,8 @@ def llama_token_get_score( ) def llama_token_get_attr( model: llama_model_p, token: Union[llama_token, int], / -) -> int: ... +) -> int: + ... # // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) @@ -2641,19 +2676,15 @@ def llama_token_nl(model: llama_model_p, /) -> int: ... -# // Returns -1 if unknown, 1 for true or 0 for false. 
-# LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model); -@ctypes_function("llama_add_bos_token", [llama_model_p_ctypes], ctypes.c_int32) -def llama_add_bos_token(model: llama_model_p, /) -> int: - """Returns -1 if unknown, 1 for true or 0 for false.""" +# LLAMA_API bool llama_add_bos_token(const struct llama_model * model); +@ctypes_function("llama_add_bos_token", [llama_model_p_ctypes], ctypes.c_bool) +def llama_add_bos_token(model: llama_model_p, /) -> bool: ... -# // Returns -1 if unknown, 1 for true or 0 for false. -# LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model); -@ctypes_function("llama_add_eos_token", [llama_model_p_ctypes], ctypes.c_int32) -def llama_add_eos_token(model: llama_model_p, /) -> int: - """Returns -1 if unknown, 1 for true or 0 for false.""" +# LLAMA_API bool llama_add_eos_token(const struct llama_model * model); +@ctypes_function("llama_add_eos_token", [llama_model_p_ctypes], ctypes.c_bool) +def llama_add_eos_token(model: llama_model_p, /) -> bool: ... @@ -2667,17 +2698,20 @@ def llama_token_prefix(model: llama_model_p) -> int: # LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle @ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token) -def llama_token_middle(model: llama_model_p, /) -> int: ... +def llama_token_middle(model: llama_model_p, /) -> int: + ... # LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix @ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token) -def llama_token_suffix(model: llama_model_p, /) -> int: ... +def llama_token_suffix(model: llama_model_p, /) -> int: + ... # LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle @ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token) -def llama_token_eot(model: llama_model_p, /) -> int: ... +def llama_token_eot(model: llama_model_p, /) -> int: + ... # // @@ -2878,7 +2912,8 @@ def llama_chat_apply_template( chat: CtypesArray[llama_chat_message], n_msg: int, /, -) -> int: ... +) -> int: + ... # // @@ -2906,7 +2941,7 @@ def llama_grammar_init( n_rules: Union[ctypes.c_size_t, int], start_rule_index: Union[ctypes.c_size_t, int], /, -) -> llama_grammar_p: +) -> Optional[llama_grammar_p]: """Initialize a grammar from a set of rules.""" ... @@ -3545,4 +3580,5 @@ def llama_log_set( [ctypes.c_void_p, llama_context_p_ctypes], None, ) -def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /): ... +def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /): + ... 
\ No newline at end of file diff --git a/nexa/gguf/llama/llama_grammar.py b/nexa/gguf/llama/llama_grammar.py index 26618984..2fc20d05 100644 --- a/nexa/gguf/llama/llama_grammar.py +++ b/nexa/gguf/llama/llama_grammar.py @@ -3,518 +3,100 @@ # flake8: noqa from pathlib import Path import sys -from ctypes import * # type: ignore -from enum import Enum -from itertools import islice, groupby +import ctypes +import enum +import typing +import dataclasses + +from itertools import groupby from typing import ( Any, - Callable, - Dict, Set, - Generic, List, Optional, - OrderedDict, - TextIO, Tuple, - TypeVar, Union, - overload, ) import nexa.gguf.llama.llama_cpp as llama_cpp -# Type aliases -llama_grammar_element = llama_cpp.llama_grammar_element -llama_grammar_element_p = llama_cpp.llama_grammar_element_p -llama_grammar_p = llama_cpp.llama_grammar_p - -# Type variables -Ptr = TypeVar("Ptr", bound="const_char_p") -T = TypeVar("T") -U = TypeVar("U") -V = TypeVar("V") -W = TypeVar("W") - - -class Sentinel: - """Used to mark the end of a iterator of std::vector & std::map.""" - - -class LlamaGrammar: - """Keeps reference counts of all the arguments, so that they are not - garbage collected by Python.""" - - def __del__(self) -> None: - """Free the grammar pointer when the object is deleted.""" - if self.grammar is not None: - llama_cpp.llama_grammar_free(self.grammar) - self.grammar = None - - def __init__( - self, - parsed_grammar: "parse_state", - ) -> None: - """Initialize the grammar pointer from the parsed state.""" - self._grammar_rules = ( - parsed_grammar.c_rules() - ) # type: std.vector[std.vector[LlamaGrammarElement]] - self._n_rules = self._grammar_rules.size() # type: int - self._start_rule_index = parsed_grammar.symbol_ids.at("root") # type: int - self.init() - - @classmethod - def from_string(cls, grammar: str, verbose: bool = True) -> "LlamaGrammar": - """Convert a GBNF grammar to a Llama grammar.""" - parsed_grammar = parse(const_char_p(grammar)) # type: parse_state - if parsed_grammar.rules.empty(): - raise ValueError( - f"{cls.from_string.__name__}: error parsing grammar file: parsed_grammar.rules is empty" - ) - if verbose: - print(f"{cls.from_string.__name__} grammar:", file=sys.stderr) - print_grammar(sys.stderr, parsed_grammar) - print(file=sys.stderr) - return cls(parsed_grammar) - - @classmethod - def from_json_schema( - cls, - json_schema: str, - verbose: bool = True, - ) -> "LlamaGrammar": - """Convert a JSON schema to a Llama grammar.""" - return cls.from_string(json_schema_to_gbnf(json_schema), verbose=verbose) - - @classmethod - def from_file(cls, file: Union[str, Path], verbose: bool = True) -> "LlamaGrammar": - try: - with open(file) as f: - grammar = f.read() - except Exception as err: - raise Exception( - f"{cls.from_file.__name__}: error reading grammar file: {err}" - ) - - if grammar: - return cls.from_string(grammar, verbose=verbose) - - raise ValueError( - f"{cls.from_file.__name__}: error parsing grammar file: params_grammer is empty" - ) - - def init(self) -> None: - # Step 1: Convert LlamaGrammarElement to llama_grammar_element - self._element_lists = [ - [ - llama_grammar_element(c_int(elem.type.value), c_uint32(elem.value)) - for elem in subvector - ] - for subvector in self._grammar_rules - ] # type: List[List[llama_grammar_element]] - - # Step 2: Convert each list to llama_grammar_element array and get pointer - self._element_arrays = [ - (llama_grammar_element * len(sublist))(*sublist) - for sublist in self._element_lists - ] # type: 
List[Array[llama_grammar_element]] - - # Step 3: Get pointer of each array - self._element_array_pointers = [ - cast(subarray, llama_grammar_element_p) for subarray in self._element_arrays - ] # type: List[llama_grammar_element_p] - - # Step 4: Make array of these pointers and get its pointer - self._rules = (llama_grammar_element_p * len(self._element_array_pointers))( - *self._element_array_pointers - ) - self.grammar = llama_cpp.llama_grammar_init( - self._rules, c_size_t(self._n_rules), c_size_t(self._start_rule_index) - ) - - def reset(self) -> None: - if self.grammar is not None: - llama_cpp.llama_grammar_free(self.grammar) - self.init() - - -class LlamaGrammarElement: - def __init__(self, type: "llama_gretype", value: int): - self.type = type - self.value = value # Unicode code point or rule ID +class GrammarElementType(enum.IntEnum): + END = llama_cpp.LLAMA_GRETYPE_END + ALT = llama_cpp.LLAMA_GRETYPE_ALT + RULE_REF = llama_cpp.LLAMA_GRETYPE_RULE_REF + CHAR = llama_cpp.LLAMA_GRETYPE_CHAR + CHAR_NOT = llama_cpp.LLAMA_GRETYPE_CHAR_NOT + CHAR_RNG_UPPER = llama_cpp.LLAMA_GRETYPE_CHAR_RNG_UPPER + CHAR_ALT = llama_cpp.LLAMA_GRETYPE_CHAR_ALT + CHAR_ANY = llama_cpp.LLAMA_GRETYPE_CHAR_ANY -class const_char_p: - """C++ implementation of const char *.""" +@dataclasses.dataclass +class GrammarElement: + type: GrammarElementType + value: int - def __init__(self, value: Union[str, Ptr], move: Optional[int] = None): - if isinstance(value, const_char_p): - # We're copying an existing const_char_p - self.value = value.value - self.pos = value.pos + (move or 0) - return - # We're creating a new const_char_p - self.value = value - self.pos = move or 0 +@dataclasses.dataclass +class ParseState: + symbol_ids: typing.Dict[str, int] = dataclasses.field(default_factory=dict) + rules: typing.List[typing.List[GrammarElement]] = dataclasses.field(default_factory=list) - def __str__(self) -> str: - assert self.value is not None, "null pointer" - return self.value[self.pos :] - - def __getitem__(self, index: int) -> str: - value = str(self) - return value[index] if index < len(value) else "" - - @overload - def __add__(self: Ptr, other: int) -> Ptr: ... - - @overload - def __add__(self: Ptr, other: Ptr) -> int: ... - - def __add__(self: Ptr, other: Union[int, Ptr]) -> Union[int, Ptr]: - return ( - self.__class__(self.value, self.pos + other) - if isinstance(other, int) - else self.pos + other.pos - ) - - @overload - def __sub__(self: Ptr, other: int) -> Ptr: ... - - @overload - def __sub__(self: Ptr, other: Ptr) -> int: ... 
- - def __sub__(self: Ptr, other: Union[int, Ptr]) -> Union[int, Ptr]: - return ( - self.__class__(self.value, self.pos - other) - if isinstance(other, int) - else self.pos - other.pos - ) - - def __eq__(self: Ptr, other: Ptr) -> bool: - assert self.value == other.value, "comparing pointers from different strings" - return self.pos == other.pos - - def __lt__(self: Ptr, other: Ptr) -> bool: - assert self.value == other.value, "comparing pointers from different strings" - return self.pos < other.pos - - def __gt__(self: Ptr, other: Ptr) -> bool: - assert self.value == other.value, "comparing pointers from different strings" - return self.pos > other.pos - - -class std: - @staticmethod - def string(ptr: const_char_p, length: Optional[int] = None) -> str: - """C++ implementation of std::string constructor.""" - value = str(ptr) - if length is not None: - value = value[:length] - return value - - class vector(Generic[T], List[T]): - """C++ implementation of std::vector.""" - - class iterator: - def __init__(self, vector: "std.vector[T]", index: int): - self._vector = vector - self._index = index - self._version = vector._version - - def _check_version(self): - if self._version != self._vector._version: - raise RuntimeError("Iterator used after vector was modified.") - - def __iter__(self): - return self - - def __next__(self) -> T: - self._check_version() - if self._index >= self._vector.size(): - raise StopIteration - value = self._vector[self._index] - self._index += 1 - return value - - def __add__(self, value: int) -> "std.vector[T].iterator": - return self.__class__(self._vector, self._index + value) - - def __sub__(self, value: int) -> "std.vector[T].iterator": - return self.__class__(self._vector, self._index - value) - - def __init__(self): - self._version = 0 - - def modify(self): - # This is a bit of a hack to make sure iterators are invalidated - self._version += 1 - - def push_back(self, value: T) -> None: - self.modify() - self.append(value) - - def pop_back(self) -> None: - self.modify() - if not self.empty(): - self.pop() - - def back(self) -> T: - return self[-1] - - def size(self) -> int: - return len(self) - - def clear(self) -> None: - self.modify() - super().clear() - - def empty(self) -> bool: - return self.size() == 0 - - def data(self) -> "std.vector[T]": - return self - - def resize( - self, - new_size: int, - fill_value_factory: Optional[Callable[[], T]] = None, - ) -> None: - if new_size > self.size(): - if fill_value_factory is None: - raise ValueError("A fill value factory function must be provided.") - self.reserve(new_size, fill_value_factory) - elif new_size < self.size(): - self[:] = self[:new_size] - - def reserve(self, capacity: int, fill_value_factory: Callable[[], T]) -> None: - if capacity > self.size(): - fill_value = fill_value_factory() - self.extend([fill_value] * (capacity - self.size())) - - def front(self) -> T: - if not self.empty(): - return self[0] - else: - raise IndexError("Vector is empty.") - - def assign(self, count: int, value: T) -> None: - self.clear() - self.extend([value] * count) - - def insert( - self, - pos: "std.vector[T].iterator", - first: "std.vector[T].iterator", - last: "std.vector[T].iterator", - ) -> None: - self[pos._index : pos._index] = list( - islice(first._vector, first._index, last._index) - ) - - def begin(self) -> "std.vector[T].iterator": - return self.iterator(self, 0) - - def end(self) -> "std.vector[T].iterator": - return self.iterator(self, self.size()) - - class map(Generic[T, U], OrderedDict[T, U]): - """C++ 
implementation of std::map.""" - - class iterator(Generic[V, W]): - def __init__(self, _map: "std.map[T, U]", key: Union[T, Sentinel]): - self._map = _map - self.iter = iter(_map) - self.key = key - self._advance() - - def _sanitize_key(self) -> T: - if isinstance(self.key, Sentinel): - raise StopIteration - return self.key - - def _advance(self) -> None: - try: - while next(self.iter) != self.key: - pass - except StopIteration: - self.key = Sentinel() - - def __next__(self) -> Tuple[T, U]: - key = self._sanitize_key() - if key in self._map: - value = self._map[key] - self._advance() - return key, value - else: - raise StopIteration - - def get(self) -> Tuple[T, U]: - key = self._sanitize_key() - return key, self._map[key] - - @property - def first(self) -> T: - return self._sanitize_key() - - @property - def second(self) -> U: - return self._map[self._sanitize_key()] - - def insert( - self, key: T, value: U - ) -> Tuple["std.map[T, U].iterator[T, U]", bool]: - if key in self: - return self.iterator(self, key), False - else: - self[key] = value - return self.iterator(self, key), True - - def find(self, key: T) -> "std.map[T, U].iterator[T, U]": - if key in self: - return self.iterator(self, key) - else: - return self.end() - - def at(self, key: T) -> U: - if key in self: - return self[key] - else: - raise KeyError("The provided key is not found in the map.") - - def erase(self, iterator: "std.map[T, U].iterator[T, U]") -> None: - key = iterator.first - if key in self: - del self[key] - - def size(self) -> int: - return len(self) - - def empty(self) -> bool: - return self.size() == 0 - - def lower_bound(self, key: T) -> "std.map[T, U].iterator[T, U]": - try: - keys = sorted(list(self.keys())) # type: ignore - for k in keys: - if k >= key: - return self.iterator(self, k) - raise ValueError("No key found that is not less than the input key") - except TypeError: - raise TypeError("Keys of type T cannot be sorted.") - - def begin(self) -> "std.map[T, U].iterator[T, U]": - return self.iterator(self, next(iter(self))) - - def end(self) -> "std.map[T, U].iterator[T, U]": - return self.iterator(self, Sentinel()) - - -# // grammar element type -# enum llama_gretype { -# // end of rule definition -# LLAMA_GRETYPE_END = 0, - -# // start of alternate definition for rule -# LLAMA_GRETYPE_ALT = 1, - -# // non-terminal element: reference to rule -# LLAMA_GRETYPE_RULE_REF = 2, - -# // terminal element: character (code point) -# LLAMA_GRETYPE_CHAR = 3, - -# // inverse char(s) ([^a], [^a-b] [^abc]) -# LLAMA_GRETYPE_CHAR_NOT = 4, - -# // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to -# // be an inclusive range ([a-z]) -# LLAMA_GRETYPE_CHAR_RNG_UPPER = 5, - - -# // modifies a preceding LLAMA_GRETYPE_CHAR or -# // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) -# LLAMA_GRETYPE_CHAR_ALT = 6, -# }; -class llama_gretype(Enum): - """grammar element type""" - - LLAMA_GRETYPE_END = 0 # end of rule definition - LLAMA_GRETYPE_ALT = 1 # start of alternate definition for rule - LLAMA_GRETYPE_RULE_REF = 2 # non-terminal element: reference to rule - LLAMA_GRETYPE_CHAR = 3 # terminal element: character (code point) - LLAMA_GRETYPE_CHAR_NOT = 4 # inverse char(s) ([^a], [^a-b] [^abc]) - LLAMA_GRETYPE_CHAR_RNG_UPPER = 5 # modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to be an inclusive range ([a-z]) - LLAMA_GRETYPE_CHAR_ALT = 6 # modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) - - 
-# struct parse_state {
-#     std::map<std::string, uint32_t> symbol_ids;
-#     std::vector<std::vector<llama_grammar_element>> rules;
-#     std::vector<const llama_grammar_element *> c_rules();
-# };
-class parse_state:
-    def __init__(self):
-        self.symbol_ids: std.map[str, int] = std.map()
-        self.rules: std.vector[std.vector[LlamaGrammarElement]] = std.vector()
-
-    # std::vector<const llama_grammar_element *> parse_state::c_rules() {
-    #     std::vector<const llama_grammar_element *> ret;
-    #     for (const auto & rule : rules) {
-    #         ret.push_back(rule.data());
-    #     }
-    #     return ret;
-    # }
-    def c_rules(self) -> std.vector[std.vector[LlamaGrammarElement]]:
-        ret = std.vector()  # type: std.vector[std.vector[LlamaGrammarElement]]
-        for rule in self.rules:
-            ret.push_back(rule.data())
-        return ret
-
-    def __repr__(self) -> str:
-        return (
-            f"parse_state(symbol_ids={len(self.symbol_ids)}, rules={len(self.rules)})"
-        )
+# static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
+#     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+#     uint8_t first_byte = static_cast<uint8_t>(*src);
+#     uint8_t highbits = first_byte >> 4;
+#     int len = lookup[highbits];
+#     uint8_t mask = (1 << (8 - len)) - 1;
+#     uint32_t value = first_byte & mask;
+#     const char * end = src + len; // may overrun!
+#     const char * pos = src + 1;
+#     for ( ; pos < end && *pos; pos++) {
+#         value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+#     }
+#     return std::make_pair(value, pos);
+# }
+def decode_utf8(src: str) -> typing.Tuple[int, str]:
+    lookup: list[int] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4]
+    first_byte: int = ord(src[0])
+    highbits: int = first_byte >> 4
+    length: int = lookup[highbits]
+    mask: int = (1 << (8 - length)) - 1
+    value: int = first_byte & mask
+    end: int = min(len(src), length)  # Prevent overrun
+
+    pos: int = 1
+    for pos in range(1, end):
+        if not src[pos]:
+            break
+        value = (value << 6) + (ord(src[pos]) & 0x3F)

-# struct llama_grammar {
-#     const std::vector<std::vector<llama_grammar_element>> rules;
-#     std::vector<std::vector<const llama_grammar_element *>> stacks;
-# };
-# class llama_grammar:
-#     def __init__(
-#         self,
-#         rules: std.vector[std.vector[llama_grammar_element]],
-#         stacks: std.vector[std.vector[llama_grammar_element]],
-#     ):
-#         self.rules = rules
-#         self.stacks = stacks

+    return value, src[pos:] if pos < len(src) else ""

-# uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
+# static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
 #     uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-#     auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
+#     auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
 #     return result.first->second;
 # }
-def get_symbol_id(state: parse_state, src: const_char_p, len: int) -> int:
-    next_id = state.symbol_ids.size()  # type: int
-    result = state.symbol_ids.insert(std.string(src, len), next_id)
-    return result[0].second  # type: ignore
+def get_symbol_id(state: ParseState, name: str) -> int:
+    next_id = len(state.symbol_ids)
+    return state.symbol_ids.setdefault(name, next_id)

-# uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
+# static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
 #     uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
 #     state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
 #     return next_id;
 # }
-def generate_symbol_id(state: parse_state, base_name: str) -> int:
-    next_id = state.symbol_ids.size()  # type: int
-    state.symbol_ids[base_name + "_" + str(next_id)] = next_id
+def generate_symbol_id(state: ParseState, base_name: str) -> int:
+    next_id = len(state.symbol_ids)
+    state.symbol_ids[f"{base_name}_{next_id}"] = next_id
     return next_id

-# void add_rule(
+# static void add_rule(
 #     parse_state & state,
 #     uint32_t rule_id,
 #     const std::vector<llama_grammar_element> & rule) {
@@ -523,51 +105,27 @@ def generate_symbol_id(state: parse_state, base_name: str) -> int:
 #     }
 #     state.rules[rule_id] = rule;
 # }
-def add_rule(
-    state: parse_state,
-    rule_id: int,
-    rule: std.vector[LlamaGrammarElement],
-) -> None:
-    if state.rules.size() <= rule_id:
-        state.rules.resize(
-            rule_id + 1,
-            fill_value_factory=std.vector[LlamaGrammarElement],
-        )
+def add_rule(state: ParseState, rule_id: int, rule: typing.List[GrammarElement]) -> None:
+    if len(state.rules) <= rule_id:
+        state.rules.extend([[]] * (rule_id + 1 - len(state.rules)))
     state.rules[rule_id] = rule

-# std::pair<uint32_t, const char *> decode_utf8(const char * src) {
-#     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-#     uint8_t first_byte = static_cast<uint8_t>(*src);
-#     uint8_t highbits = first_byte >> 4;
-#     int len = lookup[highbits];
-#     uint8_t mask = (1 << (8 - len)) - 1;
-#     uint32_t value = first_byte & mask;
-#     const char * end = src + len; // may overrun!
-#     const char * pos = src + 1;
-#     for ( ; pos < end && *pos; pos++) {
-#         value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
-#     }
-#     return std::make_pair(value, pos);
+# static bool is_digit_char(char c) {
+#     return '0' <= c && c <= '9';
 # }
-def decode_utf8(src: const_char_p) -> Tuple[int, const_char_p]:
-    """Decodes a UTF-8 character from the source string."""
-    # Get the codepoint of the first character
-    value = ord(src[0])
-    # Move the pointer ahead one character
-    pos = src + 1
-
-    return value, pos
+def is_digit_char(c: str) -> bool:
+    return "0" <= c <= "9"

-# bool is_word_char(char c) {
-#     return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+# static bool is_word_char(char c) {
+#     return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
 # }
 def is_word_char(c: str) -> bool:
-    return ("a" <= c <= "z") or ("A" <= c <= "Z") or c == "-" or ("0" <= c <= "9")
+    return ("a" <= c <= "z") or ("A" <= c <= "Z") or c == "-" or is_digit_char(c)

-# std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
+# static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
 #     const char * pos = src;
 #     const char * end = src + size;
 #     uint32_t value = 0;
@@ -589,13 +147,12 @@ def is_word_char(c: str) -> bool:
 #     }
 #     return std::make_pair(value, pos);
 # }
-def parse_hex(src: const_char_p, size: int) -> Tuple[int, const_char_p]:
-    pos = const_char_p(src)  # type: const_char_p
-    end = src + size  # type: const_char_p
-    value = 0  # type: int
-    while pos < end and pos[0]:
+def parse_hex(src: str, size: int) -> typing.Tuple[int, str]:
+    pos = 0
+    value = 0
+    for _ in range(size):
         value <<= 4
-        c = pos[0]  # type: str
+        c = src[pos]
         if "a" <= c <= "f":
             value += ord(c) - ord("a") + 10
         elif "A" <= c <= "F":
@@ -605,12 +162,74 @@
         else:
             break
         pos += 1
-    if pos != end:
-        raise RuntimeError("expecting " + str(size) + " hex chars at " + str(src))
-    return (value, pos)
+    if pos != size:
+        raise ValueError(f"expecting {size} hex chars at {src}")
+    return value, src[pos:]

-# std::pair<uint32_t, const char *> parse_char(const char * src) {
+# static const char * parse_space(const char * src, bool newline_ok) {
+#     const char * pos = src;
+#     while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
+#            (newline_ok && (*pos == '\r' || *pos == '\n'))) {
+#         if (*pos == '#') {
+#             while (*pos && *pos != '\r' &&
*pos != '\n') { +# pos++; +# } +# } else { +# pos++; +# } +# } +# return pos; +# } +def parse_space(src: str, newline_ok: bool) -> str: + pos = src + while pos and (pos[0] in (' ', '\t', '#') or (newline_ok and pos[0] in ('\r', '\n'))): + if pos[0] == "#": + while pos and pos[0] not in ("\r", "\n"): + pos = pos[1:] + else: + pos = pos[1:] + return pos + + +# static const char * parse_name(const char * src) { +# const char * pos = src; +# while (is_word_char(*pos)) { +# pos++; +# } +# if (pos == src) { +# throw std::runtime_error(std::string("expecting name at ") + src); +# } +# return pos; +# } +def parse_name(src: str) -> typing.Tuple[str, str]: + pos = src + while pos and is_word_char(pos[0]): + pos = pos[1:] + if pos == src: + raise ValueError(f"expecting name at {src}") + return src[:len(src) - len(pos)], pos + +# static const char * parse_int(const char * src) { +# const char * pos = src; +# while (is_digit_char(*pos)) { +# pos++; +# } +# if (pos == src) { +# throw std::runtime_error(std::string("expecting integer at ") + src); +# } +# return pos; +# } +def parse_int(src: str) -> typing.Tuple[int, str]: + pos = src + while pos and is_digit_char(pos[0]): + pos = pos[1:] + if pos == src: + raise ValueError(f"expecting integer at {src}") + return int(src[:len(src) - len(pos)]), pos + + +# static std::pair parse_char(const char * src) { # if (*src == '\\') { # switch (src[1]) { # case 'x': return parse_hex(src + 2, 2); @@ -632,273 +251,320 @@ def parse_hex(src: const_char_p, size: int) -> Tuple[int, const_char_p]: # } # throw std::runtime_error("unexpected end of input"); # } -def parse_char(src: const_char_p) -> Tuple[int, const_char_p]: +def parse_char(src: str) -> typing.Tuple[int, str]: + if not src: + raise ValueError("unexpected end of input") if src[0] == "\\": - case = src[1] # type: str - if case == "x": - return parse_hex(src + 2, 2) - elif case == "u": - return parse_hex(src + 2, 4) - elif case == "U": - return parse_hex(src + 2, 8) - elif case == "t": - return (ord("\t"), src + 2) # implicit cast - elif case == "r": - return (ord("\r"), src + 2) # implicit cast - elif case == "n": - return (ord("\n"), src + 2) # implicit cast - elif case in ("\\", '"', "[", "]"): - return (ord(case), src + 2) # implicit cast + if src[1] == "x": + return parse_hex(src[2:], 2) + elif src[1] == "u": + return parse_hex(src[2:], 4) + elif src[1] == "U": + return parse_hex(src[2:], 8) + elif src[1] == "t": + return ord("\t"), src[2:] + elif src[1] == "r": + return ord("\r"), src[2:] + elif src[1] == "n": + return ord("\n"), src[2:] + elif src[1] in ('\\', '"', '[', ']'): + return ord(src[1]), src[2:] else: - raise RuntimeError("unknown escape at " + str(src)) - elif src[0]: - return decode_utf8(src) - else: - raise RuntimeError("unexpected end of input") + raise ValueError(f"unknown escape at {src}") + return decode_utf8(src) - -# const char * parse_name(const char * src) { -# const char * pos = src; -# while (is_word_char(*pos)) { -# pos++; -# } -# if (pos == src) { -# throw std::runtime_error(std::string("expecting name at ") + src); -# } -# return pos; -# } -def parse_name(src: const_char_p) -> const_char_p: - pos = const_char_p(src) # type: const_char_p - while is_word_char(pos[0]): - pos += 1 - if pos == src: - raise RuntimeError("expecting name at " + str(src)) - return pos - - -# const char * parse_space(const char * src, bool newline_ok) { +# static const char * parse_sequence( +# parse_state & state, +# const char * src, +# const std::string & rule_name, +# std::vector & out_elements, +# 
bool is_nested) { +# size_t last_sym_start = out_elements.size(); # const char * pos = src; -# while (*pos == ' ' || *pos == '\t' || *pos == '#' || -# (newline_ok && (*pos == '\r' || *pos == '\n'))) { -# if (*pos == '#') { -# while (*pos && *pos != '\r' && *pos != '\n') { +# +# auto handle_repetitions = [&](int min_times, int max_times) { +# +# if (last_sym_start == out_elements.size()) { +# throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos); +# } +# +# // apply transformation to previous symbol (last_sym_start to end) according to +# // the following rewrite rules: +# // S{m,n} --> S S S (m times) S'(n-m) +# // S'(x) ::= S S'(x-1) | +# // (... n-m definitions of these S' rules ...) +# // S'(1) ::= S | +# // S{m,} --> S S S (m times) S' +# // S' ::= S S' | +# // S* --> S{0,} +# // --> S' ::= S S' | +# // S+ --> S{1,} +# // --> S S' +# // S' ::= S S' | +# // S? --> S{0,1} +# // --> S' +# // S' ::= S | +# +# std::vector previous_elements(out_elements.begin() + last_sym_start, out_elements.end()); +# if (min_times == 0) { +# out_elements.resize(last_sym_start); +# } else { +# // Repeat the previous elements (min_times - 1) times +# for (int i = 1; i < min_times; i++) { +# out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end()); +# } +# } +# +# uint32_t last_rec_rule_id = 0; +# auto n_opt = max_times < 0 ? 1 : max_times - min_times; +# +# std::vector rec_rule(previous_elements); +# for (int i = 0; i < n_opt; i++) { +# rec_rule.resize(previous_elements.size()); +# uint32_t rec_rule_id = generate_symbol_id(state, rule_name); +# if (i > 0 || max_times < 0) { +# rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id}); +# } +# rec_rule.push_back({LLAMA_GRETYPE_ALT, 0}); +# rec_rule.push_back({LLAMA_GRETYPE_END, 0}); +# add_rule(state, rec_rule_id, rec_rule); +# last_rec_rule_id = rec_rule_id; +# } +# if (n_opt > 0) { +# out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); +# } +# }; +# +# while (*pos) { +# if (*pos == '"') { // literal string +# pos++; +# last_sym_start = out_elements.size(); +# while (*pos != '"') { +# if (!*pos) { +# throw std::runtime_error("unexpected end of input"); +# } +# auto char_pair = parse_char(pos); +# pos = char_pair.second; +# out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); +# } +# pos = parse_space(pos + 1, is_nested); +# } else if (*pos == '[') { // char range(s) +# pos++; +# enum llama_gretype start_type = LLAMA_GRETYPE_CHAR; +# if (*pos == '^') { # pos++; +# start_type = LLAMA_GRETYPE_CHAR_NOT; +# } +# last_sym_start = out_elements.size(); +# while (*pos != ']') { +# if (!*pos) { +# throw std::runtime_error("unexpected end of input"); +# } +# auto char_pair = parse_char(pos); +# pos = char_pair.second; +# enum llama_gretype type = last_sym_start < out_elements.size() +# ? 
LLAMA_GRETYPE_CHAR_ALT +# : start_type; +# +# out_elements.push_back({type, char_pair.first}); +# if (pos[0] == '-' && pos[1] != ']') { +# if (!pos[1]) { +# throw std::runtime_error("unexpected end of input"); +# } +# auto endchar_pair = parse_char(pos + 1); +# pos = endchar_pair.second; +# out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); +# } +# } +# pos = parse_space(pos + 1, is_nested); +# } else if (is_word_char(*pos)) { // rule reference +# const char * name_end = parse_name(pos); +# uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos); +# pos = parse_space(name_end, is_nested); +# last_sym_start = out_elements.size(); +# out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); +# } else if (*pos == '(') { // grouping +# // parse nested alternates into synthesized rule +# pos = parse_space(pos + 1, true); +# uint32_t sub_rule_id = generate_symbol_id(state, rule_name); +# pos = parse_alternates(state, pos, rule_name, sub_rule_id, true); +# last_sym_start = out_elements.size(); +# // output reference to synthesized rule +# out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); +# if (*pos != ')') { +# throw std::runtime_error(std::string("expecting ')' at ") + pos); +# } +# pos = parse_space(pos + 1, is_nested); +# } else if (*pos == '.') { // any char +# last_sym_start = out_elements.size(); +# out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); +# pos = parse_space(pos + 1, is_nested); +# } else if (*pos == '*') { +# pos = parse_space(pos + 1, is_nested); +# handle_repetitions(0, -1); +# } else if (*pos == '+') { +# pos = parse_space(pos + 1, is_nested); +# handle_repetitions(1, -1); +# } else if (*pos == '?') { +# pos = parse_space(pos + 1, is_nested); +# handle_repetitions(0, 1); +# } else if (*pos == '{') { +# pos = parse_space(pos + 1, is_nested); +# +# if (!is_digit_char(*pos)) { +# throw std::runtime_error(std::string("expecting an int at ") + pos); +# } +# const char * int_end = parse_int(pos); +# int min_times = std::stoul(std::string(pos, int_end - pos)); +# pos = parse_space(int_end, is_nested); +# +# int max_times = -1; +# +# if (*pos == '}') { +# max_times = min_times; +# pos = parse_space(pos + 1, is_nested); +# } else if (*pos == ',') { +# pos = parse_space(pos + 1, is_nested); +# +# if (is_digit_char(*pos)) { +# const char * int_end = parse_int(pos); +# max_times = std::stoul(std::string(pos, int_end - pos)); +# pos = parse_space(int_end, is_nested); +# } +# +# if (*pos != '}') { +# throw std::runtime_error(std::string("expecting '}' at ") + pos); +# } +# pos = parse_space(pos + 1, is_nested); +# } else { +# throw std::runtime_error(std::string("expecting ',' at ") + pos); # } +# handle_repetitions(min_times, max_times); # } else { -# pos++; +# break; # } # } # return pos; # } -def parse_space(src: const_char_p, newline_ok: bool) -> const_char_p: - pos = const_char_p(src) # type: const_char_p - while pos[0] in (" ", "\t", "#") or (newline_ok and pos[0] in ("\r", "\n")): - if pos[0] == "#": - while pos[0] is not None and pos[0] not in ("\r", "\n"): - pos += 1 - else: - pos += 1 - return pos +def parse_sequence(state: ParseState, src: str, rule_name: str, out_elements: typing.List[GrammarElement], is_nested: bool) -> str: + last_sym_start = len(out_elements) + pos = src + def handle_repetitions(min_times: int, max_times: int) -> None: + nonlocal state, src, rule_name, out_elements, is_nested, last_sym_start, pos -# const char * parse_sequence( -# parse_state & state, -# const char * src, -# const std::string & 
-#     std::vector<llama_grammar_element> & out_elements,
-#     bool                                 is_nested) {
-def parse_sequence(
-    state: parse_state,
-    src: const_char_p,
-    rule_name: str,
-    out_elements: std.vector[LlamaGrammarElement],
-    is_nested: bool,
-) -> const_char_p:
-    # size_t last_sym_start = out_elements.size();
-    # const char * pos = src;
-    last_sym_start = out_elements.size()  # type: int
-    pos = const_char_p(src)  # type: const_char_p
-    # while (*pos) {
-    while pos[0]:
-        # if (*pos == '"') { // literal string
-        #     pos++;
-        #     last_sym_start = out_elements.size();
-        #     while (*pos != '"') {
-        #         auto char_pair = parse_char(pos);
-        #         pos = char_pair.second;
-        #         out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
-        #     }
-        #     pos = parse_space(pos + 1, is_nested);
-        if pos[0] == '"':  # literal string
-            pos += 1
-            last_sym_start = out_elements.size()
-            while pos[0] != '"':
-                char_pair = parse_char(pos)  # type: Tuple[int, const_char_p]
-                pos = char_pair[1]
-                out_elements.push_back(
-                    LlamaGrammarElement(llama_gretype.LLAMA_GRETYPE_CHAR, char_pair[0])
-                )
-            pos = parse_space(pos + 1, is_nested)
-        # } else if (*pos == '[') { // char range(s)
-        #     pos++;
-        #     enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
-        elif pos[0] == "[":  # char range(s)
-            pos += 1
-            start_type = llama_gretype.LLAMA_GRETYPE_CHAR  # type: llama_gretype
-            # if (*pos == '^') {
-            #     pos++;
-            #     start_type = LLAMA_GRETYPE_CHAR_NOT;
-            # }
-            # last_sym_start = out_elements.size();
+        if last_sym_start == len(out_elements):
+            raise ValueError(f"expecting preceding item to */+/?/{{ at {pos}")
+
+        previous_elements = out_elements[last_sym_start:]
+        if min_times == 0:
+            del out_elements[last_sym_start:]
+        else:
+            for i in range(1, min_times):
+                out_elements.extend(previous_elements)
+
+        last_rec_rule_id = 0
+        n_opt = 1 if max_times < 0 else max_times - min_times
+
+        rec_rule = previous_elements[:]
+        for i in range(n_opt):
+            rec_rule = rec_rule[:len(previous_elements)]
+            rec_rule_id = generate_symbol_id(state, rule_name)
+            if i > 0 or max_times < 0:
+                rec_rule.append(GrammarElement(GrammarElementType.RULE_REF, rec_rule_id if max_times < 0 else last_rec_rule_id))
+            rec_rule.append(GrammarElement(GrammarElementType.ALT, 0))
+            rec_rule.append(GrammarElement(GrammarElementType.END, 0))
+            add_rule(state, rec_rule_id, rec_rule)
+            last_rec_rule_id = rec_rule_id
+        if n_opt > 0:
+            out_elements.append(GrammarElement(GrammarElementType.RULE_REF, last_rec_rule_id))
+
+    while pos:
+        if pos[0] == '"':
+            pos = pos[1:]
+            last_sym_start = len(out_elements)
+            while not pos.startswith('"'):
+                if not pos:
+                    raise ValueError("unexpected end of input")
+                char, pos = parse_char(pos)
+                out_elements.append(GrammarElement(GrammarElementType.CHAR, char))
+            pos = parse_space(pos[1:], is_nested)
+        elif pos[0] == "[":
+            pos = pos[1:]
+            start_type = GrammarElementType.CHAR
             if pos[0] == "^":
-                pos += 1
-                start_type = llama_gretype.LLAMA_GRETYPE_CHAR_NOT
-            last_sym_start = out_elements.size()
-            # while (*pos != ']') {
-            #     auto char_pair = parse_char(pos);
-            #     pos = char_pair.second;
-            #     enum llama_gretype type = last_sym_start < out_elements.size()
-            #         ? LLAMA_GRETYPE_CHAR_ALT
-            #         : start_type;
-            #     out_elements.push_back({type, char_pair.first});
+                pos = pos[1:]
+                start_type = GrammarElementType.CHAR_NOT
+            last_sym_start = len(out_elements)
             while pos[0] != "]":
-                char_pair = parse_char(pos)  # type: Tuple[int, const_char_p]
-                pos = char_pair[1]
-                type = (
-                    llama_gretype.LLAMA_GRETYPE_CHAR_ALT
-                    if last_sym_start < out_elements.size()
-                    else start_type
-                )  # type: llama_gretype
-                out_elements.push_back(LlamaGrammarElement(type, char_pair[0]))
-            #     if (pos[0] == '-' && pos[1] != ']') {
-            #         auto endchar_pair = parse_char(pos + 1);
-            #         pos = endchar_pair.second;
-            #         out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
-            #     }
-            # }
+                if not pos:
+                    raise ValueError("unexpected end of input")
+                char, pos = parse_char(pos)
+                type = GrammarElementType.CHAR_ALT if last_sym_start < len(out_elements) else start_type
+                out_elements.append(GrammarElement(type, char))
                 if pos[0] == "-" and pos[1] != "]":
-                    endchar_pair = parse_char(pos + 1)  # type: Tuple[int, const_char_p]
-                    pos = endchar_pair[1]
-                    out_elements.push_back(
-                        LlamaGrammarElement(
-                            llama_gretype.LLAMA_GRETYPE_CHAR_RNG_UPPER,
-                            endchar_pair[0],
-                        )
-                    )
-            # pos = parse_space(pos + 1, is_nested);
-            pos = parse_space(pos + 1, is_nested)
-        # } else if (is_word_char(*pos)) { // rule reference
-        #     const char * name_end = parse_name(pos);
-        #     uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
-        #     pos = parse_space(name_end, is_nested);
-        #     last_sym_start = out_elements.size();
-        #     out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
-        elif is_word_char(pos[0]):  # rule reference
-            name_end = parse_name(pos)  # type: const_char_p
-            ref_rule_id = get_symbol_id(state, pos, name_end - pos)  # type: int
-            pos = parse_space(name_end, is_nested)
-            last_sym_start = out_elements.size()
-            out_elements.push_back(
-                LlamaGrammarElement(llama_gretype.LLAMA_GRETYPE_RULE_REF, ref_rule_id)
-            )
-        # } else if (*pos == '(') { // grouping
-        #     // parse nested alternates into synthesized rule
-        #     pos = parse_space(pos + 1, true);
-        #     uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-        #     pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
-        #     last_sym_start = out_elements.size();
-        #     // output reference to synthesized rule
-        #     out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-        #     if (*pos != ')') {
-        #         throw std::runtime_error(std::string("expecting ')' at ") + pos);
-        #     }
-        #     pos = parse_space(pos + 1, is_nested);
-        elif pos[0] == "(":  # grouping
-            # parse nested alternates into synthesized rule
-            pos = parse_space(pos + 1, True)
-            sub_rule_id = generate_symbol_id(state, rule_name)  # type: int
-            pos = parse_alternates(state, pos, rule_name, sub_rule_id, True)
-            last_sym_start = out_elements.size()
-            # output reference to synthesized rule
-            out_elements.push_back(
-                LlamaGrammarElement(llama_gretype.LLAMA_GRETYPE_RULE_REF, sub_rule_id)
-            )
+                    if not pos[1]:
+                        raise ValueError("unexpected end of input")
+                    endchar, pos = parse_char(pos[1:])
+                    out_elements.append(GrammarElement(GrammarElementType.CHAR_RNG_UPPER, endchar))
+            pos = parse_space(pos[1:], is_nested)
+        elif pos and is_word_char(pos[0]):
+            name, rest = parse_name(pos)
+            ref_rule_id = get_symbol_id(state, name)
+            pos = parse_space(rest, is_nested)
+            last_sym_start = len(out_elements)
+            out_elements.append(GrammarElement(GrammarElementType.RULE_REF, ref_rule_id))
+        elif pos.startswith("("):
+            pos = parse_space(pos[1:], newline_ok=True)
+            sub_rule_id = generate_symbol_id(state, rule_name)
+            pos = parse_alternates(state, pos, rule_name, sub_rule_id, is_nested=True)
+            last_sym_start = len(out_elements)
+            out_elements.append(GrammarElement(GrammarElementType.RULE_REF, sub_rule_id))
             if pos[0] != ")":
-                raise RuntimeError("expecting ')' at " + str(pos))
-            pos = parse_space(pos + 1, is_nested)
-        # } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
-        #     if (last_sym_start == out_elements.size()) {
-        #         throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
-        #     }
-        elif pos[0] in ("*", "+", "?"):  # repetition operator
-            if last_sym_start == out_elements.size():
-                raise RuntimeError("expecting preceding item to */+/? at " + str(pos))
-            # // apply transformation to previous symbol (last_sym_start to end) according to
-            # // rewrite rules:
-            # // S* --> S' ::= S S' |
-            # // S+ --> S' ::= S S' | S
-            # // S? --> S' ::= S |
-            # uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-            # std::vector<llama_grammar_element> sub_rule;
-            # // add preceding symbol to generated rule
-            # sub_rule.insert(
-            #     sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-            sub_rule_id = generate_symbol_id(state, rule_name)  # type: int
-            sub_rule = std.vector[
-                LlamaGrammarElement
-            ]()  # type: std.vector[LlamaGrammarElement]
-            sub_rule.insert(
-                sub_rule.end(),
-                out_elements.begin() + last_sym_start,
-                out_elements.end(),
-            )
-            # if (*pos == '*' || *pos == '+') {
-            #     // cause generated rule to recurse
-            #     sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-            # }
-            # // mark start of alternate def
-            # sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-            if pos[0] in ("*", "+"):
-                sub_rule.push_back(
-                    LlamaGrammarElement(
-                        llama_gretype.LLAMA_GRETYPE_RULE_REF, sub_rule_id
-                    )
-                )
-            sub_rule.push_back(LlamaGrammarElement(llama_gretype.LLAMA_GRETYPE_ALT, 0))
-            # if (*pos == '+') {
-            #     // add preceding symbol as alternate only for '+' (otherwise empty)
-            #     sub_rule.insert(
-            #         sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-            # }
-            # sub_rule.push_back({LLAMA_GRETYPE_END, 0});
-            # add_rule(state, sub_rule_id, sub_rule);
-            # // in original rule, replace previous symbol with reference to generated rule
-            # out_elements.resize(last_sym_start);
-            # out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-            # pos = parse_space(pos + 1, is_nested);
-            if pos[0] == "+":
-                # add preceding symbol as alternate only for '+' (otherwise empty)
-                sub_rule.insert(
-                    sub_rule.end(),
-                    out_elements.begin() + last_sym_start,
-                    out_elements.end(),
-                )
-            sub_rule.push_back(LlamaGrammarElement(llama_gretype.LLAMA_GRETYPE_END, 0))
-            add_rule(state, sub_rule_id, sub_rule)
-            # in original rule, replace previous symbol with reference to generated rule
-            out_elements.resize(last_sym_start)
-            out_elements.push_back(
-                LlamaGrammarElement(llama_gretype.LLAMA_GRETYPE_RULE_REF, sub_rule_id)
-            )
-            pos = parse_space(pos + 1, is_nested)
-        # } else {
-        #     break;
-        # }
+                raise ValueError(f"expecting ')' at {pos}")
+            pos = parse_space(pos[1:], is_nested)
+        elif pos.startswith("."):
+            last_sym_start = len(out_elements)
+            out_elements.append(GrammarElement(GrammarElementType.CHAR_ANY, 0))
+            pos = parse_space(pos[1:], is_nested)
+        elif pos.startswith("*"):
+            pos = parse_space(pos[1:], is_nested)
+            handle_repetitions(0, -1)
+        elif pos.startswith("+"):
+            pos = parse_space(pos[1:], is_nested)
+            handle_repetitions(1, -1)
+        elif pos.startswith("?"):
+            pos = parse_space(pos[1:], is_nested)
+            handle_repetitions(0, 1)
+        elif pos.startswith("{"):
+            pos = parse_space(pos[1:], is_nested)
+
+            if not pos or not is_digit_char(pos[0]):
+                raise ValueError(f"expecting an int at {pos}")
+            min_times, pos = parse_int(pos)
+            pos = parse_space(pos, is_nested)
+
+            max_times = -1
+
+            if pos[0] == "}":
+                max_times = min_times
+                pos = parse_space(pos[1:], is_nested)
+            elif pos[0] == ",":
+                pos = parse_space(pos[1:], is_nested)
+
+                if pos and is_digit_char(pos[0]):
+                    max_times, pos = parse_int(pos)
+                    pos = parse_space(pos, is_nested)
+
+                if pos[0] != "}":
+                    raise ValueError(f"expecting '}}' at {pos}")
+
+                pos = parse_space(pos[1:], is_nested)
+            else:
+                raise ValueError(f"expecting ',' at {pos}")
+            handle_repetitions(min_times, max_times)
         else:
             break
-    # }
-    # return pos;
-    # }
     return pos

@@ -919,39 +585,32 @@ def parse_sequence(
 #     add_rule(state, rule_id, rule);
 #     return pos;
 # }
-def parse_alternates(
-    state: parse_state,
-    src: const_char_p,
-    rule_name: str,
-    rule_id: int,
-    is_nested: bool,
-) -> const_char_p:
-    rule = std.vector()  # type: std.vector[LlamaGrammarElement]
-    pos = parse_sequence(state, src, rule_name, rule, is_nested)  # type: const_char_p
-    while pos[0] == "|":
-        rule.push_back(LlamaGrammarElement(llama_gretype.LLAMA_GRETYPE_ALT, 0))
-        pos = parse_space(pos + 1, True)
+def parse_alternates(state: ParseState, src: str, rule_name: str, rule_id: int, is_nested: bool) -> str:
+    rule = []
+    pos = parse_sequence(state, src, rule_name, rule, is_nested)
+    while pos.startswith("|"):
+        rule.append(GrammarElement(GrammarElementType.ALT, 0))
+        pos = parse_space(pos[1:], newline_ok=True)
         pos = parse_sequence(state, pos, rule_name, rule, is_nested)
-    rule.push_back(LlamaGrammarElement(llama_gretype.LLAMA_GRETYPE_END, 0))
+    rule.append(GrammarElement(GrammarElementType.END, 0))
     add_rule(state, rule_id, rule)
     return pos
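As a quick illustration of how `parse_sequence` and `parse_alternates` cooperate, one rule with two alternates round-trips like this (a sketch, using this module's `parse()` and `print_grammar()`):

```python
import sys
from nexa.gguf.llama.llama_grammar import parse, print_grammar

# ALT elements separate the element runs produced by parse_sequence;
# a single END terminates the whole rule.
state = parse('root ::= "a" | "b" "c"\n')
print_grammar(sys.stdout, state)
# Expected output: root ::= [a] | [b] [c]
```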
-# const char * parse_rule(parse_state & state, const char * src) {
+# static const char * parse_rule(parse_state & state, const char * src) {
 #     const char * name_end = parse_name(src);
 #     const char * pos      = parse_space(name_end, false);
 #     size_t       name_len = name_end - src;
 #     uint32_t     rule_id  = get_symbol_id(state, src, name_len);
 #     const std::string name(src, name_len);
-
+#
 #     if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
 #         throw std::runtime_error(std::string("expecting ::= at ") + pos);
 #     }
 #     pos = parse_space(pos + 3, true);
-
+#
 #     pos = parse_alternates(state, pos, name, rule_id, false);
-
-
+#
 #     if (*pos == '\r') {
 #         pos += pos[1] == '\n' ? 2 : 1;
 #     } else if (*pos == '\n') {
@@ -961,26 +620,26 @@ def parse_alternates(
 #         pos++;
 #     } else if (*pos) {
 #         throw std::runtime_error(std::string("expecting newline or end at ") + pos);
 #     }
 #     return parse_space(pos, true);
 # }
-def parse_rule(state: parse_state, src: const_char_p) -> const_char_p:
-    name_end = parse_name(src)  # type: const_char_p
-    pos = parse_space(name_end, False)  # type: const_char_p
-    name_len = name_end - src  # type: int
-    rule_id = get_symbol_id(state, src, name_len)  # type: int
-    name = std.string(src, name_len)  # type: str
-
-    if not (pos[0] == ":" and pos[1] == ":" and pos[2] == "="):
-        raise RuntimeError("expecting ::= at " + str(pos))
-
-    pos = parse_space(pos + 3, True)  # type: const_char_p
-    pos = parse_alternates(state, pos, name, rule_id, False)  # type: const_char_p
-
-    if pos[0] == "\r":
-        pos += 2 if pos[1] == "\n" else 1
-    elif pos[0] == "\n":
-        pos += 1
-    elif pos[0]:
-        raise RuntimeError("expecting newline or end at " + str(pos))
-    return parse_space(pos, True)
+def parse_rule(state: ParseState, src: str) -> str:
+    pos = src
+    name, pos = parse_name(pos)
+    pos = parse_space(pos, newline_ok=False)
+    rule_id = get_symbol_id(state, name)
+
+    if not pos.startswith("::="):
+        raise ValueError(f"expecting ::= at {pos}")
+
+    pos = parse_space(pos[3:], newline_ok=True)
+
+    pos = parse_alternates(state, pos, name, rule_id, is_nested=False)
+
+    if pos.startswith("\r"):
+        pos = pos[2:] if pos[1] == "\n" else pos[1:]
+    elif pos.startswith("\n"):
+        pos = pos[1:]
+    elif pos:
+        raise ValueError(f"expecting newline or end at {pos}")
+    return parse_space(pos, newline_ok=True)


 # parse_state parse(const char * src) {
@@ -990,204 +649,273 @@ def parse_rule(state: parse_state, src: const_char_p) -> const_char_p:
 #     while (*pos) {
 #         pos = parse_rule(state, pos);
 #     }
+#     // Validate the state to ensure that all rules are defined
+#     for (const auto & rule : state.rules) {
+#         for (const auto & elem : rule) {
+#             if (elem.type == LLAMA_GRETYPE_RULE_REF) {
+#                 // Ensure that the rule at that location exists
+#                 if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
+#                     // Get the name of the rule that is missing
+#                     for (const auto & kv : state.symbol_ids) {
+#                         if (kv.second == elem.value) {
+#                             throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
+#                         }
+#                     }
+#                 }
+#             }
+#         }
+#     }
 #     return state;
 # } catch (const std::exception & err) {
 #     fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
 #     return parse_state();
 # }
 # }
-def parse(src: const_char_p) -> parse_state:
-    try:
-        state = parse_state()  # type: parse_state
-        pos = parse_space(src, True)  # type: const_char_p
-        while pos[0]:
-            pos = parse_rule(state, pos)
-        return state
-    except Exception as err:
-        print(f"{parse.__name__}: error parsing grammar: {err}")
-        return parse_state()
-
-
-# void print_grammar_char(FILE * file, uint32_t c) {
-#     if (0x20 <= c && c <= 0x7f) {
-#         fprintf(file, "%c", static_cast<char>(c));
-#     } else {
-#         // cop out of encoding UTF-8
-#         fprintf(file, "<U+%04X>", c);
-#     }
-# }
-def print_grammar_char(file: TextIO, c: int) -> None:
-    if 0x20 <= c and c <= 0x7F:
-        file.write(chr(c))
-    else:
-        # cop out of encoding UTF-8
-        file.write(f"<U+{c:04X}>")
-
-
-# bool is_char_element(llama_grammar_element elem) {
+def parse(src: str) -> ParseState:
+    state = ParseState()
+    pos = src
+    pos = parse_space(pos, newline_ok=True)
+    while pos:
+        pos = parse_rule(state, pos)
+    # validate
+    for rule in state.rules:
+        for elem in rule:
+            if elem.type == GrammarElementType.RULE_REF:
+                if elem.value >= len(state.rules) or not state.rules[elem.value]:
+                    for k, v in state.symbol_ids.items():
+                        if v == elem.value:
+                            raise ValueError(f"Undefined rule identifier '{k}'")
+    return state
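Unlike the C++ original, which logs the error and returns an empty state, this `parse()` raises on a dangling rule reference. A minimal check, assuming this module is importable:

```python
from nexa.gguf.llama.llama_grammar import parse

try:
    parse("root ::= missing_rule\n")
except ValueError as e:
    # get_symbol_id() allocated an id for the reference, but no rule body
    # was ever added for it, so the validation pass above trips here.
    print(e)  # Undefined rule identifier 'missing_rule'
```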
+
+
+# static bool is_char_element(llama_grammar_element elem) {
 #     switch (elem.type) {
 #         case LLAMA_GRETYPE_CHAR:           return true;
 #         case LLAMA_GRETYPE_CHAR_NOT:       return true;
 #         case LLAMA_GRETYPE_CHAR_ALT:       return true;
 #         case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+#         case LLAMA_GRETYPE_CHAR_ANY:       return true;
 #         default:                           return false;
 #     }
 # }
-def is_char_element(elem: LlamaGrammarElement) -> bool:
+def is_char_element(elem: GrammarElement) -> bool:
     return elem.type in (
-        llama_gretype.LLAMA_GRETYPE_CHAR,
-        llama_gretype.LLAMA_GRETYPE_CHAR_NOT,
-        llama_gretype.LLAMA_GRETYPE_CHAR_ALT,
-        llama_gretype.LLAMA_GRETYPE_CHAR_RNG_UPPER,
+        GrammarElementType.CHAR,
+        GrammarElementType.CHAR_NOT,
+        GrammarElementType.CHAR_ALT,
+        GrammarElementType.CHAR_RNG_UPPER,
+        GrammarElementType.CHAR_ANY,
     )


-# void print_rule(
+def print_grammar_char(file: typing.TextIO, c: int) -> None:
+    if 0x20 <= c <= 0x7f:
+        print(chr(c), end="", file=file)
+    else:
+        print(f"<U+{c:04X}>", end="", file=file)
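Behavior sketch for `print_grammar_char`; note the `<U+...>` escape format is reconstructed from upstream llama.cpp and should be treated as an assumption:

```python
import sys
from nexa.gguf.llama.llama_grammar import print_grammar_char

print_grammar_char(sys.stdout, ord("a"))  # printable ASCII: prints "a"
print_grammar_char(sys.stdout, 0x2028)    # everything else: prints "<U+2028>"
```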
+
+
+# static void print_rule(
 #     FILE     * file,
 #     uint32_t   rule_id,
 #     const std::vector<llama_grammar_element> & rule,
 #     const std::map<uint32_t, std::string>     & symbol_id_names) {
+#     if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
+#         throw std::runtime_error(
+#             "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
+#     }
+#     fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
+#     for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
+#         llama_grammar_element elem = rule[i];
+#         switch (elem.type) {
+#             case LLAMA_GRETYPE_END:
+#                 throw std::runtime_error(
+#                     "unexpected end of rule: " + std::to_string(rule_id) + "," +
+#                     std::to_string(i));
+#             case LLAMA_GRETYPE_ALT:
+#                 fprintf(file, "| ");
+#                 break;
+#             case LLAMA_GRETYPE_RULE_REF:
+#                 fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
+#                 break;
+#             case LLAMA_GRETYPE_CHAR:
+#                 fprintf(file, "[");
+#                 print_grammar_char(file, elem.value);
+#                 break;
+#             case LLAMA_GRETYPE_CHAR_NOT:
+#                 fprintf(file, "[^");
+#                 print_grammar_char(file, elem.value);
+#                 break;
+#             case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+#                 if (i == 0 || !is_char_element(rule[i - 1])) {
+#                     throw std::runtime_error(
+#                         "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
+#                         std::to_string(rule_id) + "," + std::to_string(i));
+#                 }
+#                 fprintf(file, "-");
+#                 print_grammar_char(file, elem.value);
+#                 break;
+#             case LLAMA_GRETYPE_CHAR_ALT:
+#                 if (i == 0 || !is_char_element(rule[i - 1])) {
+#                     throw std::runtime_error(
+#                         "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
+#                         std::to_string(rule_id) + "," + std::to_string(i));
+#                 }
+#                 print_grammar_char(file, elem.value);
+#                 break;
+#             case LLAMA_GRETYPE_CHAR_ANY:
+#                 fprintf(file, ".");
+#                 break;
+#         }
+#         if (is_char_element(elem)) {
+#             switch (rule[i + 1].type) {
+#                 case LLAMA_GRETYPE_CHAR_ALT:
+#                 case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+#                 case LLAMA_GRETYPE_CHAR_ANY:
+#                     break;
+#                 default:
+#                     fprintf(file, "] ");
+#             }
+#         }
+#     }
+#     fprintf(file, "\n");
+# }
 def print_rule(
-    file: TextIO,
+    file: typing.TextIO,
     rule_id: int,
-    rule: std.vector[LlamaGrammarElement],
-    symbol_id_names: std.map[int, str],
+    rule: typing.List[GrammarElement],
+    symbol_id_names: typing.Dict[int, str],
 ) -> None:
-    # if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
-    #     throw std::runtime_error(
-    #         "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
-    # }
-    # fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
-    if rule.empty() or rule.back().type != llama_gretype.LLAMA_GRETYPE_END:
-        raise RuntimeError(
-            "malformed rule, does not end with LLAMA_GRETYPE_END: " + str(rule_id)
-        )
-    print(f"{symbol_id_names.at(rule_id)} ::=", file=file, end=" ")
-    # for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
-    #     llama_grammar_element elem = rule[i];
-    #     switch (elem.type) {
-    #         case LLAMA_GRETYPE_END:
-    #             throw std::runtime_error(
-    #                 "unexpected end of rule: " + std::to_string(rule_id) + "," +
-    #                 std::to_string(i));
-    #         case LLAMA_GRETYPE_ALT:
-    #             fprintf(file, "| ");
-    #             break;
-    #         case LLAMA_GRETYPE_RULE_REF:
-    #             fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
-    #             break;
-    #         case LLAMA_GRETYPE_CHAR:
-    #             fprintf(file, "[");
-    #             print_grammar_char(file, elem.value);
-    #             break;
-    #         case LLAMA_GRETYPE_CHAR_NOT:
-    #             fprintf(file, "[^");
-    #             print_grammar_char(file, elem.value);
-    #             break;
-    #         case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-    #             if (i == 0 || !is_char_element(rule[i - 1])) {
-    #                 throw std::runtime_error(
-    #                     "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
-    #                     std::to_string(rule_id) + "," + std::to_string(i));
-    #             }
-    #             fprintf(file, "-");
-    #             print_grammar_char(file, elem.value);
-    #             break;
-    #         case LLAMA_GRETYPE_CHAR_ALT:
-    #             if (i == 0 || !is_char_element(rule[i - 1])) {
-    #                 throw std::runtime_error(
-    #                     "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
-    #                     std::to_string(rule_id) + "," + std::to_string(i));
-    #             }
-    #             print_grammar_char(file, elem.value);
-    #             break;
-    #     }
+    if not rule or rule[-1].type != GrammarElementType.END:
+        raise ValueError(f"malformed rule, does not end with LLAMA_GRETYPE_END: {rule_id}")
+
+    print(f"{symbol_id_names[rule_id]} ::=", end=" ", file=file)
+
     for i, elem in enumerate(rule[:-1]):
-        case = elem.type  # type: llama_gretype
-        if case is llama_gretype.LLAMA_GRETYPE_END:
-            raise RuntimeError("unexpected end of rule: " + str(rule_id) + "," + str(i))
-        elif case is llama_gretype.LLAMA_GRETYPE_ALT:
-            print("| ", file=file, end="")
-        elif case is llama_gretype.LLAMA_GRETYPE_RULE_REF:
-            print(f"{symbol_id_names.at(elem.value)} ", file=file, end="")
-        elif case is llama_gretype.LLAMA_GRETYPE_CHAR:
-            print("[", file=file, end="")
+        if elem.type == GrammarElementType.END:
+            raise ValueError(f"unexpected end of rule: {rule_id}, {i}")
+        if elem.type == GrammarElementType.ALT:
+            print("| ", end="", file=file)
+        elif elem.type == GrammarElementType.RULE_REF:
+            print(f"{symbol_id_names[elem.value]} ", end="", file=file)
+        elif elem.type == GrammarElementType.CHAR:
+            print("[", end="", file=file)
            print_grammar_char(file, elem.value)
-        elif case is llama_gretype.LLAMA_GRETYPE_CHAR_NOT:
-            print("[^", file=file, end="")
+        elif elem.type == GrammarElementType.CHAR_NOT:
+            print("[^", end="", file=file)
            print_grammar_char(file, elem.value)
-        elif case is llama_gretype.LLAMA_GRETYPE_CHAR_RNG_UPPER:
+        elif elem.type == GrammarElementType.CHAR_RNG_UPPER:
            if i == 0 or not is_char_element(rule[i - 1]):
-                raise RuntimeError(
-                    "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: "
-                    + str(rule_id)
-                    + ","
-                    + str(i)
-                )
-            print("-", file=file, end="")
+                raise ValueError(f"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: {rule_id}, {i}")
+            print("-", end="", file=file)
            print_grammar_char(file, elem.value)
-        elif case is llama_gretype.LLAMA_GRETYPE_CHAR_ALT:
+        elif elem.type == GrammarElementType.CHAR_ALT:
            if i == 0 or not is_char_element(rule[i - 1]):
-                raise RuntimeError(
-                    "LLAMA_GRETYPE_CHAR_ALT without preceding char: "
-                    + str(rule_id)
-                    + ","
-                    + str(i)
-                )
+                raise ValueError(f"LLAMA_GRETYPE_CHAR_ALT without preceding char: {rule_id}, {i}")
            print_grammar_char(file, elem.value)
-        # if (is_char_element(elem)) {
-        #     switch (rule[i + 1].type) {
-        #         case LLAMA_GRETYPE_CHAR_ALT:
-        #         case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-        #             break;
-        #         default:
-        #             fprintf(file, "] ");
+        elif elem.type == GrammarElementType.CHAR_ANY:
+            print(".", end="", file=file)

        if is_char_element(elem):
-            if rule[i + 1].type in (
-                llama_gretype.LLAMA_GRETYPE_CHAR_ALT,
-                llama_gretype.LLAMA_GRETYPE_CHAR_RNG_UPPER,
-            ):
-                pass
-            else:
-                print("] ", file=file, end="")
-    # }
-    # }
-    # }
-    # fprintf(file, "\n");
-    # }
+            if rule[i + 1].type in (GrammarElementType.CHAR_ALT, GrammarElementType.CHAR_RNG_UPPER, GrammarElementType.CHAR_ANY):
+                continue
+            print("] ", end="", file=file)
    print(file=file)
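To see the bracket-closing logic at work: a CHAR element opens a character class, CHAR_RNG_UPPER and CHAR_ALT extend it, and the class closes only before a non-char element. A hand-built sketch:

```python
import sys
from nexa.gguf.llama.llama_grammar import (
    GrammarElement,
    GrammarElementType,
    print_rule,
)

rule = [
    GrammarElement(GrammarElementType.CHAR, ord("a")),            # opens "[a"
    GrammarElement(GrammarElementType.CHAR_RNG_UPPER, ord("z")),  # extends to "a-z"
    GrammarElement(GrammarElementType.CHAR_ALT, ord("_")),        # adds "_"
    GrammarElement(GrammarElementType.END, 0),
]
print_rule(sys.stdout, 0, rule, {0: "ident"})  # prints: ident ::= [a-z_]
```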
ValueError(f"LLAMA_GRETYPE_CHAR_ALT without preceding char: {rule_id}, {i}") print_grammar_char(file, elem.value) - # if (is_char_element(elem)) { - # switch (rule[i + 1].type) { - # case LLAMA_GRETYPE_CHAR_ALT: - # case LLAMA_GRETYPE_CHAR_RNG_UPPER: - # break; - # default: - # fprintf(file, "] "); + elif elem.type == GrammarElementType.CHAR_ANY: + print(".", end="", file=file) if is_char_element(elem): - if rule[i + 1].type in ( - llama_gretype.LLAMA_GRETYPE_CHAR_ALT, - llama_gretype.LLAMA_GRETYPE_CHAR_RNG_UPPER, - ): - pass - else: - print("] ", file=file, end="") - # } - # } - # } - # fprintf(file, "\n"); - # } + if rule[i + 1].type in (GrammarElementType.CHAR_ALT, GrammarElementType.CHAR_RNG_UPPER, GrammarElementType.CHAR_ANY): + continue + print("] ", end="", file=file) print(file=file) -# void print_grammar(FILE * file, const parse_state & state) { -# try { -# std::map symbol_id_names; -# for (auto kv : state.symbol_ids) { -# symbol_id_names[kv.second] = kv.first; -# } -# for (size_t i = 0, end = state.rules.size(); i < end; i++) { -# // fprintf(file, "%zu: ", i); -# // print_rule_binary(file, state.rules[i]); -# print_rule(file, i, state.rules[i], symbol_id_names); -# // fprintf(file, "\n"); -# } -# } catch (const std::exception & err) { -# fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what()); -# } -# } -def print_grammar(file: TextIO, state: parse_state) -> None: +def print_grammar(file: typing.TextIO, state: ParseState) -> None: try: - symbol_id_names = std.map() # type: std.map[int, str] - for kv in state.symbol_ids.items(): - symbol_id_names[kv[1]] = kv[0] - + symbol_id_names = {v: k for k, v in state.symbol_ids.items()} for i, rule in enumerate(state.rules): print_rule(file, i, rule, symbol_id_names) except Exception as err: - print( - f"{print_grammar.__name__}: error printing grammar: {err}", - file=sys.stderr, + print(f"\nerror printing grammar: {err}", file=file) + raise err + + +class LlamaGrammar: + def __init__(self, parse_state: ParseState): + self.parse_state = parse_state + + self._grammar_rules = parse_state.rules + self._n_rules = len(self._grammar_rules) + self._start_rule_index = parse_state.symbol_ids["root"] + + self._element_lists = [ + [ + llama_cpp.llama_grammar_element(ctypes.c_int(elem.type), ctypes.c_uint32(elem.value)) + for elem in subvector + ] + for subvector in self._grammar_rules + ] + + # Step 2: Convert each list to llama_grammar_element array and get pointer + self._element_arrays = [ + (llama_cpp.llama_grammar_element * len(sublist))(*sublist) + for sublist in self._element_lists + ] + + # Step 3: Get pointer of each array + self._element_array_pointers = [ + ctypes.cast(subarray, llama_cpp.llama_grammar_element_p) for subarray in self._element_arrays + ] + + # Step 4: Make array of these pointers and get its pointer + self._rules = (llama_cpp.llama_grammar_element_p * len(self._element_array_pointers))( + *self._element_array_pointers ) + self.grammar = None + self._init_grammar() + + + def _init_grammar(self): + grammar = llama_cpp.llama_grammar_init( + self._rules, ctypes.c_size_t(self._n_rules), ctypes.c_size_t(self._start_rule_index) + ) + + if grammar is None: + raise ValueError("Failed to create grammar") + + self.grammar = grammar + + def __del__(self): + if self.grammar is not None: + llama_cpp.llama_grammar_free(self.grammar) + self.grammar = None + + def reset(self): + if self.grammar is not None: + llama_cpp.llama_grammar_free(self.grammar) + self._init_grammar() + + @classmethod + def from_string(cls, grammar: 
+class LlamaGrammar:
+    def __init__(self, parse_state: ParseState):
+        self.parse_state = parse_state
+
+        self._grammar_rules = parse_state.rules
+        self._n_rules = len(self._grammar_rules)
+        self._start_rule_index = parse_state.symbol_ids["root"]
+
+        # Step 1: Convert each rule to a list of llama_grammar_element structs
+        self._element_lists = [
+            [
+                llama_cpp.llama_grammar_element(ctypes.c_int(elem.type), ctypes.c_uint32(elem.value))
+                for elem in subvector
+            ]
+            for subvector in self._grammar_rules
+        ]
+
+        # Step 2: Convert each list to llama_grammar_element array and get pointer
+        self._element_arrays = [
+            (llama_cpp.llama_grammar_element * len(sublist))(*sublist)
+            for sublist in self._element_lists
+        ]
+
+        # Step 3: Get pointer of each array
+        self._element_array_pointers = [
+            ctypes.cast(subarray, llama_cpp.llama_grammar_element_p)
+            for subarray in self._element_arrays
+        ]
+
+        # Step 4: Make array of these pointers and get its pointer
+        self._rules = (llama_cpp.llama_grammar_element_p * len(self._element_array_pointers))(
+            *self._element_array_pointers
         )
+        self.grammar = None
+        self._init_grammar()
+
+    def _init_grammar(self):
+        grammar = llama_cpp.llama_grammar_init(
+            self._rules, ctypes.c_size_t(self._n_rules), ctypes.c_size_t(self._start_rule_index)
+        )
+
+        if grammar is None:
+            raise ValueError("Failed to create grammar")
+
+        self.grammar = grammar
+
+    def __del__(self):
+        if self.grammar is not None:
+            llama_cpp.llama_grammar_free(self.grammar)
+            self.grammar = None
+
+    def reset(self):
+        if self.grammar is not None:
+            llama_cpp.llama_grammar_free(self.grammar)
+        self._init_grammar()
+
+    @classmethod
+    def from_string(cls, grammar: str, verbose: bool = True) -> "LlamaGrammar":
+        parsed_grammar = parse(grammar)
+        if verbose:
+            print_grammar(file=sys.stdout, state=parsed_grammar)
+        return cls(parsed_grammar)
+
+    @classmethod
+    def from_file(cls, file: Union[str, Path], verbose: bool = True) -> "LlamaGrammar":
+        try:
+            with open(file) as f:
+                grammar = f.read()
+        except Exception as err:
+            raise RuntimeError(
+                f"{cls.from_file.__name__}: error reading grammar file: {err}"
+            ) from err
+
+        if grammar:
+            return cls.from_string(grammar, verbose=verbose)
+
+        raise ValueError(
+            f"{cls.from_file.__name__}: error parsing grammar file: grammar file is empty"
+        )
+
+    @classmethod
+    def from_json_schema(cls, json_schema: str, verbose: bool = True) -> "LlamaGrammar":
+        return cls.from_string(json_schema_to_gbnf(json_schema), verbose=verbose)
+

 """llama.cpp gbnf rules from vendor/llama.cpp/grammars"""
@@ -1358,12 +1086,13 @@ def print_grammar(file: TextIO, state: parse_state) -> None:
 string ::=
   "\"" (
     [^"\\\x7F\x00-\x1F] |
-    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+    "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
   )* "\"" ws

-number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws

-ws ::= ([ \t\n] ws)?
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= | " " | "\n" [ \t]{0,20}
 """

 LIST_GBNF = r"""
@@ -2091,4 +1820,4 @@ def json_schema_to_gbnf(schema: str, prop_order: Optional[List[str]] = None):
     )
     schema = converter.resolve_refs(schema, "stdin")
     converter.visit(schema, "")
-    return converter.format_grammar()
+    return converter.format_grammar()
\ No newline at end of file
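A minimal end-to-end usage sketch for `LlamaGrammar` as updated above; the sampler call site is hypothetical:

```python
from nexa.gguf.llama.llama_grammar import LlamaGrammar

grammar = LlamaGrammar.from_string('root ::= "yes" | "no"', verbose=False)
# or: grammar = LlamaGrammar.from_json_schema('{"type": "object"}')

# The object is then passed to a llama-cpp-python style generator, e.g.:
# output = llm("Answer yes or no:", grammar=grammar)  # hypothetical call
```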
diff --git a/nexa/gguf/llama/llama_speculative.py b/nexa/gguf/llama/llama_speculative.py
index 39dfb903..6188cb26 100644
--- a/nexa/gguf/llama/llama_speculative.py
+++ b/nexa/gguf/llama/llama_speculative.py
@@ -61,4 +61,4 @@ def __call__(
             input_ids=input_ids,
             max_ngram_size=self.max_ngram_size,
             num_pred_tokens=self.num_pred_tokens,
-        )
+        )
\ No newline at end of file
diff --git a/nexa/gguf/llama/llama_tokenizer.py b/nexa/gguf/llama/llama_tokenizer.py
index 73925176..00c2b984 100644
--- a/nexa/gguf/llama/llama_tokenizer.py
+++ b/nexa/gguf/llama/llama_tokenizer.py
@@ -1,9 +1,14 @@
 from __future__ import annotations

 import abc
-from typing import Any, List, Optional
+from typing import (
+    List,
+    Optional,
+    Any,
+)

-from nexa.gguf.llama import llama_cpp
+import nexa.gguf.llama.llama_cpp as llama_cpp


 class BaseLlamaTokenizer(abc.ABC):
@@ -98,4 +103,4 @@ def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenize
         hf_tokenizer = AutoTokenizer.from_pretrained(
             pretrained_model_name_or_path=pretrained_model_name_or_path
         )
-        return cls(hf_tokenizer)
+        return cls(hf_tokenizer)
\ No newline at end of file
diff --git a/nexa/gguf/llama/llama_types.py b/nexa/gguf/llama/llama_types.py
index 64663a20..3cc2122e 100644
--- a/nexa/gguf/llama/llama_types.py
+++ b/nexa/gguf/llama/llama_types.py
@@ -237,6 +237,7 @@ class ChatCompletionRequestFunctionMessage(TypedDict):
     ChatCompletionRequestSystemMessage,
     ChatCompletionRequestUserMessage,
     ChatCompletionRequestAssistantMessage,
     ChatCompletionRequestToolMessage,
     ChatCompletionRequestFunctionMessage,
 ]
@@ -294,4 +295,4 @@ class ChatCompletionNamedToolChoice(TypedDict):
 ChatCompletionChunk = CreateChatCompletionStreamResponse
 ChatCompletionStreamResponse = CreateChatCompletionStreamResponse
 ChatCompletionResponseFunction = ChatCompletionFunction
-ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall
+ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall
\ No newline at end of file
diff --git a/nexa/gguf/llama/llava_cpp.py b/nexa/gguf/llama/llava_cpp.py
index a1f425bc..fa8eb7c0 100644
--- a/nexa/gguf/llama/llava_cpp.py
+++ b/nexa/gguf/llama/llava_cpp.py
@@ -1,30 +1,32 @@
 from __future__ import annotations

+import sys
+import os
 import ctypes
 import functools
-from ctypes import _Pointer  # type: ignore
 from ctypes import (
-    POINTER,
-    Structure,
     c_bool,
     c_char_p,
-    c_float,
     c_int,
     c_uint8,
+    c_float,
     c_void_p,
+    POINTER,
+    _Pointer,  # type: ignore
+    Structure,
 )
+import pathlib
 from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Generic,
     List,
+    Union,
     NewType,
     Optional,
     TypeVar,
-    Union,
+    Callable,
+    Any,
+    TYPE_CHECKING,
+    Generic,
 )
-
 from typing_extensions import TypeAlias

 import nexa.gguf.llama.llama_cpp as llama_cpp
@@ -184,4 +186,4 @@ def clip_model_load(
 # /** free mmproj model */
 # CLIP_API void clip_free(struct clip_ctx * ctx);
 @ctypes_function("clip_free", [clip_ctx_p_ctypes], None)
-def clip_free(ctx: clip_ctx_p, /): ...
+def clip_free(ctx: clip_ctx_p, /): ...
\ No newline at end of file
diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py
index a3d6114d..8a0202dd 100644
--- a/nexa/gguf/nexa_inference_image.py
+++ b/nexa/gguf/nexa_inference_image.py
@@ -86,7 +86,7 @@ def _load_model(self, model_path: str):
                 lora_model_dir=self.params.get("lora_dir", ""),
                 n_threads=self.params.get("n_threads", multiprocessing.cpu_count()),
                 wtype=self.params.get(
-                    "wtype", NEXA_RUN_MODEL_PRECISION_MAP.get(model_path, "default")
+                    "wtype", NEXA_RUN_MODEL_PRECISION_MAP.get(model_path, "f32")
                 ),  # Weight type (options: default, f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
                 control_net_path=self.params.get("control_net_path", ""),
                 verbose=False,
@@ -103,7 +103,7 @@ def _save_images(self, images):
             file_name = f"image_{i+1}_{int(time.time())}.png"
             file_path = os.path.join(output_dir, file_name)
             image.save(file_path)
-            logging.info(f"\nImage {i+1} saved to: {file_path}")
+            print(f"\nImage {i+1} saved to: {os.path.abspath(file_path)}")
diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py
index 0e93f36e..c3f03aa1 100644
--- a/nexa/gguf/nexa_inference_vlm.py
+++ b/nexa/gguf/nexa_inference_vlm.py
@@ -165,7 +165,7 @@ def _load_model(self):
                 chat_handler=self.projector,
                 verbose=False,
                 chat_format=self.chat_format,
-                n_ctx=self.params.get("max_new_tokens", 2048),
+                n_ctx=2048,
                 n_gpu_layers=-1 if is_gpu_available() else 0,
             )
         except Exception as e:
@@ -178,7 +178,7 @@ def _load_model(self):
                 chat_handler=self.projector,
                 verbose=False,
                 chat_format=self.chat_format,
-                n_ctx=self.params.get("max_new_tokens", 2048),
+                n_ctx=2048,
                 n_gpu_layers=0,  # hardcode to use CPU
             )
diff --git a/nexa/gguf/streamlit/streamlit_image_chat.py b/nexa/gguf/streamlit/streamlit_image_chat.py
index d2e3cc78..5f4bbca6 100644
--- a/nexa/gguf/streamlit/streamlit_image_chat.py
+++ b/nexa/gguf/streamlit/streamlit_image_chat.py
@@ -1,6 +1,5 @@
 import os
 import sys
-
 from PIL import Image
 from nexa.general import pull_model
 import streamlit as st
@@ -72,12 +71,6 @@ def generate_images(nexa_model: NexaImageInference, prompt: str, negative_prompt
     100,
     st.session_state.nexa_model.params["num_inference_steps"],
 )
-num_images_per_prompt = st.sidebar.slider(
-    "Number of Images per Prompt",
Images per Prompt", - 1, - 10, - st.session_state.nexa_model.params["num_images_per_prompt"], -) height = st.sidebar.slider( "Height", 64, 1024, st.session_state.nexa_model.params["height"] ) @@ -94,7 +87,6 @@ def generate_images(nexa_model: NexaImageInference, prompt: str, negative_prompt st.session_state.nexa_model.params.update( { "num_inference_steps": num_inference_steps, - "num_images_per_prompt": num_images_per_prompt, "height": height, "width": width, "guidance_scale": guidance_scale,