From ddc5a5033f948dc7ab0a3a6ec2d914d13c274077 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 25 Jan 2024 11:26:17 +0200 Subject: [PATCH 1/7] metal : show compile log messages --- ggml-metal.m | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 4b3eb491424d7..60fef1a1912b5 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -277,6 +277,10 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ NSURL * libURL = [NSURL fileURLWithPath:libPath]; GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; + if (error) { + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } } else { GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); @@ -315,13 +319,12 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ //[options setFastMathEnabled:false]; ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; + if (error) { + GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } } } - - if (error) { - GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); - return NULL; - } } // print MTL GPU family: From faa3526a1eba458120987ed8269e5616385a76f4 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:58:53 +0200 Subject: [PATCH 2/7] Fix Q3_K_XS for MoE models (#5113) Co-authored-by: Iwan Kawrakow --- llama.cpp | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/llama.cpp b/llama.cpp index 114046db92675..6a7506e85f8ec 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8829,6 +8829,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty auto use_more_bits = [](int i_layer, int num_layers) -> bool { return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; }; + const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); + auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { + if (n_expert > 1) { + // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly + // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work + // for getting the current layer as I initially thought, and we need to resort to parsing the + // tensor name. + n_layer /= n_expert; + if (sscanf(name, "blk.%d.", &i_layer) != 1) { + throw std::runtime_error(format("Failed to determine layer for tensor %s", name)); + } + if (i_layer < 0 || i_layer >= n_layer) { + throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer)); + } + } + return std::make_pair(i_layer, n_layer); + }; if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; @@ -8890,24 +8907,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty new_type = GGML_TYPE_Q2_K; } } else if (name.find("ffn_down") != std::string::npos) { - const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); - int i_layer, n_layer; - if (n_expert == 1) { - i_layer = qs.i_ffn_down; - n_layer = qs.n_ffn_down; - } else { - // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly - // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work - // for getting the current layer as I initially thought, and we need to resort to parsing the - // tensor name. - n_layer = qs.n_ffn_down / n_expert; - if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) { - throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str())); - } - if (i_layer < 0 || i_layer >= n_layer) { - throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer)); - } - } + auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); + int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; @@ -8963,13 +8964,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } else if (name.find("ffn_gate") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) { + auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); + int i_layer = info.first, n_layer = info.second; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) { new_type = GGML_TYPE_Q2_K; } ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) { + auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); + int i_layer = info.first, n_layer = info.second; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) { new_type = GGML_TYPE_Q2_K; } ++qs.i_ffn_up; From 256d1bb0ddce6a0a21f5a7503019bdd5c1933cba Mon Sep 17 00:00:00 2001 From: Valentin Konovalov Date: Thu, 25 Jan 2024 12:05:51 -0500 Subject: [PATCH 3/7] android : use release cmake build type by default (#5123) --- examples/llama.android/app/build.gradle.kts | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts index 7815a802593ba..aadbe22c91835 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/llama.android/app/build.gradle.kts @@ -30,6 +30,7 @@ android { } externalNativeBuild { cmake { + arguments += "-DCMAKE_BUILD_TYPE=Release" cppFlags += listOf() arguments += listOf() } From d292f4f2047963f558dd516f1baaa71793e9acf2 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 25 Jan 2024 14:51:24 -0500 Subject: [PATCH 4/7] examples : make pydantic scripts pass mypy and support py3.8 (#5099) --- .../pydantic-models-to-grammar-examples.py | 43 +---- examples/pydantic_models_to_grammar.py | 166 +++++++++--------- 2 files changed, 88 insertions(+), 121 deletions(-) diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic-models-to-grammar-examples.py index cbf3766526ff7..160966649b05d 100644 --- a/examples/pydantic-models-to-grammar-examples.py +++ b/examples/pydantic-models-to-grammar-examples.py @@ -1,14 +1,14 @@ # Function calling example using pydantic models. import datetime +import importlib import json from enum import Enum -from typing import Union, Optional +from typing import Optional, Union import requests from pydantic import BaseModel, Field - -import importlib -from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function +from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model, + create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation) # Function to get completion on the llama.cpp server with grammar. @@ -35,7 +35,7 @@ def run(self): print(self.message) -# Enum for the calculator function. +# Enum for the calculator tool. class MathOperation(Enum): ADD = "add" SUBTRACT = "subtract" @@ -43,7 +43,7 @@ class MathOperation(Enum): DIVIDE = "divide" -# Very simple calculator tool for the agent. +# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. class Calculator(BaseModel): """ Perform a math operation on two numbers. @@ -148,37 +148,6 @@ def get_current_datetime(output_format: Optional[str] = None): return datetime.datetime.now().strftime(output_format) -# Enum for the calculator tool. -class MathOperation(Enum): - ADD = "add" - SUBTRACT = "subtract" - MULTIPLY = "multiply" - DIVIDE = "divide" - - - -# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. -class Calculator(BaseModel): - """ - Perform a math operation on two numbers. - """ - number_one: Union[int, float] = Field(..., description="First number.") - operation: MathOperation = Field(..., description="Math operation to perform.") - number_two: Union[int, float] = Field(..., description="Second number.") - - def run(self): - if self.operation == MathOperation.ADD: - return self.number_one + self.number_two - elif self.operation == MathOperation.SUBTRACT: - return self.number_one - self.number_two - elif self.operation == MathOperation.MULTIPLY: - return self.number_one * self.number_two - elif self.operation == MathOperation.DIVIDE: - return self.number_one / self.number_two - else: - raise ValueError("Unknown operation.") - - # Example function to get the weather def get_current_weather(location, unit): """Get the current weather in a given location""" diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py index 848c1c367d701..9acc7cc6dcd85 100644 --- a/examples/pydantic_models_to_grammar.py +++ b/examples/pydantic_models_to_grammar.py @@ -1,15 +1,21 @@ +from __future__ import annotations + import inspect import json +import re from copy import copy -from inspect import isclass, getdoc -from types import NoneType +from enum import Enum +from inspect import getdoc, isclass +from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints from docstring_parser import parse -from pydantic import BaseModel, create_model, Field -from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias -from enum import Enum -from typing import get_type_hints, Callable -import re +from pydantic import BaseModel, Field, create_model + +if TYPE_CHECKING: + from types import GenericAlias +else: + # python 3.8 compat + from typing import _GenericAlias as GenericAlias class PydanticDataType(Enum): @@ -43,7 +49,7 @@ class PydanticDataType(Enum): SET = "set" -def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str: +def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str: if isclass(pydantic_type) and issubclass(pydantic_type, str): return PydanticDataType.STRING.value elif isclass(pydantic_type) and issubclass(pydantic_type, bool): @@ -57,22 +63,22 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str: elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel): return format_model_and_field_name(pydantic_type.__name__) - elif get_origin(pydantic_type) == list: + elif get_origin(pydantic_type) is list: element_type = get_args(pydantic_type)[0] return f"{map_pydantic_type_to_gbnf(element_type)}-list" - elif get_origin(pydantic_type) == set: + elif get_origin(pydantic_type) is set: element_type = get_args(pydantic_type)[0] return f"{map_pydantic_type_to_gbnf(element_type)}-set" - elif get_origin(pydantic_type) == Union: + elif get_origin(pydantic_type) is Union: union_types = get_args(pydantic_type) union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types] return f"union-{'-or-'.join(union_rules)}" - elif get_origin(pydantic_type) == Optional: + elif get_origin(pydantic_type) is Optional: element_type = get_args(pydantic_type)[0] return f"optional-{map_pydantic_type_to_gbnf(element_type)}" elif isclass(pydantic_type): return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}" - elif get_origin(pydantic_type) == dict: + elif get_origin(pydantic_type) is dict: key_type, value_type = get_args(pydantic_type) return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}" else: @@ -106,7 +112,6 @@ def get_members_structure(cls, rule_name): return f"{cls.__name__.lower()} ::= " + " | ".join(members) if cls.__annotations__ and cls.__annotations__ != {}: result = f'{rule_name} ::= "{{"' - type_list_rules = [] # Modify this comprehension members = [ f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}' @@ -116,27 +121,25 @@ def get_members_structure(cls, rule_name): result += '"," '.join(members) result += ' "}"' - return result, type_list_rules - elif rule_name == "custom-class-any": + return result + if rule_name == "custom-class-any": result = f"{rule_name} ::= " result += "value" - type_list_rules = [] - return result, type_list_rules - else: - init_signature = inspect.signature(cls.__init__) - parameters = init_signature.parameters - result = f'{rule_name} ::= "{{"' - type_list_rules = [] - # Modify this comprehension too - members = [ - f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}' - for name, param in parameters.items() - if name != "self" and param.annotation != inspect.Parameter.empty - ] + return result - result += '", "'.join(members) - result += ' "}"' - return result, type_list_rules + init_signature = inspect.signature(cls.__init__) + parameters = init_signature.parameters + result = f'{rule_name} ::= "{{"' + # Modify this comprehension too + members = [ + f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}' + for name, param in parameters.items() + if name != "self" and param.annotation != inspect.Parameter.empty + ] + + result += '", "'.join(members) + result += ' "}"' + return result def regex_to_gbnf(regex_pattern: str) -> str: @@ -269,7 +272,7 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None def generate_gbnf_rule_for_type( model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None -) -> Tuple[str, list]: +) -> tuple[str, list[str]]: """ Generate GBNF rule for a given field type. @@ -283,7 +286,7 @@ def generate_gbnf_rule_for_type( :param field_info: Additional information about the field (optional). :return: Tuple containing the GBNF type and a list of additional rules. - :rtype: Tuple[str, list] + :rtype: tuple[str, list] """ rules = [] @@ -321,8 +324,7 @@ def generate_gbnf_rule_for_type( gbnf_type, rules = model_name + "-" + field_name, rules elif gbnf_type.startswith("custom-class-"): - nested_model_rules, field_types = get_members_structure(field_type, gbnf_type) - rules.append(nested_model_rules) + rules.append(get_members_structure(field_type, gbnf_type)) elif gbnf_type.startswith("custom-dict-"): key_type, value_type = get_args(field_type) @@ -341,14 +343,14 @@ def generate_gbnf_rule_for_type( union_rules = [] for union_type in union_types: - if isinstance(union_type, _GenericAlias): + if isinstance(union_type, GenericAlias): union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( model_name, field_name, union_type, False, processed_models, created_rules ) union_rules.append(union_gbnf_type) rules.extend(union_rules_list) - elif not issubclass(union_type, NoneType): + elif not issubclass(union_type, type(None)): union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( model_name, field_name, union_type, False, processed_models, created_rules ) @@ -424,14 +426,10 @@ def generate_gbnf_rule_for_type( else: gbnf_type, rules = gbnf_type, [] - if gbnf_type not in created_rules: - return gbnf_type, rules - else: - if gbnf_type in created_rules: - return gbnf_type, rules + return gbnf_type, rules -def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool): +def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[BaseModel]], created_rules: dict[str, list[str]]) -> tuple[list[str], bool]: """ Generate GBnF Grammar @@ -452,7 +450,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created ``` """ if model in processed_models: - return [] + return [], False processed_models.add(model) model_name = format_model_and_field_name(model.__name__) @@ -518,7 +516,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created def generate_gbnf_grammar_from_pydantic_models( - models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None, + models: list[type[BaseModel]], outer_object_name: str | None = None, outer_object_content: str | None = None, list_of_outputs: bool = False ) -> str: """ @@ -528,7 +526,7 @@ def generate_gbnf_grammar_from_pydantic_models( * grammar. Args: - models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from. + models (list[type[BaseModel]]): A list of Pydantic models to generate the grammar from. outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. list_of_outputs (str, optional): Allows a list of output objects @@ -543,9 +541,9 @@ def generate_gbnf_grammar_from_pydantic_models( # root ::= UserModel | PostModel # ... """ - processed_models = set() + processed_models: set[type[BaseModel]] = set() all_rules = [] - created_rules = {} + created_rules: dict[str, list[str]] = {} if outer_object_name is None: for model in models: model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules) @@ -608,7 +606,7 @@ def get_primitive_grammar(grammar): Returns: str: GBNF primitive grammar string. """ - type_list = [] + type_list: list[type[object]] = [] if "string-list" in grammar: type_list.append(str) if "boolean-list" in grammar: @@ -666,14 +664,14 @@ def get_primitive_grammar(grammar): def generate_markdown_documentation( - pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", + pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields", documentation_with_field_description=True ) -> str: """ Generate markdown documentation for a list of Pydantic models. Args: - pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. + pydantic_models (list[type[BaseModel]]): list of Pydantic model classes. model_prefix (str): Prefix for the model section. fields_prefix (str): Prefix for the fields section. documentation_with_field_description (bool): Include field descriptions in the documentation. @@ -731,7 +729,7 @@ def generate_markdown_documentation( def generate_field_markdown( - field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, + field_name: str, field_type: type[Any], model: type[BaseModel], depth=1, documentation_with_field_description=True ) -> str: """ @@ -739,8 +737,8 @@ def generate_field_markdown( Args: field_name (str): Name of the field. - field_type (Type[Any]): Type of the field. - model (Type[BaseModel]): Pydantic model class. + field_type (type[Any]): Type of the field. + model (type[BaseModel]): Pydantic model class. depth (int): Indentation depth in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation. @@ -798,7 +796,7 @@ def generate_field_markdown( return field_text -def format_json_example(example: dict, depth: int) -> str: +def format_json_example(example: dict[str, Any], depth: int) -> str: """ Format a JSON example into a readable string with indentation. @@ -819,14 +817,14 @@ def format_json_example(example: dict, depth: int) -> str: def generate_text_documentation( - pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", + pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields", documentation_with_field_description=True ) -> str: """ Generate text documentation for a list of Pydantic models. Args: - pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. + pydantic_models (list[type[BaseModel]]): List of Pydantic model classes. model_prefix (str): Prefix for the model section. fields_prefix (str): Prefix for the fields section. documentation_with_field_description (bool): Include field descriptions in the documentation. @@ -885,7 +883,7 @@ def generate_text_documentation( def generate_field_text( - field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, + field_name: str, field_type: type[Any], model: type[BaseModel], depth=1, documentation_with_field_description=True ) -> str: """ @@ -893,8 +891,8 @@ def generate_field_text( Args: field_name (str): Name of the field. - field_type (Type[Any]): Type of the field. - model (Type[BaseModel]): Pydantic model class. + field_type (type[Any]): Type of the field. + model (type[BaseModel]): Pydantic model class. depth (int): Indentation depth in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation. @@ -1017,8 +1015,8 @@ def generate_and_save_gbnf_grammar_and_documentation( pydantic_model_list, grammar_file_path="./generated_grammar.gbnf", documentation_file_path="./generated_grammar_documentation.md", - outer_object_name: str = None, - outer_object_content: str = None, + outer_object_name: str | None = None, + outer_object_content: str | None = None, model_prefix: str = "Output Model", fields_prefix: str = "Output Fields", list_of_outputs: bool = False, @@ -1053,8 +1051,8 @@ def generate_and_save_gbnf_grammar_and_documentation( def generate_gbnf_grammar_and_documentation( pydantic_model_list, - outer_object_name: str = None, - outer_object_content: str = None, + outer_object_name: str | None = None, + outer_object_content: str | None = None, model_prefix: str = "Output Model", fields_prefix: str = "Output Fields", list_of_outputs: bool = False, @@ -1086,9 +1084,9 @@ def generate_gbnf_grammar_and_documentation( def generate_gbnf_grammar_and_documentation_from_dictionaries( - dictionaries: List[dict], - outer_object_name: str = None, - outer_object_content: str = None, + dictionaries: list[dict[str, Any]], + outer_object_name: str | None = None, + outer_object_content: str | None = None, model_prefix: str = "Output Model", fields_prefix: str = "Output Fields", list_of_outputs: bool = False, @@ -1098,7 +1096,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries( Generate GBNF grammar and documentation from a list of dictionaries. Args: - dictionaries (List[dict]): List of dictionaries representing Pydantic models. + dictionaries (list[dict]): List of dictionaries representing Pydantic models. outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. model_prefix (str): Prefix for the model section in the documentation. @@ -1120,7 +1118,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries( return grammar, documentation -def create_dynamic_model_from_function(func: Callable): +def create_dynamic_model_from_function(func: Callable[..., Any]): """ Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method. @@ -1135,6 +1133,7 @@ def create_dynamic_model_from_function(func: Callable): sig = inspect.signature(func) # Parse the docstring + assert func.__doc__ is not None docstring = parse(func.__doc__) dynamic_fields = {} @@ -1157,7 +1156,6 @@ def create_dynamic_model_from_function(func: Callable): f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring") # Add parameter details to the schema - param_doc = next((d for d in docstring.params if d.arg_name == param.name), None) param_docs.append((param.name, param_doc)) if param.default == inspect.Parameter.empty: default_value = ... @@ -1166,10 +1164,10 @@ def create_dynamic_model_from_function(func: Callable): dynamic_fields[param.name] = ( param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) # Creating the dynamic model - dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) + dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload] - for param_doc in param_docs: - dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description + for name, param_doc in param_docs: + dynamic_model.model_fields[name].description = param_doc.description dynamic_model.__doc__ = docstring.short_description @@ -1182,16 +1180,16 @@ def run_method_wrapper(self): return dynamic_model -def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): +def add_run_method_to_dynamic_model(model: type[BaseModel], func: Callable[..., Any]): """ Add a 'run' method to a dynamic Pydantic model, using the provided function. Args: - model (Type[BaseModel]): Dynamic Pydantic model class. + model (type[BaseModel]): Dynamic Pydantic model class. func (Callable): Function to be added as a 'run' method to the model. Returns: - Type[BaseModel]: Pydantic model class with the added 'run' method. + type[BaseModel]: Pydantic model class with the added 'run' method. """ def run_method_wrapper(self): @@ -1204,15 +1202,15 @@ def run_method_wrapper(self): return model -def create_dynamic_models_from_dictionaries(dictionaries: List[dict]): +def create_dynamic_models_from_dictionaries(dictionaries: list[dict[str, Any]]): """ Create a list of dynamic Pydantic model classes from a list of dictionaries. Args: - dictionaries (List[dict]): List of dictionaries representing model structures. + dictionaries (list[dict]): List of dictionaries representing model structures. Returns: - List[Type[BaseModel]]: List of generated dynamic Pydantic model classes. + list[type[BaseModel]]: List of generated dynamic Pydantic model classes. """ dynamic_models = [] for func in dictionaries: @@ -1249,7 +1247,7 @@ def list_to_enum(enum_name, values): return Enum(enum_name, {value: value for value in values}) -def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]: +def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: str = "CustomModel") -> type[Any]: """ Convert a dictionary to a Pydantic model class. @@ -1258,9 +1256,9 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu model_name (str): Name of the generated Pydantic model. Returns: - Type[BaseModel]: Generated Pydantic model class. + type[BaseModel]: Generated Pydantic model class. """ - fields = {} + fields: dict[str, Any] = {} if "properties" in dictionary: for field_name, field_data in dictionary.get("properties", {}).items(): @@ -1277,7 +1275,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu if items != {}: array = {"properties": items} array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") - fields[field_name] = (List[array_type], ...) + fields[field_name] = (List[array_type], ...) # type: ignore[valid-type] else: fields[field_name] = (list, ...) elif field_type == "object": From 5eaf9964fc797d4585c214db32a463d557f3ed33 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Fri, 26 Jan 2024 05:06:22 +0900 Subject: [PATCH 5/7] llama : dynamic temperature sampling (#4972) * implemented dynamic temperature sampling from koboldcpp * removed trailing whitespace * removed unused temp parameter in llama_sample_entropy * exposed exponent_val in dynamic temp sampler * added debug check for printf statements * use nullptr in llama_sample_softmax call during llama_sample_entropy this avoids counting the time taken stats twice Co-authored-by: Georgi Gerganov * return earlier if there is only 1 candiate (i.e. max_entropy == 0) * reformat 't' case in llama_sample_queue Co-authored-by: Jared Van Bortel * check for one or zero candidates case in llama_sample_entropy --------- Co-authored-by: Georgi Gerganov Co-authored-by: Jared Van Bortel --- common/sampling.cpp | 12 +++++++- common/sampling.h | 2 ++ llama.cpp | 67 +++++++++++++++++++++++++++++++++++++++++++++ llama.h | 8 ++++++ 4 files changed, 88 insertions(+), 1 deletion(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index dd1ffeb1b8083..efd7eab6e50b8 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -129,6 +129,8 @@ static void sampler_queue( const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); const float temp = params.temp; + const float dynatemp_range = params.dynatemp_range; + const float dynatemp_exponent = params.dynatemp_exponent; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const float top_p = params.top_p; const float min_p = params.min_p; @@ -143,7 +145,15 @@ static void sampler_queue( case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; - case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; + case 't': + if (dynatemp_range > 0) { + float dynatemp_min = std::max(0.0f, temp - dynatemp_range); + float dynatemp_max = std::max(0.0f, temp + dynatemp_range); + llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent); + } else { + llama_sample_temp(ctx_main, &cur_p, temp); + } + break; default : break; } } diff --git a/common/sampling.h b/common/sampling.h index 2ee1803761aa2..88899c094866e 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -18,6 +18,8 @@ typedef struct llama_sampling_params { float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float penalty_repeat = 1.10f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled diff --git a/llama.cpp b/llama.cpp index 6a7506e85f8ec..823d42d7fb50a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8151,6 +8151,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } } +void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) { + const int64_t t_start_sample_us = ggml_time_us(); + + // no need to do anything if there is only one (or zero) candidates + if(candidates_p->size <= 1) { + return; + } + + // Calculate maximum possible entropy + float max_entropy = -logf(1.0f / candidates_p->size); + + llama_sample_softmax(nullptr, candidates_p); + + // Calculate entropy of the softmax probabilities + float entropy = 0.0f; + for (size_t i = 0; i < candidates_p->size; ++i) { + float prob = candidates_p->data[i].p; + if (prob > 0.0f) { // Ensure no log(0) + entropy -= prob * logf(prob); + } + } + + // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above) + float normalized_entropy = entropy / max_entropy; + + // Map the normalized entropy to the desired temperature range using the power function + float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); + +#ifdef DEBUG + LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp); + LLAMA_LOG_INFO("Entropy: %f\n", entropy); + LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy); + LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy); + LLAMA_LOG_INFO("Exponent: %f\n", exponent_val); + LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); +#endif + + // Apply the dynamically calculated temperature scaling + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].logit /= dyn_temp; + } + + // Re-compute softmax probabilities after scaling logits with dynamic temperature + double max_l_double = candidates_p->data[0].logit; + double cum_sum_double = 0.0; + for (size_t i = 0; i < candidates_p->size; ++i) { + double p = exp(candidates_p->data[i].logit - max_l_double); + candidates_p->data[i].p = p; // Store the scaled probability + cum_sum_double += p; + } + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities + } + +#ifdef DEBUG + // Print the updated top 25 probabilities after temperature scaling + LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); + for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) { + LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f); + } +#endif + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { const int64_t t_start_sample_us = ggml_time_us(); diff --git a/llama.h b/llama.h index bb60545575932..7b3634aa68505 100644 --- a/llama.h +++ b/llama.h @@ -775,6 +775,14 @@ extern "C" { float p, size_t min_keep); + /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. + LLAMA_API void llama_sample_entropy( + struct llama_context * ctx, + llama_token_data_array * candidates_p, + float min_temp, + float max_temp, + float exponent_val); + LLAMA_API void llama_sample_temp( struct llama_context * ctx, llama_token_data_array * candidates, From fe54033b69b83164cabb5f3ed92dc0ff7ea47605 Mon Sep 17 00:00:00 2001 From: XiaotaoChen Date: Fri, 26 Jan 2024 04:14:32 +0800 Subject: [PATCH 6/7] readme : add MobileVLM 1.7B/3B to the supported models list (#5107) Co-authored-by: Chenxiaotao03 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index cbfba01bc4b62..c2a33022cdfc3 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,7 @@ as the main playground for developing new features for the [ggml](https://github - [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava) - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5) - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) +- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) **Bindings:** From 1182cf4d4f6ee383b92695c2e3fe438086dcdba7 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Fri, 26 Jan 2024 09:14:39 +0200 Subject: [PATCH 7/7] Another bucket sort (#5109) * Initial bucket sort * Bucket sort: slightly better version * Bucket sort: another minor improvement --------- Co-authored-by: Iwan Kawrakow --- llama.cpp | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 823d42d7fb50a..b03b67e169955 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7956,10 +7956,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; }; - if (k == (int) candidates->size) { - std::sort(candidates->data, candidates->data + candidates->size, comp); - } else { + if (k <= 128) { std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); + } else { + constexpr int nbuckets = 128; + constexpr float bucket_low = -10.0f; + constexpr float bucket_high = 10.0f; + constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); + constexpr float bucker_inter = -bucket_low * bucket_scale; + + std::vector bucket_idx(candidates->size); + std::vector histo(nbuckets, 0); + + for (int i = 0; i < (int)candidates->size; ++i) { + const float val = candidates->data[i].logit; + int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); + ib = std::max(0, std::min(nbuckets-1, ib)); + bucket_idx[i] = ib; + ++histo[ib]; + } + int nhave = 0; + int ib = nbuckets - 1; + for ( ; ib >= 0; --ib) { + nhave += histo[ib]; + if (nhave >= k) break; + } + std::vector tmp_tokens(nhave); + auto ptr = tmp_tokens.data(); + std::vector bucket_ptrs; + bucket_ptrs.reserve(nbuckets - ib); + for (int j = nbuckets - 1; j >= ib; --j) { + bucket_ptrs.push_back(ptr); + ptr += histo[j]; + } + for (int i = 0; i < (int)candidates->size; ++i) { + int j = bucket_idx[i]; + if (j >= ib) { + *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i]; + } + } + + ptr = tmp_tokens.data(); + int ndone = 0; + for (int j = nbuckets-1; j > ib; --j) { + std::sort(ptr, ptr + histo[j], comp); + ptr += histo[j]; + ndone += histo[j]; + } + std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp); + + std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data)); + } candidates->sorted = true; }