From 77f4b996ed379d17180714c073596f65cb47b446 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Fri, 24 Nov 2023 19:09:19 -0500 Subject: [PATCH] working Naming the unnamed ggml structures Here we add names for the nested structures of ggml adding print statements to main. This inserts the print probes at key points adding include for refl working now server has it --- .gitignore | 3 + CMakeLists.txt | 31 +- Makefile | 24 +- binding.py | 337 +++++++++++ examples/main/main.cpp | 21 +- examples/server/server.cpp | 120 +++- ggml-alloc.c => ggml-alloc.cpp | 8 +- ggml-backend.c => ggml-backend.cpp | 24 +- ggml-impl.h | 2 +- ggml-internal.hpp | 258 +++++++++ ggml-mpi.c => ggml-mpi.cpp | 0 ggml-quants.c => ggml-quants.cpp | 575 +++++++++--------- ggml-quants.h | 102 ++-- ggml.c => ggml.cpp | 319 +++++----- ggml.h | 18 +- llama-internal.hpp | 896 +++++++++++++++++++++++++++++ llama.h | 8 + print.hpp | 763 ++++++++++++++++++++++++ tests/CMakeLists.txt | 4 +- tests/{test-c.c => test-c.cpp} | 0 20 files changed, 2976 insertions(+), 537 deletions(-) create mode 100644 binding.py rename ggml-alloc.c => ggml-alloc.cpp (98%) rename ggml-backend.c => ggml-backend.cpp (96%) create mode 100644 ggml-internal.hpp rename ggml-mpi.c => ggml-mpi.cpp (100%) rename ggml-quants.c => ggml-quants.cpp (93%) rename ggml.c => ggml.cpp (98%) create mode 100644 llama-internal.hpp create mode 100644 print.hpp rename tests/{test-c.c => test-c.cpp} (100%) diff --git a/.gitignore b/.gitignore index 41259a12f50cb..f8a2a2dae5902 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,6 @@ tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe +/#llama.cpp# +#* +\\#* diff --git a/CMakeLists.txt b/CMakeLists.txt index f32df5fe52335..839aad003ca32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,7 +104,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" # Compile flags # -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) @@ -230,7 +230,12 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) - add_compile_definitions(GGML_USE_OPENBLAS) + + # from https://github.com/NVIDIA/cutlass + make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp") + set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags}) + + # add_compile_definitions(GGML_USE_OPENBLAS) if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) add_compile_definitions(GGML_BLAS_USE_MKL) endif() @@ -312,7 +317,7 @@ if (LLAMA_MPI) if (MPI_C_FOUND) message(STATUS "MPI found") set(GGML_HEADERS_MPI ggml-mpi.h) - set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h) + set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h) add_compile_definitions(GGML_USE_MPI) add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) if (NOT MSVC) @@ -438,6 +443,9 @@ if (NOT cuda_host_flags STREQUAL "") set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags}) endif() +# +set(cuda_flags --verbose -G ${cuda_flags}) + add_compile_options("$<$:${cuda_flags}>") if (WIN32) @@ -485,8 +493,10 @@ if (NOT MSVC) add_link_options(-static-libgcc -static-libstdc++) endif() endif() + add_link_options("-Wl,-Map=${TARGET}.map") + if (LLAMA_GPROF) - add_compile_options(-pg) + add_compile_options(-pg) endif() endif() @@ -645,13 +655,16 @@ if (GGML_USE_CPU_HBM) endif() add_library(ggml OBJECT - ggml.c + 
ggml.cpp ggml.h - ggml-alloc.c + print.hpp + ggml-internal.hpp + llama-internal.hpp + ggml-alloc.cpp ggml-alloc.h - ggml-backend.c + ggml-backend.cpp ggml-backend.h - ggml-quants.c + ggml-quants.cpp ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} @@ -683,7 +696,7 @@ add_library(llama ) target_include_directories(llama PUBLIC .) -target_compile_features(llama PUBLIC cxx_std_11) # don't bump +target_compile_features(llama PUBLIC cxx_std_20) # don't bump target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS} diff --git a/Makefile b/Makefile index a6d2c2ec0f380..3fe2af3d8beef 100644 --- a/Makefile +++ b/Makefile @@ -116,7 +116,7 @@ endif # keep standard at C11 and C++11 MK_CPPFLAGS = -I. -Icommon MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++11 -fPIC +MK_CXXFLAGS = -std=c++20 -fPIC -fpermissive -DCPP_ONLY # -Ofast tends to produce faster code, but may not be available for some compilers. ifdef LLAMA_FAST @@ -502,7 +502,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h endif # LLAMA_METAL ifdef LLAMA_MPI -ggml-mpi.o: ggml-mpi.c ggml-mpi.h +ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI @@ -537,17 +537,17 @@ $(info ) # Build library # -ggml.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml.o: ggml.cpp ggml.h ggml-cuda.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h + $(CXX) $(CXXFLAGS) -c $< -o $@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o @@ -734,5 +734,5 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-c.o: tests/test-c.c llama.h - $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ +tests/test-c.o: tests/test-c.cpp llama.h + $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ diff --git a/binding.py b/binding.py new file mode 100644 index 0000000000000..217dce6846340 --- /dev/null +++ b/binding.py @@ -0,0 +1,337 @@ +import os +import json +import re +import clang.cindex + +# configurable part + +CLANG_VERSION='13.0.1' +# homebrew installs for llvm (brew info llvm gives details): +# x64: /usr/local/opt/llvm/lib +# arm64: /opt/homebrew/opt/llvm/lib +llvmLibPath = "/usr/lib/llvm-15/lib/" + +cxxClientRoot = "/home/mdupont/experiments/llama.cpp/" + +fileList = [ +# "ggml.cpp", +# "llama.cpp", + "examples/server/server.cpp", +] + +typeList = [ +] + +# end of configurable part + +clang.cindex.Config.set_library_path(llvmLibPath) + + +def list_headers_in_dir(path): + # enumerates a folder but keeps the full pathing for the files returned + # and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx) + + # list all the files in the folder + files = os.listdir(path) + # only include .hxx files + files = list(filter(lambda x: x.endswith('.hxx'), files)) + # add the folder path back on + files = list(map(lambda x: path + x, files)) + return files + + +# parse through the list of files specified and expand wildcards +fullFileList = [] 
+for filePath in fileList: + if "*" in filePath: + # wildcard path + basePath = filePath[:-1] + if "*" in basePath: + # if there is still a wildcard, we have an issue... + raise NotImplementedError( + "wildcard only supported at end of file path") + files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath)) + fullFileList = fullFileList + files + else: + # normal path + ff = os.path.join(cxxClientRoot, filePath) + fullFileList.append(ff) + print("DBUG",ff) +# exclude _json.hxx files +fullFileList = list( + filter(lambda x: not x.endswith('_json.hxx'), fullFileList)) +# exclude _fmt.hxx files +fullFileList = list( + filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList)) + + +# generate a list of regexps from the type list (for handling wildcards) +typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList)) + + +def is_included_type(name, with_durability=False): + + # TODO(brett19): This should be generalized somehow... + if "is_compound_operation" in name: + return False + + if "replica_context" in name: + return False + + if with_durability is True and '_with_legacy_durability' not in name: + return False + + for x in typeListRe: + if re.fullmatch(x, name): + return True + return False + + +opTypes = [] +opEnums = [] + + +def parse_type(type): + typeStr = type.get_canonical().spelling + return parse_type_str(typeStr) + +std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"] + +def parse_type_str(typeStr): + if typeStr == "std::mutex": + return {"name": "std::mutex"} + if typeStr == "std::string": + return {"name": "std::string"} + if typeStr == "std::chrono::duration": + return {"name": "std::chrono::seconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::milliseconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::microseconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::nanoseconds"} + if typeStr == "std::error_code": + return {"name": "std::error_code"} + if typeStr == "std::monostate": + return {"name": "std::monostate"} + if typeStr == "std::byte": + return {"name": "std::byte"} + if typeStr == "unsigned long": + return {"name": "std::size_t"} + if typeStr == "char": + return {"name": "std::int8_t"} + if typeStr == "unsigned char": + return {"name": "std::uint8_t"} + if typeStr == "short": + return {"name": "std::int16_t"} + if typeStr == "unsigned short": + return {"name": "std::uint16_t"} + if typeStr == "int": + return {"name": "std::int32_t"} + if typeStr == "unsigned int": + return {"name": "std::uint32_t"} + if typeStr == "long long": + return {"name": "std::int64_t"} + if typeStr == "unsigned long long": + return {"name": "std::uint64_t"} + if typeStr == "bool": + return {"name": "std::bool"} + if typeStr == "float": + return {"name": "std::float"} + if typeStr == "double": + return {"name": "std::double"} + if typeStr == "std::nullptr_t": + return {"name": "std::nullptr_t"} + if typeStr in std_comparators: + return {"name": typeStr} + + tplParts = typeStr.split("<", 1) + if len(tplParts) > 1: + tplClassName = tplParts[0] + tplParams = tplParts[1][:-1] + if tplClassName == "std::function": + return { + "name": "std::function" + } + if tplClassName == "std::optional": + return { + "name": "std::optional", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::vector": + return { + "name": "std::vector", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::set": + return { + "name": "std::set", + "of": 
parse_type_str(tplParams) + } + if tplClassName == "std::variant": + variantParts = tplParams.split(", ") + variantTypes = [] + for variantPart in variantParts: + variantTypes.append(parse_type_str(variantPart)) + return { + "name": "std::variant", + "of": variantTypes + } + if tplClassName == "std::array": + variantParts = tplParams.split(", ") + if len(variantParts) != 2: + print("FAILED TO PARSE ARRAY TYPES: " + typeStr) + return {"name": "unknown", "str": typeStr} + return { + "name": "std::array", + "of": parse_type_str(variantParts[0]), + "size": int(variantParts[1]) + } + if tplClassName == "std::map": + variantParts = tplParams.split(", ") + if len(variantParts) < 2 or len(variantParts) > 3: + print("FAILED TO PARSE MAP TYPES: " + typeStr) + return {"name": "unknown", "str": typeStr} + + if len(variantParts) == 2: + return { + "name": "std::map", + "of": parse_type_str(variantParts[0]), + "to": parse_type_str(variantParts[1]) + } + else: + return { + "name": "std::map", + "of": parse_type_str(variantParts[0]), + "to": parse_type_str(variantParts[1]), + "comparator": parse_type_str(variantParts[2]) + } + + if tplClassName == "std::shared_ptr": + return { + "name": "std::shared_ptr", + "of": parse_type_str(tplParams) + } + + #return {"name": "unknown", "str": typeStr} + + if 'unnamed struct' in typeStr: + print("WARNING: Found unnamed struct: " + typeStr) + + return {"name": typeStr} + +internal_structs = [] +UNNAMED_STRUCT_DELIM = '::(unnamed struct' + +def traverse(node, namespace, main_file): + # only scan the elements of the file we parsed + + + if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL: + fullStructName = "::".join([*namespace, node.displayname]) + print("#FILE", node.location.file ) + print("REFL_TYPE(" + fullStructName + ")") + + structFields = [] + for child in node.get_children(): + if child.kind == clang.cindex.CursorKind.FIELD_DECL: + struct_type = parse_type(child.type) + type_str = child.type.get_canonical().spelling + print(" REFL_FIELD(" + child.displayname + ")") + if 'unnamed' in type_str: + name_tokens = type_str.split('::') + name_override = '::'.join(name_tokens[:-1] + [child.displayname]) + struct_type['name'] = name_override + internal_structs.append(name_override) + + structFields.append({ + "name": child.displayname, + "type": struct_type, + }) + # replica read changes introduced duplicate get requests + #if any(map(lambda op: op['name'] == fullStructName, opTypes)): + # return + + #opTypes.append({ + # "name": fullStructName, + # "fields": structFields, + #}) + print("REFL_END") + + + if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL: + fullStructName = "::".join([*namespace, node.displayname]) + if is_included_type(fullStructName, with_durability=True): + type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None) + if type_ref: + base_request_name = type_ref.displayname.replace('struct', '').strip() + base_request = next((op for op in opTypes if op['name'] == base_request_name), None) + if base_request: + new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level'] + new_fields.extend([ + {"name":"persist_to", "type":{"name":"couchbase::persist_to"}}, + {"name":"replicate_to", "type":{"name":"couchbase::replicate_to"}} + ]) + + opTypes.append({ + "name": fullStructName, + "fields": new_fields + }) + if node.kind == clang.cindex.CursorKind.ENUM_DECL: + fullEnumName = "::".join([*namespace, node.displayname]) + if 
is_included_type(fullEnumName): + enumValues = [] + + for child in node.get_children(): + if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL: + enumValues.append({ + "name": child.displayname, + "value": child.enum_value, + }) + opEnums.append({ + "name": fullEnumName, + "type": parse_type(node.enum_type), + "values": enumValues, + }) + + if node.kind == clang.cindex.CursorKind.NAMESPACE: + namespace = [*namespace, node.displayname] + if node.kind == clang.cindex.CursorKind.CLASS_DECL: + namespace = [*namespace, node.displayname] + if node.kind == clang.cindex.CursorKind.STRUCT_DECL: + namespace = [*namespace, node.displayname] + + for child in node.get_children(): + traverse(child, namespace, main_file) + +for headerPath in fullFileList: + print("processing " + headerPath) + index = clang.cindex.Index.create() + args = [ + '-std=c++17', + ] + + try: + translation_unit = index.parse(headerPath, args=args) + except Exception as e: + print(e) + import pdb + pdb.set_trace() + raise e + + # output clang compiler diagnostics information (for debugging) + + for diagnostic in translation_unit.diagnostics: + diagnosticMsg = diagnostic.format() + print(diagnostic) + + traverse(translation_unit.cursor, [], headerPath) + +jsonData = json.dumps({ + 'op_structs': opTypes, + 'op_enums': opEnums +}) + +f = open("bindings.json", "w") +f.write(jsonData) +f.close() diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 31ec8cade19be..18d2d03c09196 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -31,6 +31,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#include "print.hpp" + static llama_context ** g_ctx; static llama_model ** g_model; static gpt_params * g_params; @@ -99,6 +101,7 @@ static void sigint_handler(int signo) { } } #endif +using namespace refl; int main(int argc, char ** argv) { gpt_params params; @@ -117,7 +120,8 @@ int main(int argc, char ** argv) { // TODO: Dump params ? 
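[Note on the print_fields() probes inserted below: they come from the new print.hpp added by this patch, whose body is not shown in this hunk. A minimal, self-contained sketch of how such a printer can be built on refl-cpp (the library behind the REFL_TYPE/REFL_FIELD macros used throughout this patch) is given here as an assumption of its shape, not its actual contents; demo_params is a hypothetical stand-in for gpt_params or llama_context.

    #include <iostream>
    #include "refl.hpp"                      // single-header refl-cpp library

    struct demo_params { int n_ctx; float temp; };   // hypothetical stand-in
    REFL_TYPE(demo_params)
        REFL_FIELD(n_ctx)
        REFL_FIELD(temp)
    REFL_END

    template <typename T>
    void print_fields(const T & obj) {
        // walk every member registered via REFL_TYPE/REFL_FIELD and dump "name = value"
        refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
            std::cout << member.name.str() << " = " << member(obj) << "\n";
        });
    }

    int main() {
        print_fields(demo_params{512, 0.8f});
        return 0;
    }
]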
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); - + print_fields(params); + // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); @@ -234,6 +238,8 @@ int main(int argc, char ** argv) { std::vector embd_inp; + print_fields(*model); + if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); if (params.chatml) { @@ -277,7 +283,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } - + print_fields(*ctx); + //print_fields(session_tokens); // debug message about similarity of saved session, if applicable size_t n_matching_session_tokens = 0; if (!session_tokens.empty()) { @@ -365,6 +372,10 @@ int main(int argc, char ** argv) { for (int i = 0; i < (int) guidance_inp.size(); i++) { LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } + + print_fields(*ctx_guidance); + + } if (params.n_keep > 0) { @@ -473,7 +484,8 @@ int main(int argc, char ** argv) { std::vector embd_guidance; struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - + print_fields(*ctx_sampling); + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (!embd.empty()) { @@ -508,6 +520,7 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); + print_fields(*ctx); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); @@ -624,7 +637,7 @@ int main(int argc, char ** argv) { } const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - + //print_fields(id); llama_sampling_accept(ctx_sampling, ctx, id, true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 50f124b13e849..a42bba9b6d54f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -24,6 +24,7 @@ #include #include #include +#include "print.hpp" #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -33,6 +34,9 @@ using json = nlohmann::json; +REFL_TYPE(std::less< ::nlohmann::detail::value_t>) +REFL_END + struct server_params { std::string hostname = "127.0.0.1"; @@ -41,6 +45,13 @@ struct server_params int32_t read_timeout = 600; int32_t write_timeout = 600; }; +REFL_TYPE(server_params) + REFL_FIELD(hostname) + REFL_FIELD(public_path) + REFL_FIELD(port) + REFL_FIELD(read_timeout) + REFL_FIELD(write_timeout) +REFL_END static bool server_verbose = false; @@ -157,6 +168,15 @@ struct task_server { bool embedding_mode = false; }; +REFL_TYPE(task_server) + REFL_FIELD(id) + REFL_FIELD(target_id) + REFL_FIELD(type) + REFL_FIELD(data) + REFL_FIELD(infill_mode) + REFL_FIELD(embedding_mode) +REFL_END + struct task_result { int id; bool stop; @@ -193,6 +213,18 @@ struct slot_params json input_suffix; }; +REFL_TYPE(slot_params) + REFL_FIELD(stream) + REFL_FIELD(cache_prompt) + REFL_FIELD(seed) + REFL_FIELD(n_keep) + REFL_FIELD(n_predict) + REFL_FIELD(antiprompt) + REFL_FIELD(input_prefix) + REFL_FIELD(input_suffix) +REFL_END + + struct slot_image { int32_t id; @@ -220,6 +252,17 @@ struct completion_token_output std::string 
text_to_send; }; +REFL_TYPE(completion_token_output) + REFL_FIELD(probs) + REFL_FIELD(tok) + REFL_FIELD(text_to_send) +REFL_END + +REFL_TYPE(completion_token_output::token_prob) + REFL_FIELD(tok) + REFL_FIELD(prob) +REFL_END + static size_t common_part(const std::vector &a, const std::vector &b) { size_t i; @@ -496,6 +539,51 @@ struct llama_client_slot } }; +//REFL_TYPE(llama_client_slot::llama_sampling_params) +//REFL_END + +REFL_TYPE(llama_client_slot) + REFL_FIELD(id) + REFL_FIELD(task_id) + REFL_FIELD(params) + REFL_FIELD(state) + REFL_FIELD(command) + REFL_FIELD(t_last_used) + REFL_FIELD(n_ctx) + REFL_FIELD(n_past) + REFL_FIELD(n_decoded) + REFL_FIELD(n_remaining) + REFL_FIELD(i_batch) + REFL_FIELD(num_prompt_tokens) + REFL_FIELD(num_prompt_tokens_processed) + REFL_FIELD(multibyte_pending) + REFL_FIELD(prompt) + REFL_FIELD(generated_text) + REFL_FIELD(sampled) + REFL_FIELD(cache_tokens) + REFL_FIELD(generated_token_probs) + REFL_FIELD(infill) + REFL_FIELD(embedding) + REFL_FIELD(has_next_token) + REFL_FIELD(truncated) + REFL_FIELD(stopped_eos) + REFL_FIELD(stopped_word) + REFL_FIELD(stopped_limit) + REFL_FIELD(oaicompat) + REFL_FIELD(oaicompat_model) + REFL_FIELD(stopping_word) + REFL_FIELD(sparams) + REFL_FIELD(ctx_sampling) + REFL_FIELD(images) + REFL_FIELD(sent_count) + REFL_FIELD(sent_token_probs_index) + REFL_FIELD(t_start_process_prompt) + REFL_FIELD(t_start_genereration) + REFL_FIELD(t_prompt_processing) + REFL_FIELD(t_token_generation) +REFL_END + + struct llama_server_context { llama_model *model = nullptr; @@ -878,7 +966,7 @@ struct llama_server_context all_slots_are_idle = false; LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); - + print_fields(*slot); return true; } @@ -1787,6 +1875,31 @@ struct llama_server_context } }; +REFL_TYPE(llama_server_context) + REFL_FIELD(model) + REFL_FIELD(ctx) + REFL_FIELD(clp_ctx) + REFL_FIELD(params) + REFL_FIELD(batch) + REFL_FIELD(multimodal) + REFL_FIELD(clean_kv_cache) + REFL_FIELD(all_slots_are_idle) + REFL_FIELD(add_bos_token) + REFL_FIELD(id_gen) + REFL_FIELD(n_ctx) + REFL_FIELD(system_need_update) + REFL_FIELD(system_prompt) + REFL_FIELD(system_tokens) + REFL_FIELD(name_user) + REFL_FIELD(name_assistant) + REFL_FIELD(slots) + REFL_FIELD(queue_tasks) + REFL_FIELD(queue_results) + REFL_FIELD(mutex_tasks) + REFL_FIELD(mutex_results) +REFL_END + + static void server_print_usage(const char *argv0, const gpt_params ¶ms, const server_params &sparams) { @@ -2497,6 +2610,11 @@ struct token_translator std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } }; + +REFL_TYPE(token_translator) + REFL_FIELD(ctx) +REFL_END + static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot) { auto & gtps = slot->generated_token_probs; diff --git a/ggml-alloc.c b/ggml-alloc.cpp similarity index 98% rename from ggml-alloc.c rename to ggml-alloc.cpp index cdfe4caf69613..46f4c9bd73d45 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.cpp @@ -386,7 +386,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) { free(galloc->parse_seq); - galloc->parse_seq = malloc(sizeof(int) * n); + galloc->parse_seq = (int*)malloc(sizeof(int) * n); for (int i = 0; i < n; i++) { galloc->parse_seq[i] = list[i]; @@ -646,9 +646,9 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st if (galloc->hash_values != NULL) { free(galloc->hash_values); } - 
galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size); + galloc->hash_set.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * hash_size); galloc->hash_set.size = hash_size; - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size); } // reset hash table @@ -674,7 +674,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap // alloc hash_values if needed if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) { free(galloc->hash_values); - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size); galloc->hash_values_size = hash_size; } diff --git a/ggml-backend.c b/ggml-backend.cpp similarity index 96% rename from ggml-backend.c rename to ggml-backend.cpp index f6e5fceed0f4d..47b60cb1e284e 100644 --- a/ggml-backend.c +++ b/ggml-backend.cpp @@ -20,7 +20,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init( struct ggml_backend_buffer_i iface, ggml_backend_buffer_context_t context, size_t size) { - ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); + ggml_backend_buffer_t buffer = (ggml_backend_buffer*)malloc(sizeof(struct ggml_backend_buffer)); GGML_ASSERT(iface.get_base != NULL); @@ -195,9 +195,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst // TODO: allow backends to support copy to/from same backend if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) { - ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst); + ggml_get_backend(dst)->iface.cpy_tensor_from((ggml_backend_t)ggml_get_backend(dst)->context, src, dst); } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) { - ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst); + ggml_get_backend(src)->iface.cpy_tensor_to((ggml_backend_t)ggml_get_backend(src)->context, src, dst); } else { // shouldn't be hit when copying from/to CPU #ifndef NDEBUG @@ -316,13 +316,13 @@ struct ggml_backend_plan_cpu { static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); + struct ggml_backend_plan_cpu * cpu_plan = (ggml_backend_plan_cpu*)malloc(sizeof(struct ggml_backend_plan_cpu)); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); cpu_plan->cgraph = *cgraph; if (cpu_plan->cplan.work_size > 0) { - cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + cpu_plan->cplan.work_data = (uint8_t*)malloc(cpu_plan->cplan.work_size); } return cpu_plan; @@ -356,7 +356,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c cpu_ctx->work_size = cplan.work_size; } - cplan.work_data = cpu_ctx->work_data; + cplan.work_data = (uint8_t*)cpu_ctx->work_data; ggml_graph_compute(cgraph, &cplan); } @@ -385,13 +385,13 @@ static struct ggml_backend_i cpu_backend_i = { }; ggml_backend_t ggml_backend_cpu_init(void) { - struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + struct ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context*)malloc(sizeof(struct ggml_backend_cpu_context)); ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->work_data = NULL; 
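[Note on the casts visible in this hunk: these files compiled as C before the rename, where malloc's void* result converts implicitly; compiled as C++ that conversion is ill-formed, so every allocation gains an explicit cast. A minimal sketch of the difference, reusing the hash_node struct defined in ggml-internal.hpp earlier in this patch:

    #include <cstdlib>

    struct hash_node { int n_children; int n_views; };

    int main() {
        // In C:   hash_node * values = malloc(8 * sizeof(hash_node));   // implicit conversion, OK
        // In C++: that same line fails to compile, hence the explicit cast below.
        hash_node * values = (hash_node *) std::malloc(8 * sizeof(hash_node));
        std::free(values);
        return 0;
    }
]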
ctx->work_size = 0; - ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + ggml_backend_t cpu_backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend)); *cpu_backend = (struct ggml_backend) { /* .interface = */ cpu_backend_i, @@ -869,7 +869,7 @@ static void sched_reset(ggml_backend_sched_t sched) { ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) { GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS); - struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched)); + struct ggml_backend_sched * sched = (ggml_backend_sched*)malloc(sizeof(struct ggml_backend_sched)); memset(sched, 0, sizeof(struct ggml_backend_sched)); fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024); @@ -907,9 +907,9 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr // initialize hash tables size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS; sched->hash_set.size = hash_size; - sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size); - sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size); - sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size); + sched->hash_set.keys = (ggml_tensor**)malloc(sizeof(sched->hash_set.keys[0]) * hash_size); + sched->node_talloc = (ggml_tallocr**)malloc(sizeof(sched->node_talloc[0]) * hash_size); + sched->node_copies = (ggml_tensor *(*)[4])malloc(sizeof(sched->node_copies[0]) * hash_size); sched_split_graph(sched, measure_graph); sched_alloc_splits(sched); diff --git a/ggml-impl.h b/ggml-impl.h index 06c07339e9269..1bf20a4af3985 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -22,7 +22,7 @@ extern "C" { #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #define static_assert(cond, msg) _Static_assert(cond, msg) #else -#define static_assert(cond, msg) struct global_scope_noop_trick + //#define static_assert(cond, msg) struct global_scope_noop_trick #endif #endif diff --git a/ggml-internal.hpp b/ggml-internal.hpp new file mode 100644 index 0000000000000..0725451fcbd3e --- /dev/null +++ b/ggml-internal.hpp @@ -0,0 +1,258 @@ +struct ggml_context { + size_t mem_size; + void * mem_buffer; + bool mem_buffer_owned; + bool no_alloc; + bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers + + int n_objects; + + struct ggml_object * objects_begin; + struct ggml_object * objects_end; + + struct ggml_scratch scratch; + struct ggml_scratch scratch_save; + + ggml_context(): + mem_size(0), + mem_buffer(0), + mem_buffer_owned(0), + no_alloc(0), + no_alloc_save(0), + n_objects(0), + objects_begin(0), + objects_end(0), + scratch(), + scratch_save() + { + + } +}; + +struct ggml_context_container { + bool used; + + struct ggml_context context; + + ggml_context_container(): used(0),context(){ + + } +}; + +typedef double ggml_float; +typedef void * thread_ret_t; + +#define MAX_FREE_BLOCKS 256 + +struct free_block { + void * addr; + size_t size; +}; + +struct ggml_tallocr { + struct ggml_backend_buffer * buffer; + bool buffer_owned; + void * base; + size_t alignment; + + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; + + size_t max_size; + + bool measure; + +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; +#endif +}; + + +struct hash_node { + int n_children; + int n_views; +}; + +typedef struct ggml_tallocr * ggml_tallocr_t; +typedef struct ggml_gallocr * ggml_gallocr_t; + +struct 
ggml_gallocr { + ggml_tallocr_t talloc; + struct ggml_hash_set hash_set; + struct hash_node * hash_values; + size_t hash_values_size; + ggml_tallocr_t * hash_allocs; + int * parse_seq; + int parse_seq_len; +}; + +struct ggml_allocr { + ggml_tallocr_t talloc; + ggml_gallocr_t galloc; +}; + +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 + +struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_numa_nodes { + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system +}; + +struct ggml_state { + struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; + struct ggml_numa_nodes numa; + + ggml_state():contexts(), numa() + { + + } +}; + +struct gguf_str { + uint64_t n; // GGUFv2 + char * data; +}; + +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; +struct hash_map { + struct ggml_hash_set set; + struct ggml_tensor ** vals; +}; + +#if defined(_WIN32) +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; +#else +#include +using namespace std; +#endif + +struct ggml_compute_state_shared { + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; +}; +typedef pthread_t ggml_thread_t; +struct ggml_compute_state { + ggml_thread_t thrd; + int ith; + struct ggml_compute_state_shared * shared; +}; + +union gguf_value { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + uint64_t uint64; + int64_t int64; + double float64; + bool bool_; + + struct gguf_str str; + + struct gguf_array_T { + enum gguf_type type; + + uint64_t n; // GGUFv2 + void * data; + } arr; +}; + +struct ggml_lbfgs_iteration_data { + float alpha; + float ys; + float * s; + float * y; +}; + +struct gguf_kv { + struct gguf_str key; + + enum gguf_type type; + union gguf_value value; +}; + + + +struct gguf_header { + char magic[4]; + uint32_t version; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 +}; + +struct gguf_tensor_info { + struct gguf_str name; + + uint32_t n_dims; + uint64_t ne[GGML_MAX_DIMS]; + + enum ggml_type type; + + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` + + // for writing API + const void * data; + size_t size; +}; + +struct gguf_context { + struct gguf_header header; + + struct gguf_kv * kv; + struct gguf_tensor_info * infos; + + size_t alignment; + size_t offset; // offset of `data` from beginning of file + size_t size; // size of `data` in bytes + + //uint8_t * padding; + void * data; +}; + +struct gguf_buf { + void * data; + size_t size; + size_t offset; +}; + + +#include "ggml-backend-impl.h" diff --git a/ggml-mpi.c b/ggml-mpi.cpp similarity index 100% rename from ggml-mpi.c rename to ggml-mpi.cpp diff --git a/ggml-quants.c b/ggml-quants.cpp similarity index 93% rename from ggml-quants.c rename to ggml-quants.cpp index 
7285d5f7fbcc0..e49189394e4ac 100644 --- a/ggml-quants.c +++ b/ggml-quants.cpp @@ -5,7 +5,7 @@ #include #include #include - +#include #ifdef __ARM_NEON // if YCM cannot find , make a symbolic link to it, for example: @@ -425,7 +425,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif // reference implementation for deterministic creation of model files -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { +void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -462,11 +462,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict } } -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_0_reference(x, y, k); +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q4_0_reference(x, (block_q4_0*)y, k); } -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { +void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k) { const int qk = QK4_1; assert(k % qk == 0); @@ -503,11 +503,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict } } -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_1_reference(x, y, k); +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q4_1_reference(x, (block_q4_1*)y, k); } -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { +void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -551,11 +551,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict } } -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_0_reference(x, y, k); +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q5_0_reference(x, (block_q5_0*)y, k); } -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { +void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k) { const int qk = QK5_1; assert(k % qk == 0); @@ -599,12 +599,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict } } -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_1_reference(x, y, k); +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q5_1_reference(x, (block_q5_1*)y, k); } // reference implementation for deterministic creation of model files -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { +void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -629,12 +629,12 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict } } -void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0 * restrict y = vy; + block_q8_0 * 
GGML_RESTRICT y = (block_q8_0*)vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -818,7 +818,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { } // reference implementation for deterministic creation of model files -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { +void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k) { assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -853,11 +853,11 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict } } -void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; - block_q8_1 * restrict y = vy; + block_q8_1 * GGML_RESTRICT y = (block_q8_1*)vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1071,7 +1071,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { #endif } -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -1091,7 +1091,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int } } -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK4_1; assert(k % qk == 0); @@ -1112,7 +1112,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int } } -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -1138,7 +1138,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int } } -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK5_1; assert(k % qk == 0); @@ -1165,7 +1165,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int } } -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK8_0; assert(k % qk == 0); @@ -1195,7 +1195,7 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { +static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1259,7 +1259,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1318,7 +1318,7 @@ static float make_q3_quants(int n, int nmax, 
const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, int ntry, float alpha) { float min = x[0]; float max = x[0]; @@ -1361,8 +1361,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -1443,7 +1443,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f } #if QK_K == 256 -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -1455,7 +1455,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { +void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1532,7 +1532,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1578,15 +1578,15 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int } } -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { - quantize_row_q2_K_reference(x, vy, k); +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { + quantize_row_q2_K_reference(x, (block_q2_K*)vy, k); } -size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + block_q2_K * GGML_RESTRICT y = (block_q2_K *)dst + j/QK_K; quantize_row_q2_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q2_K)); @@ -1594,7 +1594,7 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n //========================= 3-bit (de)-quantization -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { +void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1708,7 +1708,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } #if QK_K == 256 
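[Note on the restrict -> GGML_RESTRICT rewrite running through these hunks: `restrict` is a C99 qualifier and not a keyword in C++, so the converted ggml-quants.cpp cannot use it directly. The macro's actual definition lives in the ggml headers and is not part of this hunk; a plausible shape, stated here as an assumption, is:

    #ifdef __cplusplus
    #define GGML_RESTRICT             // 'restrict' is not a keyword in C++
    #else
    #define GGML_RESTRICT restrict    // keep the C99 qualifier for C builds
    #endif

    // which keeps declarations such as
    //   void quantize_row_q4_0_reference(const float * GGML_RESTRICT x,
    //                                    block_q4_0  * GGML_RESTRICT y, int k);
    // valid under both compilers.
]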
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1722,8 +1722,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * GGML_RESTRICT q = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; uint8_t m = 1; memcpy(aux, x[i].scales, 12); @@ -1758,7 +1758,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } } #else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); assert(QK_K == 64); const int nb = k / QK_K; @@ -1767,8 +1767,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * GGML_RESTRICT q = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); @@ -1791,15 +1791,15 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } #endif -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { - quantize_row_q3_K_reference(x, vy, k); +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { + quantize_row_q3_K_reference(x, (block_q3_K*)vy, k); } -size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + block_q3_K * GGML_RESTRICT y = (block_q3_K *)dst + j/QK_K; quantize_row_q3_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q3_K)); @@ -1807,7 +1807,7 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { +void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1914,7 +1914,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1953,18 +1953,18 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int } } -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q4_K * restrict y = vy; + block_q4_K * GGML_RESTRICT y = (block_q4_K*)vy; quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t 
* restrict hist) { +size_t ggml_quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + block_q4_K * GGML_RESTRICT y = (block_q4_K *)dst + j/QK_K; quantize_row_q4_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q4_K)); @@ -1972,7 +1972,7 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { +void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2042,8 +2042,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -2090,8 +2090,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); for (int j = 0; j < 32; ++j) { @@ -2114,7 +2114,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2143,7 +2143,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } #else float d = GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict s = x[i].scales; + const int8_t * GGML_RESTRICT s = x[i].scales; for (int l = 0; l < 8; ++l) { y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 
0 : 16)); @@ -2159,18 +2159,18 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } } -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q5_K * restrict y = vy; + block_q5_K * GGML_RESTRICT y = (block_q5_K*)vy; quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + block_q5_K * GGML_RESTRICT y = (block_q5_K *)dst + j/QK_K; quantize_row_q5_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q5_K)); @@ -2178,7 +2178,7 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { +void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2228,8 +2228,8 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].ql; + uint8_t * GGML_RESTRICT qh = y[i].qh; #if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -2260,7 +2260,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2268,9 +2268,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict ql = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT ql = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT sc = x[i].scales; #if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { @@ -2307,9 +2307,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int } } -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q6_K * restrict y = vy; + block_q6_K * GGML_RESTRICT y = (block_q6_K*)vy; quantize_row_q6_K_reference(x, y, k); } @@ -2318,7 +2318,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + block_q6_K * GGML_RESTRICT y = (block_q6_K *)dst + j/QK_K; quantize_row_q6_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q6_K)); @@ -2326,7 +2326,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * //===================================== Q8_K ============================================== -void quantize_row_q8_K_reference(const float 
* restrict x, block_q8_K * restrict y, int k) { +void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2363,7 +2363,7 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2374,8 +2374,8 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int } } -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { - quantize_row_q8_K_reference(x, y, k); +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q8_K_reference(x, (block_q8_K*)y, k); } //===================================== Dot ptoducts ================================= @@ -2423,14 +2423,15 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { + //fprintf(stderr, "%s: n:%d s:%f vx:%p vy:%p\n", __func__, n,*s, vx, vy); const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q4_0 * GGML_RESTRICT x = (const block_q4_0*)vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2439,10 +2440,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2733,14 +2734,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, #endif } -void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_1_q8_1(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q4_1 * GGML_RESTRICT x = (const block_q4_1*)vx; + const block_q8_1 * GGML_RESTRICT y = (const block_q8_1*)vy; // TODO: add WASM SIMD #if defined(__ARM_NEON) @@ -2752,10 +2753,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i + 0]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q4_1 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q4_1 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i + 
0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[i + 1]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; @@ -2893,15 +2894,15 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_0_q8_0(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_0); - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q5_0 * GGML_RESTRICT x = (const block_q5_0*)vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2916,10 +2917,10 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q5_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q5_0 * GGML_RESTRICT x0 = &x[i]; + const block_q5_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3000,8 +3001,8 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q8_0 * restrict y0 = &y[i]; + const block_q5_0 * GGML_RESTRICT x0 = &x[i]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i]; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3199,15 +3200,15 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_1_q8_1(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_1); - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q5_1 * GGML_RESTRICT x = (const block_q5_1*)vx; + const block_q8_1 * GGML_RESTRICT y = (const block_q8_1*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3225,10 +3226,10 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q5_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q5_1 * GGML_RESTRICT x0 = &x[i]; + const block_q5_1 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i]; + const block_q8_1 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3314,8 +3315,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q8_1 * restrict y0 = &y[i]; + const block_q5_1 * GGML_RESTRICT x0 = &x[i]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s; @@ -3518,14 
+3519,14 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q8_0_q8_0(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q8_0 * GGML_RESTRICT x = (const block_q8_0*)vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3534,10 +3535,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const int8x16_t x0_0 = vld1q_s8(x0->qs); const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); @@ -3642,10 +3643,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri } #if QK_K == 256 -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * GGML_RESTRICT x = (const block_q2_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -3667,9 +3668,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; const uint8x16_t mins_and_scales = vld1q_u8(sc); const uint8x16_t scales = vandq_u8(mins_and_scales, m4); @@ -3746,8 +3747,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); @@ -3813,8 +3814,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // load mins and scales from block_q2_K.scales[QK_K/16] const __m128i mins_and_scales = _mm_loadu_si128((const 
__m128i*)x[i].scales); @@ -4035,10 +4036,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4061,9 +4062,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4114,8 +4115,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * GGML_RESTRICT db = (const uint8_t *)&ud; + const uint8_t * GGML_RESTRICT mb = (const uint8_t *)&um; float summs = 0; @@ -4126,10 +4127,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4166,8 +4167,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * GGML_RESTRICT db = (const uint8_t *)&ud; + const uint8_t * GGML_RESTRICT mb = (const uint8_t *)&um; float summs = 0; @@ -4178,10 +4179,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4227,9 +4228,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint32_t * GGML_RESTRICT 
sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4311,14 +4312,14 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * GGML_RESTRICT x = (const block_q3_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -4346,9 +4347,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -4454,8 +4455,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); @@ -4559,8 +4560,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales aux = (const uint32_t *)x[i].scales; @@ -4694,9 +4695,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(aux, x[i].scales, 12); utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); @@ -4806,11 +4807,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; @@ -4855,11 +4856,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const 
block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4947,8 +4948,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5018,8 +5019,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5098,8 +5099,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5173,10 +5174,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 8; ++l) { a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4); a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 
0 : 4); @@ -5213,11 +5214,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * GGML_RESTRICT x = (const block_q4_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -5262,8 +5263,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -5334,8 +5335,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri utmp[2] = uaux; utmp[0] &= kmask1; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -5393,8 +5394,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -5494,8 +5495,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vl = 32; @@ -5548,10 +5549,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); a += 32; @@ -5594,11 +5595,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif } #else -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -5618,14 +5619,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sum_mins = 0.f; 
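Throughout ggml-quants.cpp the C99 `restrict` qualifier is replaced by `GGML_RESTRICT`, and the `const void *` arguments gain explicit casts such as `(const block_q4_K *)vx`, because the file is renamed from ggml-quants.c and now compiles as C++, where `restrict` is not a keyword and implicit conversions from `void *` to typed pointers are ill-formed. A minimal sketch of how such a compatibility macro can be defined follows; the patch's actual definition (presumably the one-line change to ggml-impl.h listed in the diffstat) is not shown in these hunks, so treat the exact spelling as an assumption.

// Illustrative sketch only: pick a restrict spelling per language.
#ifdef __cplusplus
#define GGML_RESTRICT __restrict__   // GCC/Clang extension; standard C++ has no 'restrict'
#else
#define GGML_RESTRICT restrict       // C99/C11 keyword
#endif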
uint16_t aux16[2]; - const uint8_t * restrict scales = (const uint8_t *)aux16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)aux16; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint16_t * restrict a = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT a = (const uint16_t *)x[i].scales; aux16[0] = a[0] & 0x0f0f; aux16[1] = (a[0] >> 4) & 0x0f0f; @@ -5698,8 +5699,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m256i q4l = _mm256_and_si256(q4bits, m4); @@ -5744,8 +5745,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0); @@ -5778,16 +5779,16 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #elif defined __riscv_v_intrinsic uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5827,17 +5828,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri memset(sums, 0, 8*sizeof(float)); uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - uint8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + uint8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF; for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5861,11 +5862,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const 
block_q5_K * GGML_RESTRICT x = (const block_q5_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -5911,9 +5912,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -5976,8 +5977,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; #if QK_K == 256 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6065,8 +6066,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -6163,9 +6164,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri vl = 8; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; @@ -6249,11 +6250,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); @@ -6302,11 +6303,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -6328,9 +6329,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT 
q8 = y[i].qs; const uint8x8_t qhbits = vld1_u8(qh); @@ -6387,8 +6388,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6433,8 +6434,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6490,9 +6491,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); @@ -6560,10 +6561,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 32; ++l) { a[l+ 0] = q4[l] & 0xF; a[l+32] = q4[l] >> 4; @@ -6574,7 +6575,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri } const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; + const int8_t * GGML_RESTRICT sc = x[i].scales; for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; @@ -6591,11 +6592,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #if QK_K == 256 -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * GGML_RESTRICT x = (const block_q6_K *)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K *)vy; const int nb = n / QK_K; @@ -6618,11 +6619,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); @@ -6750,9 +6751,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const 
uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6830,9 +6831,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6942,11 +6943,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; size_t vl; @@ -7030,11 +7031,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; @@ -7067,11 +7068,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -7094,11 +7095,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; int32_t isum = 0; @@ -7157,9 +7158,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ 
-7214,9 +7215,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ -7281,11 +7282,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; int32_t isum = 0; @@ -7350,11 +7351,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 16; ++l) { a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; diff --git a/ggml-quants.h b/ggml-quants.h index 70c12c27465e8..2706e36ada7d3 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_ // Quantization -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); - -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); - -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); - -void 
quantize_row_q2_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k); +void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k); +void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k); +void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k); +void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k); +void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k); + +void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k); +void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k); +void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k); +void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k); +void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k); +void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k); + +void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k); + +void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k); // Dequantization -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); -//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); - -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict 
y, int k); -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); +void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k); +//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k); + +void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k); // Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); + +void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q6_K_q8_K(int n, float * 
__restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); diff --git a/ggml.c b/ggml.cpp similarity index 98% rename from ggml.c rename to ggml.cpp index f92292b39c635..f1a0e5358859a 100644 --- a/ggml.c +++ b/ggml.cpp @@ -38,6 +38,14 @@ #pragma warning(disable: 4996) #endif +// initializers for static data called in the ggml_init function +static size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {}; +static char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {}; + +void type_traits_init(); +void GGUF_TYPE_SIZE_init(); +void GGUF_TYPE_NAME_init(); + #if defined(_WIN32) #include @@ -86,7 +94,9 @@ static int sched_yield (void) { } #else #include -#include +//#include +#include +using namespace std; typedef void * thread_ret_t; @@ -96,6 +106,8 @@ typedef void * thread_ret_t; #endif +#include + #ifdef GGML_USE_CPU_HBM #include #endif @@ -409,37 +421,39 @@ int64_t ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); +static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y); +static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y); -static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { - [GGML_TYPE_I8] = { + +static ggml_type_traits_t type_traits[GGML_TYPE_COUNT]; +void type_traits_init(){ + type_traits[GGML_TYPE_I8] = { .type_name = "i8", .blck_size = 1, .type_size = sizeof(int8_t), .is_quantized = false, - }, - [GGML_TYPE_I16] = { + }; + type_traits[GGML_TYPE_I16] = { .type_name = "i16", .blck_size = 1, .type_size = sizeof(int16_t), .is_quantized = false, - }, - [GGML_TYPE_I32] = { + }; + type_traits[GGML_TYPE_I32] = { .type_name = "i32", .blck_size = 1, .type_size = sizeof(int32_t), .is_quantized = false, - }, - [GGML_TYPE_F32] = { + }; + type_traits[GGML_TYPE_F32] = { .type_name = "f32", .blck_size = 1, .type_size = sizeof(float), .is_quantized = false, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot_type = GGML_TYPE_F32, - }, - [GGML_TYPE_F16] = { + }; + type_traits[GGML_TYPE_F16] = { .type_name = "f16", .blck_size = 1, .type_size = sizeof(ggml_fp16_t), @@ -449,8 +463,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot_type = GGML_TYPE_F16, - }, - [GGML_TYPE_Q4_0] = { + }; + type_traits[GGML_TYPE_Q4_0] = { .type_name = "q4_0", .blck_size = QK4_0, .type_size = sizeof(block_q4_0), @@ -460,8 +474,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, .vec_dot = ggml_vec_dot_q4_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, - }, - [GGML_TYPE_Q4_1] = { + }; + type_traits[GGML_TYPE_Q4_1] = { .type_name = "q4_1", .blck_size = QK4_1, .type_size = sizeof(block_q4_1), @@ -471,8 +485,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, .vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, - }, - [4] = { // GGML_TYPE_Q4_2 + }; + type_traits[4] = { // GGML_TYPE_Q4_2 .type_name = "DEPRECATED", .blck_size = 0, .type_size = 0, @@ -482,8 +496,8 @@ static 
const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, - }, - [5] = { // GGML_TYPE_Q4_3 + }; + type_traits[5] = { // GGML_TYPE_Q4_3 .type_name = "DEPRECATED", .blck_size = 0, .type_size = 0, @@ -493,8 +507,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, - }, - [GGML_TYPE_Q5_0] = { + }; + type_traits[GGML_TYPE_Q5_0] = { .type_name = "q5_0", .blck_size = QK5_0, .type_size = sizeof(block_q5_0), @@ -504,8 +518,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, .vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, - }, - [GGML_TYPE_Q5_1] = { + }; + type_traits[GGML_TYPE_Q5_1] = { .type_name = "q5_1", .blck_size = QK5_1, .type_size = sizeof(block_q5_1), @@ -515,8 +529,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, .vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, - }, - [GGML_TYPE_Q8_0] = { + }; + type_traits[GGML_TYPE_Q8_0] = { .type_name = "q8_0", .blck_size = QK8_0, .type_size = sizeof(block_q8_0), @@ -526,8 +540,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, - }, - [GGML_TYPE_Q8_1] = { + }; + type_traits[GGML_TYPE_Q8_1] = { .type_name = "q8_1", .blck_size = QK8_1, .type_size = sizeof(block_q8_1), @@ -535,8 +549,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_1, .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, - }, - [GGML_TYPE_Q2_K] = { + }; + type_traits[GGML_TYPE_Q2_K] = { .type_name = "q2_K", .blck_size = QK_K, .type_size = sizeof(block_q2_K), @@ -546,8 +560,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, .vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q3_K] = { + }; + type_traits[GGML_TYPE_Q3_K] = { .type_name = "q3_K", .blck_size = QK_K, .type_size = sizeof(block_q3_K), @@ -557,8 +571,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, .vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q4_K] = { + }; + type_traits[GGML_TYPE_Q4_K] = { .type_name = "q4_K", .blck_size = QK_K, .type_size = sizeof(block_q4_K), @@ -568,8 +582,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q5_K] = { + }; + type_traits[GGML_TYPE_Q5_K] = { .type_name = "q5_K", .blck_size = QK_K, .type_size = sizeof(block_q5_K), @@ -579,8 +593,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, .vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q6_K] = { + }; + type_traits[GGML_TYPE_Q6_K] = { .type_name = "q6_K", .blck_size = QK_K, .type_size = sizeof(block_q6_K), @@ -590,15 +604,15 @@ static const ggml_type_traits_t 
type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, .vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q8_K] = { + }; + type_traits[GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, .type_size = sizeof(block_q8_K), .is_quantized = true, .from_float = quantize_row_q8_K, - } -}; + }; +} // For internal test use ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { @@ -1160,7 +1174,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) { #ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1197,7 +1211,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest *s = sumf; } -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { +static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y) { ggml_float sumf = 0.0; #if defined(GGML_SIMD) @@ -1235,10 +1249,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes -inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { +inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) { ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL]; for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); @@ -1288,7 +1302,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re } } -inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1320,10 +1334,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float } // xs and vs are byte strides of x and v -inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) { - const float * restrict x[GGML_VEC_MAD_UNROLL]; - const float * restrict v[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL]; for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { x[i] = (const float *) ((const char *) xv + i*xs); @@ -2175,18 +2189,26 
@@ static inline int ggml_up(int n, int m) { //////////////////////////////////////////////////////////////////////////////// -struct ggml_context * ggml_init(struct ggml_init_params params) { - // make this function thread safe - ggml_critical_section_start(); - - static bool is_first_call = true; - if (is_first_call) { - // initialize time system (required on Windows) - ggml_time_init(); +struct ggml_context * ggml_init(struct ggml_init_params params) { - // initialize GELU, Quick GELU, SILU and EXP F32 tables - { + // initialize the data in the arrays + type_traits_init(); + GGUF_TYPE_SIZE_init(); + GGUF_TYPE_NAME_init(); + + struct ggml_context * ctx = NULL; + static bool is_first_call = true; + // make this function thread safe + ggml_critical_section_start(); + + + if (is_first_call) { + // initialize time system (required on Windows) + ggml_time_init(); + + // initialize GELU, Quick GELU, SILU and EXP F32 tables + { const uint64_t t_start = ggml_time_us(); UNUSED(t_start); ggml_fp16_t ii; @@ -2238,7 +2260,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } // find non-used context in g_state - struct ggml_context * ctx = NULL; + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { if (!g_state.contexts[i].used) { @@ -2402,7 +2424,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml // align to GGML_MEM_ALIGN size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN); - char * const mem_buffer = ctx->mem_buffer; + char * const mem_buffer = (char*)ctx->mem_buffer; struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { @@ -2475,7 +2497,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( return NULL; } - data = (char * const) ctx->scratch.data + ctx->scratch.offs; + data = (void*)(((char *)ctx->scratch.data) + ctx->scratch.offs); ctx->scratch.offs += data_size; } else { @@ -2630,7 +2652,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { const int nc = tensor->ne[0]; const size_t n1 = tensor->nb[1]; - char * const data = tensor->data; + char * const data = (char*)tensor->data; switch (tensor->type) { case GGML_TYPE_I8: @@ -2682,7 +2704,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { const int nc = tensor->ne[0]; const size_t n1 = tensor->nb[1]; - char * const data = tensor->data; + char * const data = (char*)tensor->data; switch (tensor->type) { case GGML_TYPE_I8: @@ -3063,7 +3085,7 @@ struct ggml_tensor * ggml_view_tensor( struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { struct ggml_object * obj = ctx->objects_begin; - char * const mem_buffer = ctx->mem_buffer; + char * const mem_buffer = (char*)ctx->mem_buffer; while (obj != NULL) { if (obj->type == GGML_OBJECT_TENSOR) { @@ -3080,7 +3102,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE); obj = obj->next; - char * const mem_buffer = ctx->mem_buffer; + char * const mem_buffer = (char*)ctx->mem_buffer; while (obj != NULL) { if (obj->type == GGML_OBJECT_TENSOR) { @@ -3096,7 +3118,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { struct ggml_object * obj = ctx->objects_begin; - char * const mem_buffer = ctx->mem_buffer; + char * const mem_buffer = (char*)ctx->mem_buffer; while 
(obj != NULL) { if (obj->type == GGML_OBJECT_TENSOR) { @@ -3292,7 +3314,7 @@ static struct ggml_tensor * ggml_acc_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ACC; @@ -4145,7 +4167,7 @@ static struct ggml_tensor * ggml_set_impl( // make a view of the destination struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + int32_t params[] = { (int32_t)nb1,(int32_t) nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_SET; @@ -5402,7 +5424,7 @@ struct ggml_tensor * ggml_pool_2d( }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; + int32_t params[] = { op, k0, k1, s0, s1, (int32_t)p0, (int32_t)p1 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_2D; @@ -8262,7 +8284,7 @@ static void ggml_compute_forward_repeat_back_f32( GGML_ASSERT(nb00 == sizeof(float)); if (ggml_is_contiguous(dst)) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); } else { for (int k3 = 0; k3 < ne3; k3++) { for (int k2 = 0; k2 < ne2; k2++) { @@ -9390,6 +9412,7 @@ static void ggml_compute_forward_mul_mat( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -9492,7 +9515,7 @@ static void ggml_compute_forward_mul_mat( if (params->type == GGML_TASK_INIT) { if (src1->type != vec_dot_type) { - char * wdata = params->wdata; + char * wdata = (char*)params->wdata; const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); for (int64_t i13 = 0; i13 < ne13; ++i13) { @@ -9646,7 +9669,7 @@ static void ggml_compute_forward_out_prod_f32( return; } #endif - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); return; } @@ -9829,7 +9852,7 @@ static void ggml_compute_forward_out_prod_q_f32( // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (params->type == GGML_TASK_INIT) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); return; } @@ -11843,7 +11866,7 @@ static void ggml_compute_forward_pool_1d( struct ggml_tensor * dst) { const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; + enum ggml_op_pool op = (ggml_op_pool)opts[0]; const int k0 = opts[1]; const int s0 = opts[2]; const int p0 = opts[3]; @@ -11867,7 +11890,7 @@ static void ggml_compute_forward_pool_2d( } const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; + enum ggml_op_pool op = (ggml_op_pool)opts[0]; const int k0 = opts[1]; const int k1 = opts[2]; const int s0 = opts[3]; @@ -14098,7 +14121,7 @@ static struct ggml_hash_set ggml_hash_set_new(size_t size) { size = ggml_hash_size(size); struct ggml_hash_set result; result.size = size; - result.keys = malloc(sizeof(struct ggml_tensor *) * size); + result.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * size); memset(result.keys, 0, 
sizeof(struct ggml_tensor *) * size); return result; } @@ -14113,9 +14136,9 @@ struct hash_map { }; static struct hash_map * ggml_new_hash_map(size_t size) { - struct hash_map * result = malloc(sizeof(struct hash_map)); + struct hash_map * result = (hash_map*)malloc(sizeof(struct hash_map)); result->set = ggml_hash_set_new(size); - result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size); + result->vals = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * result->set.size); memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size); return result; } @@ -16034,7 +16057,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + struct ggml_compute_state * workers = (ggml_compute_state*)alloca(sizeof(struct ggml_compute_state)*n_threads); // create thread pool if (n_threads > 1) { @@ -16631,7 +16654,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { continue; } - GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0); + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name((ggml_op)i), (double) perf_total_per_op_us[i] / 1000.0); } GGML_PRINT("========================================\n"); @@ -16903,11 +16926,11 @@ static enum ggml_opt_result ggml_opt_adam( const int n_accum = MAX(1, params.n_gradient_accumulation); const float accum_norm = 1.0f / (float) n_accum; - float * g = opt->adam.g->data; // gradients - float * m = opt->adam.m->data; // first moment - float * v = opt->adam.v->data; // second moment + float * g = (float*)opt->adam.g->data; // gradients + float * m = (float*)opt->adam.m->data; // first moment + float * v = (float*)opt->adam.v->data; // second moment - float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values + float * pf = params.past > 0 ? 
(float *)opt->adam.pf->data : NULL; // past function values struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); @@ -17175,7 +17198,7 @@ static enum ggml_opt_result linesearch_backtracking( } else { // Armijo condition is satisfied if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { - return count; + return (ggml_opt_result)count; } ggml_vec_dot_f32(nx, &dg, g, d); @@ -17186,14 +17209,14 @@ static enum ggml_opt_result linesearch_backtracking( } else { if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { // regular Wolfe conditions - return count; + return (ggml_opt_result)count; } if(dg > -params->lbfgs.wolfe*dginit) { width = dec; } else { // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) - return count; + return (ggml_opt_result)count; } } } @@ -17258,13 +17281,13 @@ static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - float * x = opt->lbfgs.x->data; // current parameters - float * xp = opt->lbfgs.xp->data; // previous parameters - float * g = opt->lbfgs.g->data; // current gradient - float * gp = opt->lbfgs.gp->data; // previous gradient - float * d = opt->lbfgs.d->data; // search direction + float * x = (float*)opt->lbfgs.x->data; // current parameters + float * xp = (float*)opt->lbfgs.xp->data; // previous parameters + float * g = (float*)opt->lbfgs.g->data; // current gradient + float * gp = (float*)opt->lbfgs.gp->data; // previous gradient + float * d = (float*)opt->lbfgs.d->data; // search direction - float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values + float * pf = params.past > 0 ? 
(float*)opt->lbfgs.pf->data : NULL; // past function values const int n_accum = MAX(1, params.n_gradient_accumulation); const float accum_norm = 1.0f / (float) n_accum; @@ -17277,10 +17300,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_opt_get_params(np, ps, x); // the L-BFGS memory - float * lm_alpha = opt->lbfgs.lmal->data; - float * lm_ys = opt->lbfgs.lmys->data; - float * lm_s = opt->lbfgs.lms->data; - float * lm_y = opt->lbfgs.lmy->data; + float * lm_alpha = (float*)opt->lbfgs.lmal->data; + float * lm_ys = (float*)opt->lbfgs.lmys->data; + float * lm_s = (float*)opt->lbfgs.lms->data; + float * lm_y = (float*)opt->lbfgs.lmy->data; bool cancel = false; @@ -17377,7 +17400,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, x, xp); ggml_vec_cpy_f32(nx, g, gp); - return ls; + return (ggml_opt_result)ls; } opt->loss_after = fx; @@ -17564,7 +17587,7 @@ GGML_API void ggml_opt_init( opt->nx = nx; opt->just_initialized = true; if (opt->ctx == NULL) { - struct ggml_init_params ctx_opt_params; + struct ggml_init_params ctx_opt_params; if (opt->params.type == GGML_OPT_ADAM) { ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; if (opt->params.past > 0) { @@ -17718,7 +17741,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_0; for (int b = 0; b < n; b += k) { - block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; + block_q4_0 * GGML_RESTRICT y = (block_q4_0 *) dst + b/QK4_0; quantize_row_q4_0_reference(src + b, y, k); @@ -17741,7 +17764,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_1; for (int b = 0; b < n; b += k) { - block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; + block_q4_1 * GGML_RESTRICT y = (block_q4_1 *) dst + b/QK4_1; quantize_row_q4_1_reference(src + b, y, k); @@ -17764,7 +17787,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_0; for (int b = 0; b < n; b += k) { - block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; + block_q5_0 * GGML_RESTRICT y = (block_q5_0 *)dst + b/QK5_0; quantize_row_q5_0_reference(src + b, y, k); @@ -17794,7 +17817,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_1; for (int b = 0; b < n; b += k) { - block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; + block_q5_1 * GGML_RESTRICT y = (block_q5_1 *)dst + b/QK5_1; quantize_row_q5_1_reference(src + b, y, k); @@ -17824,7 +17847,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK8_0; for (int b = 0; b < n; b += k) { - block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; + block_q8_0 * GGML_RESTRICT y = (block_q8_0 *)dst + b/QK8_0; quantize_row_q8_0_reference(src + b, y, k); @@ -17928,37 +17951,39 @@ struct gguf_str { char * data; }; -static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = sizeof(uint8_t), - [GGUF_TYPE_INT8] = sizeof(int8_t), - [GGUF_TYPE_UINT16] = sizeof(uint16_t), - [GGUF_TYPE_INT16] = sizeof(int16_t), - [GGUF_TYPE_UINT32] = sizeof(uint32_t), - [GGUF_TYPE_INT32] = sizeof(int32_t), - [GGUF_TYPE_FLOAT32] = sizeof(float), - [GGUF_TYPE_BOOL] = sizeof(bool), - [GGUF_TYPE_STRING] = sizeof(struct gguf_str), - [GGUF_TYPE_UINT64] = sizeof(uint64_t), - [GGUF_TYPE_INT64] = sizeof(int64_t), - [GGUF_TYPE_FLOAT64] = sizeof(double), - [GGUF_TYPE_ARRAY] = 0, // undefined + +void GGUF_TYPE_SIZE_init() { + 
GGUF_TYPE_SIZE[GGUF_TYPE_UINT8] = sizeof(uint8_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT8] = sizeof(int8_t); + GGUF_TYPE_SIZE[GGUF_TYPE_UINT16] = sizeof(uint16_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT16] = sizeof(int16_t); + GGUF_TYPE_SIZE[GGUF_TYPE_UINT32] = sizeof(uint32_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT32] = sizeof(int32_t); + GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT32] = sizeof(float); + GGUF_TYPE_SIZE[GGUF_TYPE_BOOL] = sizeof(bool); + GGUF_TYPE_SIZE[GGUF_TYPE_STRING] = sizeof(struct gguf_str); + GGUF_TYPE_SIZE[GGUF_TYPE_UINT64] = sizeof(uint64_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT64] = sizeof(int64_t); + GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT64] = sizeof(double); + GGUF_TYPE_SIZE[GGUF_TYPE_ARRAY] = 0; // undefined }; static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); -static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = "u8", - [GGUF_TYPE_INT8] = "i8", - [GGUF_TYPE_UINT16] = "u16", - [GGUF_TYPE_INT16] = "i16", - [GGUF_TYPE_UINT32] = "u32", - [GGUF_TYPE_INT32] = "i32", - [GGUF_TYPE_FLOAT32] = "f32", - [GGUF_TYPE_BOOL] = "bool", - [GGUF_TYPE_STRING] = "str", - [GGUF_TYPE_ARRAY] = "arr", - [GGUF_TYPE_UINT64] = "u64", - [GGUF_TYPE_INT64] = "i64", - [GGUF_TYPE_FLOAT64] = "f64", + +void GGUF_TYPE_NAME_init(){ + GGUF_TYPE_NAME[GGUF_TYPE_UINT8] = "u8"; + GGUF_TYPE_NAME[GGUF_TYPE_INT8] = "i8"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT16] = "u16"; + GGUF_TYPE_NAME[GGUF_TYPE_INT16] = "i16"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT32] = "u32"; + GGUF_TYPE_NAME[GGUF_TYPE_INT32] = "i32"; + GGUF_TYPE_NAME[GGUF_TYPE_FLOAT32] = "f32"; + GGUF_TYPE_NAME[GGUF_TYPE_BOOL] = "bool"; + GGUF_TYPE_NAME[GGUF_TYPE_STRING] = "str"; + GGUF_TYPE_NAME[GGUF_TYPE_ARRAY] = "arr"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT64] = "u64"; + GGUF_TYPE_NAME[GGUF_TYPE_INT64] = "i64"; + GGUF_TYPE_NAME[GGUF_TYPE_FLOAT64] = "f64"; }; static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); @@ -18040,14 +18065,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { bool ok = true; - ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1); ok = ok && gguf_fread_el(file, p->data, p->n, offset); return ok; } struct gguf_context * gguf_init_empty(void) { - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; @@ -18092,7 +18117,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p bool ok = true; - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); // read the header { @@ -18124,7 +18149,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the kv pairs { - ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); + ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; @@ -18199,7 +18224,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the tensor infos { - ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); for 
(uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -18319,10 +18344,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // create the tensors for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { const int64_t ne[GGML_MAX_DIMS] = { - ctx->infos[i].ne[0], - ctx->infos[i].ne[1], - ctx->infos[i].ne[2], - ctx->infos[i].ne[3], + (int64_t)ctx->infos[i].ne[0], + (int64_t)ctx->infos[i].ne[1], + (int64_t)ctx->infos[i].ne[2], + (int64_t)ctx->infos[i].ne[3], }; struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); @@ -18603,7 +18628,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int n_kv = gguf_get_n_kv(ctx); - ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); ctx->kv[n_kv].key.n = strlen(key); ctx->kv[n_kv].key.data = strdup(key); ctx->header.n_kv++; @@ -18739,7 +18764,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { case GGUF_TYPE_ARRAY: { if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { - const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *)); for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; } @@ -18760,7 +18785,7 @@ void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { const int idx = ctx->header.n_tensors; - ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); ctx->infos[idx].name.n = strlen(tensor->name); ctx->infos[idx].name.data = strdup(tensor->name); diff --git a/ggml.h b/ggml.h index f2fce0f22d357..1d69be2b00347 100644 --- a/ggml.h +++ b/ggml.h @@ -285,8 +285,10 @@ GGML_UNUSED(prefix##3); #ifdef __cplusplus +#ifndef CPP_ONLY extern "C" { #endif +#endif #if defined(__ARM_NEON) && defined(__CUDACC__) typedef half ggml_fp16_t; @@ -1859,7 +1861,7 @@ extern "C" { int n_gradient_accumulation; // ADAM parameters - struct { + struct ggml_adam{ int n_iter; float sched; // schedule multiplier (fixed, decay or warmup) @@ -1875,7 +1877,7 @@ extern "C" { } adam; // LBFGS parameters - struct { + struct ggml_lbfgs{ int m; // number of corrections to approximate the inv. 
Hessian int n_iter; int max_linesearch; @@ -1902,7 +1904,7 @@ extern "C" { float loss_before; float loss_after; - struct { + struct ggml_grad{ struct ggml_tensor * g; // current gradient struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment @@ -1912,7 +1914,7 @@ extern "C" { int n_no_improvement; } adam; - struct { + struct ggml_params{ struct ggml_tensor * x; // current parameters struct ggml_tensor * xp; // previous parameters struct ggml_tensor * g; // current gradient @@ -2134,15 +2136,15 @@ extern "C" { #ifdef __cplusplus // restrict not standard in C++ -#define GGML_RESTRICT +#define GGML_RESTRICT #else -#define GGML_RESTRICT restrict +#define GGML_RESTRICT __restrict__ #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); - typedef struct { + typedef struct ggml_something{ const char * type_name; int blck_size; size_t type_size; @@ -2157,5 +2159,7 @@ extern "C" { GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); #ifdef __cplusplus +#ifndef CPP_ONLY } #endif +#endif diff --git a/llama-internal.hpp b/llama-internal.hpp new file mode 100644 index 0000000000000..33cf39e5d4f58 --- /dev/null +++ b/llama-internal.hpp @@ -0,0 +1,896 @@ +#include +#include +enum llm_arch { + LLM_ARCH_LLAMA, + LLM_ARCH_FALCON, + LLM_ARCH_BAICHUAN, + LLM_ARCH_GPT2, + LLM_ARCH_GPTJ, + LLM_ARCH_GPTNEOX, + LLM_ARCH_MPT, + LLM_ARCH_STARCODER, + LLM_ARCH_PERSIMMON, + LLM_ARCH_REFACT, + LLM_ARCH_BLOOM, + LLM_ARCH_STABLELM, + LLM_ARCH_UNKNOWN, +}; + +enum llm_kv { + LLM_KV_GENERAL_ARCHITECTURE, + LLM_KV_GENERAL_QUANTIZATION_VERSION, + LLM_KV_GENERAL_ALIGNMENT, + LLM_KV_GENERAL_NAME, + LLM_KV_GENERAL_AUTHOR, + LLM_KV_GENERAL_URL, + LLM_KV_GENERAL_DESCRIPTION, + LLM_KV_GENERAL_LICENSE, + LLM_KV_GENERAL_SOURCE_URL, + LLM_KV_GENERAL_SOURCE_HF_REPO, + + LLM_KV_CONTEXT_LENGTH, + LLM_KV_EMBEDDING_LENGTH, + LLM_KV_BLOCK_COUNT, + LLM_KV_FEED_FORWARD_LENGTH, + LLM_KV_USE_PARALLEL_RESIDUAL, + LLM_KV_TENSOR_DATA_LAYOUT, + + LLM_KV_ATTENTION_HEAD_COUNT, + LLM_KV_ATTENTION_HEAD_COUNT_KV, + LLM_KV_ATTENTION_MAX_ALIBI_BIAS, + LLM_KV_ATTENTION_CLAMP_KQV, + LLM_KV_ATTENTION_LAYERNORM_EPS, + LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + + LLM_KV_ROPE_DIMENSION_COUNT, + LLM_KV_ROPE_FREQ_BASE, + LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, + + LLM_KV_TOKENIZER_MODEL, + LLM_KV_TOKENIZER_LIST, + LLM_KV_TOKENIZER_TOKEN_TYPE, + LLM_KV_TOKENIZER_SCORES, + LLM_KV_TOKENIZER_MERGES, + LLM_KV_TOKENIZER_BOS_ID, + LLM_KV_TOKENIZER_EOS_ID, + LLM_KV_TOKENIZER_UNK_ID, + LLM_KV_TOKENIZER_SEP_ID, + LLM_KV_TOKENIZER_PAD_ID, + LLM_KV_TOKENIZER_ADD_BOS, + LLM_KV_TOKENIZER_ADD_EOS, + LLM_KV_TOKENIZER_HF_JSON, + LLM_KV_TOKENIZER_RWKV, +}; + +// available llama models +enum e_model { + MODEL_UNKNOWN, + MODEL_1B, + MODEL_3B, + MODEL_7B, + MODEL_8B, + MODEL_13B, + MODEL_15B, + MODEL_30B, + MODEL_34B, + MODEL_40B, + MODEL_65B, + MODEL_70B, +}; + +enum llama_fver { + GGUF_FILE_VERSION_V1 = 1, + GGUF_FILE_VERSION_V2 = 2, + GGUF_FILE_VERSION_V3 = 3, +}; + +struct LLM_KV { + LLM_KV(llm_arch arch) : arch(arch) {} + + llm_arch arch; + + std::string operator()(llm_kv kv) const; // moved to llama.cpp file + +}; + +enum llm_tensor { + 
LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_POS_EMBD, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, +}; + + +struct llama_cparams { + uint32_t n_ctx; // context size used during inference + uint32_t n_batch; + uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads_batch; // number of threads to use for batch processing + + float rope_freq_base; + float rope_freq_scale; + + uint32_t n_yarn_orig_ctx; + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. + float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; + + bool mul_mat_q; +}; + +struct llama_layer { + // normalization + struct ggml_tensor * attn_norm; + struct ggml_tensor * attn_norm_b; + struct ggml_tensor * attn_norm_2; + struct ggml_tensor * attn_norm_2_b; + struct ggml_tensor * attn_q_norm; + struct ggml_tensor * attn_q_norm_b; + struct ggml_tensor * attn_k_norm; + struct ggml_tensor * attn_k_norm_b; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + struct ggml_tensor * wqkv; + + // attention bias + struct ggml_tensor * bo; + struct ggml_tensor * bqkv; + + // normalization + struct ggml_tensor * ffn_norm; + struct ggml_tensor * ffn_norm_b; + + // ff + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 + + // ff bias + struct ggml_tensor * ffn_down_b; // b2 + struct ggml_tensor * ffn_up_b; // b3 +}; + +struct llama_kv_cell { + llama_pos pos = -1; + llama_pos delta = 0; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } +}; + +struct llama_buffer { + void * data = NULL; + size_t size = 0; + + // fallback to malloc / free + // useful in cases where CUDA can try to allocate PINNED memory + bool fallback = false; + + void resize(size_t n) ; + + + ~llama_buffer(); + +}; + +// ring-buffer of cached KV data +struct llama_kv_cache { + bool has_shift = false; + + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. llama_decode_internal also uses it, so it + // cannot be freely changed after a slot has been allocated. 
+ uint32_t head = 0; + uint32_t size = 0; + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + + struct ggml_tensor * k = NULL; + struct ggml_tensor * v = NULL; + + struct ggml_context * ctx = NULL; + + llama_buffer buf; + + ~llama_kv_cache() { + if (ctx) { + ggml_free(ctx); + } + +#ifdef GGML_USE_CUBLAS + if (ggml_cublas_loaded()) { + ggml_cuda_free_data(k); + ggml_cuda_free_data(v); + } +#endif + } +}; + +struct llama_vocab { + using id = int32_t; + using token = std::string; + using ttype = llama_token_type; + + struct token_data { + token text; + float score; + ttype type; + }; + + enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; + + std::unordered_map token_to_id; + std::vector id_to_token; + + std::unordered_map special_tokens_cache; + + std::map, int> bpe_ranks; + + // default LLaMA special tokens + id special_bos_id = 1; + id special_eos_id = 2; + id special_unk_id = 0; + id special_sep_id = -1; + id special_pad_id = -1; + + int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. + int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. + + id linefeed_id = 13; + id special_prefix_id = 32007; + id special_middle_id = 32009; + id special_suffix_id = 32008; + id special_eot_id = 32010; + + int find_bpe_rank(std::string token_left, std::string token_right) const { + GGML_ASSERT(token_left.find(" ") == std::string::npos); + GGML_ASSERT(token_left.find("\n") == std::string::npos); + GGML_ASSERT(token_right.find(" ") == std::string::npos); + GGML_ASSERT(token_right.find("\n") == std::string::npos); + + auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); + if (it == bpe_ranks.end()) { + return -1; + } + + return it->second; + } +}; + +struct llama_mmap { + void * addr; + size_t size; + + llama_mmap(const llama_mmap &) = delete; + + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false); + ~llama_mmap(); + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; +#else + static constexpr bool SUPPORTED = false; +#endif +}; + + +struct llama_hparams { + bool vocab_only; + uint32_t n_vocab; + uint32_t n_ctx_train; // context size the model was trained on + uint32_t n_embd; + uint32_t n_head; + uint32_t n_head_kv; + uint32_t n_layer; + uint32_t n_rot; + uint32_t n_ff; + + float f_norm_eps; + float f_norm_rms_eps; + + float rope_freq_base_train; + float rope_freq_scale_train; + uint32_t n_yarn_orig_ctx; + int8_t rope_scaling_type_train : 3; + bool rope_finetuned : 1; + + float f_clamp_kqv; + float f_max_alibi_bias; + + bool operator!=(const llama_hparams & other) const; + uint32_t n_gqa() const { + return n_head/n_head_kv; + } + + uint32_t n_embd_head() const { + return n_embd/n_head; + } + + uint32_t n_embd_gqa() const { + return n_embd/n_gqa(); + } +}; + +struct llama_mlock { + void * addr = NULL; + size_t size = 0; + bool failed_already = false; + llama_mlock() ; + + llama_mlock(const llama_mlock &) = delete; + ~llama_mlock(); + void init(void * ptr); + void grow_to(size_t target_size); +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + static size_t lock_granularity(); +#ifdef __APPLE__ +#define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. 
Also try increasing RLIMIT_MLOCK (ulimit -l).\n" +#else +#define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" +#endif + bool raw_lock(const void * addr, size_t size) const ; +#undef MLOCK_SUGGESTION + static void raw_unlock(void * addr, size_t size); +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + static size_t lock_granularity(); + bool raw_lock(void * ptr, size_t len) const ; + static void raw_unlock(void * ptr, size_t len); +#else + static constexpr bool SUPPORTED = false; + static size_t lock_granularity(); + bool raw_lock(const void * addr, size_t len) const; + static void raw_unlock(const void * addr, size_t len); +#endif +}; + + +struct llama_model { + e_model type = MODEL_UNKNOWN; + llm_arch arch = LLM_ARCH_UNKNOWN; + llama_ftype ftype = LLAMA_FTYPE_ALL_F32; + + std::string name = "n/a"; + + llama_hparams hparams = {}; + llama_vocab vocab; + + struct ggml_tensor * tok_embd; + struct ggml_tensor * pos_embd; + struct ggml_tensor * tok_norm; + struct ggml_tensor * tok_norm_b; + + struct ggml_tensor * output_norm; + struct ggml_tensor * output_norm_b; + struct ggml_tensor * output; + + std::vector layers; + + int n_gpu_layers; + + // gguf metadata + std::unordered_map gguf_kv; + + // context + struct ggml_context * ctx = NULL; + + // the model memory buffer + llama_buffer buf; + + // model memory mapped file + std::unique_ptr mapping; + + // objects representing data potentially being locked in memory + llama_mlock mlock_buf; + llama_mlock mlock_mmap; + + // for quantize-stats only + std::vector> tensors_by_name; + + int64_t t_load_us = 0; + int64_t t_start_us = 0; + + ~llama_model() { + if (ctx) { + ggml_free(ctx); + } + +#ifdef GGML_USE_CUBLAS + if (ggml_cublas_loaded()) { + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + ggml_cuda_free_data(tensors_by_name[i].second); + } + ggml_cuda_free_scratch(); + } +#endif + +#if defined(GGML_USE_CLBLAST) + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + ggml_cl_free_data(tensors_by_name[i].second); + } +#endif + } +}; + +struct llama_context { + llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} + ~llama_context(); + + llama_cparams cparams; + + const llama_model & model; + + // key + value cache for the self attention + struct llama_kv_cache kv_self; + + std::mt19937 rng; + + bool has_evaluated_once = false; + + int64_t t_start_us; + int64_t t_load_us; + int64_t t_sample_us = 0; + int64_t t_p_eval_us = 0; + int64_t t_eval_us = 0; + + int32_t n_sample = 0; // number of tokens sampled + int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) + int32_t n_eval = 0; // number of eval calls + + // decode output (2-dimensional array: [n_tokens][n_vocab]) + std::vector logits; + bool logits_all = false; + + // input embedding (1-dimensional array: [n_embd]) + std::vector embedding; + + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; + + // memory buffers used to evaluate the model + llama_buffer buf_compute; + + llama_buffer buf_alloc; + ggml_allocr * alloc = NULL; + +#ifdef GGML_USE_METAL + ggml_metal_context * ctx_metal = NULL; +#endif + +#ifdef GGML_USE_MPI + ggml_mpi_context * ctx_mpi = NULL; +#endif +}; + + +struct LLM_TN { + LLM_TN(llm_arch arch) ; + + llm_arch arch; + + std::string operator()(llm_tensor tensor) const; + + std::string operator()(llm_tensor tensor, const std::string & suffix) const ; + + std::string operator()(llm_tensor 
tensor, int bid) const ; + + std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ; + +}; + + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) ; + size_t tell() const; + void seek(size_t offset, int whence) const; + void read_raw(void * ptr, size_t len) const; + uint32_t read_u32() const; + void write_raw(const void * ptr, size_t len) const ; + void write_u32(std::uint32_t val) const; + ~llama_file(); + +}; + + +struct llama_state { + llama_state(); + // We save the log callback globally + ggml_log_callback log_callback; + void * log_callback_user_data = nullptr; +}; + + + +struct llama_model_loader { + int n_kv = 0; + int n_tensors = 0; + int n_created = 0; + + int64_t n_elements = 0; + size_t n_bytes = 0; + + bool use_mmap = false; + + llama_file file; + llama_ftype ftype; + llama_fver fver; + + std::unique_ptr mapping; + + struct gguf_context * ctx_gguf = NULL; + struct ggml_context * ctx_meta = NULL; + + llama_model_loader(const std::string & fname, bool use_mmap) ; + + ~llama_model_loader(); + + std::string get_arch_name() const; + + enum llm_arch get_arch() const ; + const char * get_tensor_name(int i) const; + + struct ggml_tensor * get_tensor_meta(int i) const; + + void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const; + + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) ; + + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend) ; + + void done_getting_tensors() const; + + size_t file_offset(const char * name) const; + + + void load_data_for(struct ggml_tensor * cur) const ; + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) ; +}; + +struct llama_data_context { + virtual void write(const void * src, size_t size) = 0; + virtual size_t get_size_written() = 0; + virtual ~llama_data_context() = default; +}; + +struct llama_data_buffer_context : llama_data_context { + uint8_t * ptr; + size_t size_written = 0; + llama_data_buffer_context(uint8_t * p) ; + void write(const void * src, size_t size) override ; + size_t get_size_written() override ; +}; + +struct llama_data_file_context : llama_data_context { + llama_file * file; + size_t size_written = 0; + llama_data_file_context(llama_file * f); + size_t get_size_written() override ; + void write(const void * src, size_t size); +}; + + +struct llama_beam { + std::vector tokens; + float p; // Cumulative beam probability (renormalized relative to all beams) + bool eob; // Initialize end-of-beam to false. Callback sets this to true. + // Sort beams by probability. In case of ties, prefer beams at eob. + bool operator<(const llama_beam & rhs) const ; + void shift_tokens(const size_t n) ; + llama_beam_view view() const; +}; + +// A struct for calculating logit-related info. 
+struct llama_logit_info { + const float * const logits; + const int n_vocab; + const float max_l; + const float normalizer; + struct sum_exp { + float max_l; + float operator()(float sum, float l) const { return sum + std::exp(l - max_l); } + }; + llama_logit_info(llama_context * ctx); + llama_token_data get_token_data(const llama_token token_id) const ; + std::vector top_k(size_t k) ; + float probability_from_logit(float logit) const ; +}; + + +struct llama_beam_search_data { + llama_context * ctx; + size_t n_beams; + int n_past; + int n_predict; + std::vector beams; + std::vector next_beams; + size_t common_prefix_length; + std::vector beam_views; + llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict); + void collapse_beams(const size_t beam_idx) ; + void fill_next_beams_by_top_probabilities(llama_beam & beam) ; + size_t find_common_prefix_length() ; + llama_beams_state get_beams_state(const bool last_call) ; + void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data); + static void renormalize_beam_probabilities(std::vector & beams) ; + size_t top_beam_index(); + void update_beams_from_beam_views(); +}; + +using llm_build_cb = std::function; + +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; + +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; + +struct llm_build_context { + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_batch & batch; + const llama_kv_cache & kv_self; + + const int64_t n_embd; + const int64_t n_layer; + const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t n_head; + const int64_t n_head_kv; + const int64_t n_embd_head; + const int64_t n_embd_gqa; + + const float freq_base; + const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; + const float norm_eps; + const float norm_rms_eps; + + const int32_t n_tokens; + const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t kv_head; // index of where we store new KV data in the cache + const int32_t n_orig_ctx; + + const bool do_rope_shift; + + const llm_build_cb & cb; + + llama_buffer & buf_compute; + + struct ggml_context * ctx0 = nullptr; + + // TODO: consider making the entire interface noexcept + llm_build_context( + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case); + + void init() ; + void free() ; + struct ggml_cgraph * build_llama() ; + struct ggml_cgraph * build_baichuan() ; + struct ggml_cgraph * build_falcon() ; + struct ggml_cgraph * build_starcoder() ; + struct ggml_cgraph * build_persimmon() ; + struct ggml_cgraph * build_refact() ; + struct ggml_cgraph * build_bloom() ; + struct ggml_cgraph * build_mpt() ; + struct ggml_cgraph * build_stablelm(); +}; + + +enum llm_offload_func_e { + OFFLOAD_FUNC_NOP, + OFFLOAD_FUNC, + OFFLOAD_FUNC_KQ, + OFFLOAD_FUNC_V, + OFFLOAD_FUNC_NR, + OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_OUT, +}; + +struct llm_offload_trie { + struct node { + ~node() ; + node * children[256] = { nullptr }; + llm_offload_func_e func = OFFLOAD_FUNC_NOP; + }; + node * root = nullptr; + llm_offload_trie(); + llm_offload_trie(const std::unordered_map & map) ; + ~llm_offload_trie(); + void 
add(const char * name, llm_offload_func_e func); + llm_offload_func_e find(const char * name) const; + +}; + +struct llm_symbol { + using index = int; + index prev; + index next; + const char * text; + size_t n; +}; + + +struct llm_bigram_spm { + struct comparator { + bool operator()(llm_bigram_spm & l, llm_bigram_spm & r); + }; + using queue_storage = std::vector; + using queue = std::priority_queue; + llm_symbol::index left; + llm_symbol::index right; + float score; + size_t size; +}; + +struct llm_tokenizer_spm { + llm_tokenizer_spm(const llama_vocab & vocab); + void tokenize(const std::string & text, std::vector & output); + + +private: + void resegment(llm_symbol & symbol, std::vector & output) ; + void try_add_bigram(int left, int right) ; + const llama_vocab & vocab; + + std::vector symbols; + llm_bigram_spm::queue work_queue; + + std::map> rev_merge; +}; + +// BPE tokenizer +// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] +// tried to simplify unicode stuff, so most likely does not work 100% correctly! + +// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused + +struct llm_bigram_bpe { + struct comparator { + bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const ; + }; + + using queue_storage = std::vector; + using queue = std::priority_queue; + llm_symbol::index left; + llm_symbol::index right; + std::string text; + int rank; + size_t size; +}; + +struct llm_tokenizer_bpe { + llm_tokenizer_bpe(const llama_vocab & vocab); + + void tokenize(const std::string & text, std::vector & output); + +private: + void add_new_bigram(int left, int right) ; + + std::vector bpe_gpt2_preprocess(const std::string & text) ; + + const llama_vocab & vocab; + + std::vector symbols; + std::vector symbols_final; + + llm_bigram_bpe::queue work_queue; +}; + +typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{ + FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, + FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT +} FRAGMENT_BUFFER_VARIANT_TYPE; + +struct fragment_buffer_variant{ + fragment_buffer_variant(llama_vocab::id _token); + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length); + const FRAGMENT_BUFFER_VARIANT_TYPE type; + const llama_vocab::id token; + const std::string _dummy; + const std::string & raw_text; + const uint64_t offset; + const uint64_t length; +}; + +struct llama_partial_utf8 { + uint32_t value; // bit value so far (unshifted) + int n_remain; // num bytes remaining; -1 indicates invalid sequence +}; + +struct llama_grammar { + const std::vector> rules; + std::vector> stacks; + + // buffer for partially generated UTF-8 sequence from accepted tokens + llama_partial_utf8 partial_utf8; +}; + +struct llama_grammar_candidate { + size_t index; + const uint32_t * code_points; + llama_partial_utf8 partial_utf8; +}; + +struct quantize_state_internal { + const llama_model & model; + const llama_model_quantize_params * params; + + int n_attention_wv = 0; + int n_feed_forward_w2 = 0; + int i_attention_wv = 0; + int i_feed_forward_w2 = 0; + + int n_k_quantized = 0; + int n_fallback = 0; + + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) + : model(model) + , params(params) + {} +}; diff --git a/llama.h b/llama.h index 1a62058d1406b..9a1e7d04e050a 100644 --- a/llama.h +++ b/llama.h @@ -50,7 +50,9 @@ #endif #ifdef __cplusplus +#ifndef CPP_ONLY extern "C" { +#endif #endif // @@ -827,8 +829,10 @@ extern "C" { LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, 
const struct llama_context * ctx); #ifdef __cplusplus +#ifndef CPP_ONLY } #endif +#endif // Internal API to be implemented by llama.cpp and used by tests/benchmarks only #ifdef LLAMA_API_INTERNAL @@ -844,4 +848,8 @@ const std::vector> & llama_internal #endif // LLAMA_API_INTERNAL + + #endif // LLAMA_H + + diff --git a/print.hpp b/print.hpp new file mode 100644 index 0000000000000..6f0dc6bf5fa6c --- /dev/null +++ b/print.hpp @@ -0,0 +1,763 @@ +#include +#include +#include "llama.h" +#include "ggml-internal.hpp" +#include "llama-internal.hpp" + +REFL_TYPE(ggml_init_params ) +REFL_END + +// we use the named data type patch +#define ggml_opt_params_names +#ifdef ggml_opt_params_names +REFL_TYPE(ggml_opt_params::ggml_adam) +REFL_END + +REFL_TYPE(ggml_opt_params::ggml_lbfgs) +REFL_END + + +REFL_TYPE(ggml_opt_context::ggml_grad ) +REFL_END +#endif +REFL_TYPE(gpt_params ) + +REFL_FIELD( seed ) +REFL_FIELD( n_threads) +REFL_FIELD( n_threads_batch) +REFL_FIELD( n_predict ) +REFL_FIELD( n_ctx ) +REFL_FIELD( n_batch) +REFL_FIELD( n_keep ) +REFL_FIELD( n_draft) +REFL_FIELD( n_chunks ) +REFL_FIELD( n_parallel) +REFL_FIELD( n_sequences) +REFL_FIELD( p_accept ) +REFL_FIELD( p_split ) +REFL_FIELD( n_gpu_layers) +REFL_FIELD( n_gpu_layers_draft) +REFL_FIELD( main_gpu ) +REFL_FIELD( tensor_split) +REFL_FIELD( n_beams ) +REFL_FIELD(rope_freq_base) +REFL_FIELD( rope_freq_scale ) +REFL_FIELD( yarn_ext_factor ) +REFL_FIELD( yarn_attn_factor ) +REFL_FIELD( yarn_beta_fast ) +REFL_FIELD( yarn_beta_slow ) +REFL_FIELD( yarn_orig_ctx) +REFL_FIELD( rope_scaling_type) +REFL_FIELD( sparams) +REFL_FIELD(model ) +REFL_FIELD(model_draft ) +REFL_FIELD(model_alias) +REFL_FIELD(prompt ) +REFL_FIELD(prompt_file ) +REFL_FIELD(path_prompt_cache ) +REFL_FIELD(input_prefix ) +REFL_FIELD(input_suffix ) +REFL_FIELD( antiprompt) +REFL_FIELD(logdir ) +REFL_FIELD( lora_adapter) +REFL_FIELD(lora_base ) +REFL_FIELD( ppl_stride ) +REFL_FIELD( ppl_output_type ) +REFL_FIELD( hellaswag ) +REFL_FIELD( hellaswag_tasks ) +REFL_FIELD( mul_mat_q ) +REFL_FIELD( memory_f16) +REFL_FIELD( random_prompt ) +REFL_FIELD( use_color ) +REFL_FIELD( interactive ) +REFL_FIELD( chatml ) +REFL_FIELD( prompt_cache_all ) +REFL_FIELD( prompt_cache_ro ) +REFL_FIELD( embedding ) +REFL_FIELD( escape ) +REFL_FIELD( interactive_first ) +REFL_FIELD( multiline_input ) +REFL_FIELD( simple_io ) +REFL_FIELD( cont_batching ) +REFL_FIELD( input_prefix_bos ) +REFL_FIELD( ignore_eos ) +REFL_FIELD( instruct ) +REFL_FIELD( logits_all ) +REFL_FIELD( use_mmap) +REFL_FIELD( use_mlock ) +REFL_FIELD( numa ) +REFL_FIELD( verbose_prompt ) +REFL_FIELD( infill ) +REFL_FIELD(mmproj ) +REFL_FIELD( image) + +REFL_END + +REFL_TYPE(llama_sampling_params) +REFL_END + +REFL_TYPE(llm_arch) +REFL_END + +REFL_TYPE(llama_sampling_context ) +REFL_FIELD( params) +REFL_FIELD( mirostat_mu) +REFL_FIELD( grammar) +REFL_FIELD( parsed_grammar) +//REFL_FIELD( prev) // TODO fixme has null data +//REFL_FIELD( cur) +REFL_END + +REFL_TYPE(llama_token_data ) +REFL_END + + +REFL_TYPE(llama_token_data_array ) +REFL_END + +REFL_TYPE(llama_batch ) +REFL_END + + +REFL_TYPE(ggml_object) + REFL_FIELD(offs) +REFL_END + +REFL_TYPE(ggml_tensor) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(ggml_cplan) + REFL_FIELD(work_size) +REFL_END + +REFL_TYPE(ggml_hash_set) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(ggml_cgraph) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(ggml_scratch) + REFL_FIELD(offs) +REFL_END + +REFL_TYPE(ggml_compute_params) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(ggml_opt_params) + REFL_FIELD(type) +REFL_END + 
+REFL_TYPE(ggml_opt_context) + REFL_FIELD(ctx) +REFL_END + +REFL_TYPE(gguf_init_params) +REFL_END + +REFL_TYPE(ggml_something) + REFL_FIELD(type_name) +REFL_END + +REFL_TYPE(ggml_context) + REFL_FIELD(mem_size) +REFL_FIELD(mem_buffer) +REFL_FIELD(mem_buffer_owned) +REFL_FIELD( no_alloc) +REFL_FIELD( no_alloc_save) +REFL_FIELD( n_objects) +REFL_FIELD( objects_begin) +REFL_FIELD( objects_end) +REFL_FIELD( scratch) +REFL_FIELD( scratch_save) + +REFL_END + +REFL_TYPE(ggml_context_container) + REFL_FIELD(used) + REFL_FIELD(context) +REFL_END + + REFL_TYPE(ggml_numa_node) + REFL_FIELD(cpus) + REFL_FIELD(n_cpus) + REFL_END + + REFL_TYPE(ggml_numa_nodes) + REFL_FIELD(nodes) + REFL_FIELD(n_nodes) + REFL_END + + REFL_TYPE(ggml_state) + REFL_FIELD(contexts) + REFL_FIELD(numa) + REFL_END + + REFL_TYPE(gguf_str) + REFL_FIELD(n) + REFL_FIELD(data) + REFL_END + + REFL_TYPE(ggml_map_custom1_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) + REFL_END + +REFL_TYPE(ggml_map_custom2_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) +REFL_END + +REFL_TYPE(ggml_map_custom3_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) +REFL_END + +REFL_TYPE(hash_map) + REFL_FIELD(set) + REFL_FIELD(vals) +REFL_END +REFL_TYPE(ggml_compute_state_shared) + REFL_FIELD(cgraph) + REFL_FIELD(cplan) +REFL_END +REFL_TYPE(ggml_compute_state) + REFL_FIELD(thrd) + REFL_FIELD(ith) +REFL_END +REFL_TYPE(ggml_lbfgs_iteration_data) + REFL_FIELD(alpha) + REFL_FIELD(ys) +REFL_END + +REFL_TYPE(gguf_kv) + REFL_FIELD(key) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(gguf_header) + REFL_FIELD(magic) + REFL_FIELD(version) +REFL_END + +REFL_TYPE(gguf_tensor_info) + REFL_FIELD(name) + REFL_FIELD(n_dims) +REFL_END + +REFL_TYPE(gguf_context) + REFL_FIELD(header) + REFL_FIELD(kv) +REFL_END + +REFL_TYPE(gguf_buf) + REFL_FIELD(data) + REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_model_params) + REFL_FIELD(n_gpu_layers) +REFL_END +REFL_TYPE(llama_context_params) + REFL_FIELD(seed) +REFL_END +REFL_TYPE(llama_model_quantize_params) + REFL_FIELD(nthread) +REFL_END + +REFL_TYPE(llama_grammar_element) +REFL_END + +REFL_TYPE(llama_timings) + REFL_FIELD(t_start_ms) +REFL_END +REFL_TYPE(llama_beam_view) + REFL_FIELD(tokens) +REFL_END + +REFL_TYPE(llama_beams_state) + REFL_FIELD(beam_views) +REFL_END + +REFL_TYPE(ggml_backend) +REFL_END + +REFL_TYPE(ggml_backend_buffer) +REFL_END + +REFL_TYPE(ggml_allocr) +REFL_END + +REFL_TYPE(ggml_tallocr) +REFL_END + +REFL_TYPE(ggml_gallocr) +REFL_END + + +REFL_TYPE(llama_buffer) +REFL_FIELD(data) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_file) +REFL_FIELD(fp) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_mmap) +REFL_FIELD(addr) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_mlock) + REFL_FIELD(addr) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(llama_state) + REFL_FIELD(log_callback) + REFL_FIELD(log_callback_user_data) + REFL_END + + +REFL_TYPE(llama_hparams) + REFL_FIELD(vocab_only) + REFL_FIELD(n_vocab) + REFL_END + + +REFL_TYPE(llama_cparams) + REFL_FIELD(n_ctx) + REFL_FIELD(n_batch) +REFL_END + +REFL_TYPE(llama_layer) + REFL_FIELD(attn_norm) + REFL_FIELD(attn_norm_b) +REFL_END + +REFL_TYPE(llama_kv_cell) + REFL_FIELD(pos) + REFL_FIELD(delta) +REFL_END + +REFL_TYPE(llama_kv_cache) + REFL_FIELD(has_shift) + REFL_FIELD(head) + REFL_END + +REFL_TYPE(e_model) +REFL_END + +REFL_TYPE(llama_ftype) +REFL_END + +REFL_TYPE(llama_model) + REFL_FIELD(type) + REFL_FIELD(arch) +REFL_FIELD(ftype ) + +REFL_FIELD( name ) + + REFL_FIELD( hparams ) +REFL_FIELD( vocab) + +REFL_FIELD( tok_embd) +REFL_FIELD( pos_embd) +REFL_FIELD( 
tok_norm) +REFL_FIELD( tok_norm_b) + +REFL_FIELD( output_norm) +REFL_FIELD( output_norm_b) +REFL_FIELD( output) + +REFL_FIELD( layers) + +REFL_FIELD( n_gpu_layers) + + REFL_FIELD( gguf_kv) //unordered map + REFL_FIELD( ctx) + REFL_FIELD( buf) + REFL_FIELD( mapping) //std::unique_ptr +REFL_FIELD( mlock_buf) +REFL_FIELD( mlock_mmap) +REFL_FIELD( tensors_by_name) + REFL_FIELD( t_load_us) +REFL_FIELD( t_start_us) + +REFL_END + +REFL_TYPE(llama_vocab) + REFL_END + + REFL_TYPE(grammar_parser::parse_state) + REFL_END + +REFL_TYPE(llama_context) +REFL_FIELD( cparams) +//REFL_FIELD(model) +REFL_FIELD(kv_self) + REFL_FIELD(rng) //random numbers +REFL_FIELD(has_evaluated_once ) +REFL_FIELD( t_start_us) +REFL_FIELD( t_load_us) + REFL_FIELD( t_sample_us ) +REFL_FIELD( t_p_eval_us ) + REFL_FIELD( t_eval_us) +REFL_FIELD( n_sample ) +REFL_FIELD( n_p_eval ) + REFL_FIELD( n_eval ) +//REFL_FIELD( logits) crash +REFL_FIELD( logits_all ) +REFL_FIELD( embedding) +//REFL_FIELD( work_buffer) + REFL_FIELD( buf_compute) + REFL_FIELD( buf_alloc) +REFL_FIELD( alloc ) + +#ifdef GGML_USE_METAL +REFL_FIELD( ctx_metal ) +#endif + +#ifdef GGML_USE_MPI +REFL_FIELD( ctx_mpi ) + +#endif +REFL_END + +REFL_TYPE(llama_model_loader) + REFL_FIELD(n_kv) + REFL_FIELD(n_tensors) +REFL_END + +REFL_TYPE(llm_build_context) +// REFL_FIELD(model) cannot create pointer to reference member ‘llm_build_context::model’ +// REFL_FIELD(hparams) cannot create pointer to reference member ‘llm_build_context::hparams’ +REFL_END + +REFL_TYPE(llm_offload_trie) +REFL_END + +REFL_TYPE(llm_symbol) + REFL_FIELD(prev) +REFL_END + +REFL_TYPE(llm_bigram_spm) +REFL_END + +REFL_TYPE(llm_tokenizer_spm) +REFL_END + +REFL_TYPE(llm_bigram_bpe) +REFL_END + +REFL_TYPE(llm_tokenizer_bpe) +REFL_END + + +REFL_TYPE(fragment_buffer_variant) +REFL_END + + +REFL_TYPE(llama_partial_utf8) + REFL_FIELD(value) + REFL_FIELD(n_remain) +REFL_END + + +REFL_TYPE(llama_grammar) + REFL_FIELD(rules) + REFL_FIELD(stacks) +REFL_END + + +REFL_TYPE(llama_grammar_candidate) + REFL_FIELD(index) + REFL_FIELD(code_points) +REFL_END + + +REFL_TYPE(llama_beam) + REFL_FIELD(tokens) + REFL_FIELD(p) +REFL_END + + +REFL_TYPE(llama_logit_info) +// REFL_FIELD(logits) + REFL_FIELD(n_vocab) +REFL_END + +REFL_TYPE(llama_beam_search_data) + REFL_FIELD(ctx) + REFL_FIELD(n_beams) +REFL_END + + +REFL_TYPE(quantize_state_internal) +// REFL_FIELD(model) + REFL_FIELD(params) +REFL_FIELD( n_attention_wv ) +REFL_FIELD( n_feed_forward_w2 ) + REFL_FIELD( i_attention_wv ) + REFL_FIELD( i_feed_forward_w2 ) +REFL_FIELD( n_k_quantized ) +REFL_FIELD( n_fallback ) + +REFL_END + +REFL_TYPE(llama_data_context) +REFL_END + +REFL_TYPE(llama_data_buffer_context) + REFL_FIELD(ptr) +REFL_END + +REFL_TYPE(llama_data_file_context) + REFL_FIELD(file) +REFL_END + +template +constexpr auto get_value_type_name(const T t) noexcept +{ + return t.value_type; +} + +namespace runtime2 + { + using namespace refl; + using namespace refl::descriptor; + template + void debug(std::basic_ostream& os, const T& value, bool compact = false); + + namespace detail + { + template &>() << std::declval())> + std::true_type is_ostream_printable_test(int); + + template + std::false_type is_ostream_printable_test(...); + + template + constexpr bool is_ostream_printable_v{ decltype(is_ostream_printable_test(0))::value }; + + namespace + { + [[maybe_unused]] int next_depth(int depth) + { + return depth == -1 || depth > 8 + ? 
-1 + : depth + 1; + } + } + + template + void indent(std::basic_ostream& os, int depth) + { + for (int i = 0; i < depth; i++) { + os << " "; + } + } + + template + void debug_impl(std::basic_ostream& os, const T& value, [[maybe_unused]] int depth); + + template + void debug_detailed(std::basic_ostream& os, const T& value, int depth) + { + + using type_descriptor = type_descriptor; + bool compact = depth == -1; + // print type with members enclosed in braces + os << type_descriptor::name << " { "; + if (!compact) os << '\n'; + + constexpr auto readable_members = filter(type_descriptor::members, [](auto member) { return is_readable(member); }); + for_each(readable_members, [&](auto member, [[maybe_unused]] auto index) { + int new_depth = next_depth(depth); + + indent(os, new_depth); + os << get_display_name(member) << " = "; + + if constexpr (util::contains_instance(member.attributes)) { + // use the debug attribute to print + auto debug_attr = util::get_instance(member.attributes); + debug_attr.write(os, value); + } + else { + debug_impl(os, member(value), new_depth); + } + + if (!compact || index + 1 != readable_members.size) { + os << ", "; + } + if (!compact) { + indent(os, depth); + os << '\n'; + } + }); + + if (compact) os << ' '; + indent(os, depth); + os << '}'; + } + + template + void debug_reflectable(std::basic_ostream& os, const T& value, [[maybe_unused]] int depth) + { + using type_descriptor = type_descriptor; + if constexpr (trait::contains_instance_v) { + // use the debug attribute to print + auto debug_attr = util::get_instance(type_descriptor::attributes); + debug_attr.write(os, value); + } + else if constexpr (detail::is_ostream_printable_v) { + // type supports printing natively, just use that + + os << value; + + } + else { + debug_detailed(os, value, depth); + } + } + + template + void debug_container(std::basic_ostream& os, const T& value, int depth) + { + bool compact = depth == -1; + os << "["; + + auto end = value.end(); + for (auto it = value.begin(); it != end; ++it) + { + if (!compact) os << '\n'; + int new_depth = next_depth(depth); + indent(os, new_depth); + + debug_impl(os, *it, new_depth); + if (std::next(it, 1) != end) { + os << ", "; + } + else if (!compact) { + os << '\n'; + } + + } + + indent(os, depth); + os << "]"; + } + + template + void debug_impl(std::basic_ostream& os, const T& value, [[maybe_unused]] int depth) + { + using no_pointer_t = std::remove_pointer_t; + + if constexpr (std::is_same_v) { + os << (value ? "true" : "false"); + } + else if constexpr (std::is_pointer_v && !std::is_void_v && trait::is_reflectable_v) { + if (value == nullptr) { + os << "nullptr"; + } + else { + os << '&'; + debug_impl(os, *value, -1); + } + } + else if constexpr (trait::is_reflectable_v) { + debug_reflectable(os, value, depth); + } + else if constexpr (detail::is_ostream_printable_v) { + os << value; + } + else if constexpr (trait::is_container_v) { + debug_container(os, value, depth); + } + else { + os << "(not printable)"; + } + } + } + + /** + * Writes the debug representation of value to the given std::ostream. + * Calls the function specified by the debug attribute whenever possible, + * before falling back to recursively interating the members and printing them. + * Takes an optional arguments specifying whether to print a compact representation. + * The compact representation contains no newlines. 
+ */ + template + void debug(std::basic_ostream& os, const T& value, [[maybe_unused]] bool compact) + { + static_assert(trait::is_reflectable_v || trait::is_container_v || detail::is_ostream_printable_v, + "Type is not reflectable, not a container of reflectable types and does not support operator<<(std::ostream&, T)!"); + + detail::debug_impl(os, value, compact ? -1 : 0); + } + + /** + * Writes the compact debug representation of the provided values to the given std::ostream. + */ + template + void debug_all(std::basic_ostream& os, const Ts&... values) + { + refl::runtime::debug(os, std::forward_as_tuple(static_cast(values)...), true); + } + + /** + * Writes the debug representation of the provided value to an std::string and returns it. + * Takes an optional arguments specifying whether to print a compact representation. + * The compact representation contains no newlines. + */ + template + std::basic_string debug_str(const T& value, bool compact = false) + { + std::basic_stringstream ss; + debug(ss, value, compact); + return ss.str(); + } + + /** + * Writes the compact debug representation of the provided values to an std::string and returns it. + */ + template + std::basic_string debug_all_str(const Ts&... values) + { + return refl::runtime::debug_str(std::forward_as_tuple(static_cast(values)...), true); + } +} + +// // A generic function to print out the fields of any object +template +void print_fields(const T& t) { + runtime2::debug(std::cout, t); + constexpr auto type = refl::reflect(); + + constexpr auto membertype = refl::member_list(); + + constexpr auto members = get_members(type); + std::cout << "DEBUG Type: " << type.name.c_str() << "\n"; + std::cout << "DEBUG Type2: " << typeid(membertype).name() << "\n"; + std::cout << "DEBUG Type3: " << typeid(members).name() << "\n"; + refl::util::for_each(members, [&](auto member) { + //using member_t = decltype(member::value_type); + //typename type3 = member::value_type; + //typename trait::remove_qualifiers_t::value_type>; + //constexpr auto type2 = refl::reflect(type3); + //std::cout << "Auto:" << foo <<"\n"; + std::cout << "Auto:" << member.name <<"\n"; + //std::cout << "DEBUG Type2: " << typeid(member_t).name() << "\n"; + //std::cout << "DEBUG Type2: " << type2.name.c_str() << "\n"; + }); + std::cout << "\n"; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c8b4bc254f4c6..28f6254630010 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW llama_build_and_test_executable(test-rope.cpp) # dummy executable - not installed -get_filename_component(TEST_TARGET test-c.c NAME_WE) -add_executable(${TEST_TARGET} test-c.c) +get_filename_component(TEST_TARGET test-c.cpp NAME_WE) +add_executable(${TEST_TARGET} test-c.cpp) target_link_libraries(${TEST_TARGET} PRIVATE llama) diff --git a/tests/test-c.c b/tests/test-c.cpp similarity index 100% rename from tests/test-c.c rename to tests/test-c.cpp
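
Note on the table-initialization hunks above: the patch converts ggml's designated-initializer tables (`type_traits`, `GGUF_TYPE_SIZE`, `GGUF_TYPE_NAME`) into `*_init()` functions that `ggml_init()` calls, because C-style designated array initializers are not valid in C++ (and the tables can no longer stay `const`, so they are only populated after `ggml_init()` runs). The following is a minimal sketch of that pattern with hypothetical names, not the real ggml symbols:

```cpp
// Sketch of the pattern used by type_traits_init()/GGUF_TYPE_SIZE_init():
// a table that C filled with designated initializers is instead populated
// at runtime by an init function. All names here are illustrative.
#include <cstddef>

enum example_type { EX_U8, EX_I8, EX_U16, EX_TYPE_COUNT };

// In C this was roughly:
//   static const size_t EX_SIZE[EX_TYPE_COUNT] = { [EX_U8] = 1, ... };
static size_t EX_SIZE[EX_TYPE_COUNT];

static void EX_SIZE_init() {
    EX_SIZE[EX_U8]  = sizeof(unsigned char);
    EX_SIZE[EX_I8]  = sizeof(signed char);
    EX_SIZE[EX_U16] = sizeof(unsigned short);
}

// Called from the library entry point, mirroring how ggml_init() now calls
// the init functions before any other work; the guard keeps it idempotent.
void example_init() {
    static bool initialized = false;
    if (!initialized) {
        EX_SIZE_init();
        initialized = true;
    }
}
```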
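
Note on print.hpp: it registers the newly named ggml/llama structs with refl-cpp's `REFL_TYPE`/`REFL_FIELD`/`REFL_END` macros and dumps them through `print_fields()` and the `runtime2::debug` helpers. A rough, self-contained sketch of that usage follows; `probe_example` and its fields are placeholders, and it assumes refl-cpp's `refl.hpp` is on the include path:

```cpp
// Sketch of the refl-cpp mechanism behind print.hpp / print_fields().
// "probe_example" is a hypothetical stand-in for the registered structs.
#include <iostream>
#include <refl.hpp>   // refl-cpp single-header reflection library

struct probe_example {
    int   n_ctx;
    float rope_freq_base;
};

REFL_TYPE(probe_example)
    REFL_FIELD(n_ctx)
    REFL_FIELD(rope_freq_base)
REFL_END

template <typename T>
void dump_fields(const T & value) {
    // iterate the compile-time member list, as print_fields() does
    refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
        std::cout << member.name.c_str() << " = " << member(value) << "\n";
    });
}

int main() {
    dump_fields(probe_example{512, 10000.0f});
}
```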