diff --git a/.gitignore b/.gitignore index 41259a12f50cb..f8a2a2dae5902 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,6 @@ tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe +/#llama.cpp# +#* +\\#* diff --git a/CMakeLists.txt b/CMakeLists.txt index f32df5fe52335..839aad003ca32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,7 +104,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" # Compile flags # -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) @@ -230,7 +230,12 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) - add_compile_definitions(GGML_USE_OPENBLAS) + + # from https://github.com/NVIDIA/cutlass + make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp") + set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags}) + + # add_compile_definitions(GGML_USE_OPENBLAS) if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) add_compile_definitions(GGML_BLAS_USE_MKL) endif() @@ -312,7 +317,7 @@ if (LLAMA_MPI) if (MPI_C_FOUND) message(STATUS "MPI found") set(GGML_HEADERS_MPI ggml-mpi.h) - set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h) + set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h) add_compile_definitions(GGML_USE_MPI) add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) if (NOT MSVC) @@ -438,6 +443,9 @@ if (NOT cuda_host_flags STREQUAL "") set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags}) endif() +# +set(cuda_flags --verbose -G ${cuda_flags}) + add_compile_options("$<$:${cuda_flags}>") if (WIN32) @@ -485,8 +493,10 @@ if (NOT MSVC) add_link_options(-static-libgcc -static-libstdc++) endif() endif() + add_link_options("-Wl,-Map=${TARGET}.map") + if (LLAMA_GPROF) - add_compile_options(-pg) + add_compile_options(-pg) endif() endif() @@ -645,13 +655,16 @@ if (GGML_USE_CPU_HBM) endif() add_library(ggml OBJECT - ggml.c + ggml.cpp ggml.h - ggml-alloc.c + print.hpp + ggml-internal.hpp + llama-internal.hpp + ggml-alloc.cpp ggml-alloc.h - ggml-backend.c + ggml-backend.cpp ggml-backend.h - ggml-quants.c + ggml-quants.cpp ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} @@ -683,7 +696,7 @@ add_library(llama ) target_include_directories(llama PUBLIC .) -target_compile_features(llama PUBLIC cxx_std_11) # don't bump +target_compile_features(llama PUBLIC cxx_std_20) # don't bump target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS} diff --git a/Makefile b/Makefile index a6d2c2ec0f380..3fe2af3d8beef 100644 --- a/Makefile +++ b/Makefile @@ -116,7 +116,7 @@ endif # keep standard at C11 and C++11 MK_CPPFLAGS = -I. -Icommon MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++11 -fPIC +MK_CXXFLAGS = -std=c++20 -fPIC -fpermissive -DCPP_ONLY # -Ofast tends to produce faster code, but may not be available for some compilers. 
ifdef LLAMA_FAST @@ -502,7 +502,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h endif # LLAMA_METAL ifdef LLAMA_MPI -ggml-mpi.o: ggml-mpi.c ggml-mpi.h +ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI @@ -537,17 +537,17 @@ $(info ) # Build library # -ggml.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml.o: ggml.cpp ggml.h ggml-cuda.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h + $(CXX) $(CXXFLAGS) -c $< -o $@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o @@ -734,5 +734,5 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-c.o: tests/test-c.c llama.h - $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ +tests/test-c.o: tests/test-c.cpp llama.h + $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ diff --git a/binding.py b/binding.py new file mode 100644 index 0000000000000..217dce6846340 --- /dev/null +++ b/binding.py @@ -0,0 +1,337 @@ +import os +import json +import re +import clang.cindex + +# configurable part + +CLANG_VERSION='13.0.1' +# homebrew installs for llvm (brew info llvm gives details): +# x64: /usr/local/opt/llvm/lib +# arm64: /opt/homebrew/opt/llvm/lib +llvmLibPath = "/usr/lib/llvm-15/lib/" + +cxxClientRoot = "/home/mdupont/experiments/llama.cpp/" + +fileList = [ +# "ggml.cpp", +# "llama.cpp", + "examples/server/server.cpp", +] + +typeList = [ +] + +# end of configurable part + +clang.cindex.Config.set_library_path(llvmLibPath) + + +def list_headers_in_dir(path): + # enumerates a folder but keeps the full pathing for the files returned + # and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx) + + # list all the files in the folder + files = os.listdir(path) + # only include .hxx files + files = list(filter(lambda x: x.endswith('.hxx'), files)) + # add the folder path back on + files = list(map(lambda x: path + x, files)) + return files + + +# parse through the list of files specified and expand wildcards +fullFileList = [] +for filePath in fileList: + if "*" in filePath: + # wildcard path + basePath = filePath[:-1] + if "*" in basePath: + # if there is still a wildcard, we have an issue... + raise NotImplementedError( + "wildcard only supported at end of file path") + files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath)) + fullFileList = fullFileList + files + else: + # normal path + ff = os.path.join(cxxClientRoot, filePath) + fullFileList.append(ff) + print("DBUG",ff) +# exclude _json.hxx files +fullFileList = list( + filter(lambda x: not x.endswith('_json.hxx'), fullFileList)) +# exclude _fmt.hxx files +fullFileList = list( + filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList)) + + +# generate a list of regexps from the type list (for handling wildcards) +typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList)) + + +def is_included_type(name, with_durability=False): + + # TODO(brett19): This should be generalized somehow... 
+ if "is_compound_operation" in name: + return False + + if "replica_context" in name: + return False + + if with_durability is True and '_with_legacy_durability' not in name: + return False + + for x in typeListRe: + if re.fullmatch(x, name): + return True + return False + + +opTypes = [] +opEnums = [] + + +def parse_type(type): + typeStr = type.get_canonical().spelling + return parse_type_str(typeStr) + +std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"] + +def parse_type_str(typeStr): + if typeStr == "std::mutex": + return {"name": "std::mutex"} + if typeStr == "std::string": + return {"name": "std::string"} + if typeStr == "std::chrono::duration": + return {"name": "std::chrono::seconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::milliseconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::microseconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::nanoseconds"} + if typeStr == "std::error_code": + return {"name": "std::error_code"} + if typeStr == "std::monostate": + return {"name": "std::monostate"} + if typeStr == "std::byte": + return {"name": "std::byte"} + if typeStr == "unsigned long": + return {"name": "std::size_t"} + if typeStr == "char": + return {"name": "std::int8_t"} + if typeStr == "unsigned char": + return {"name": "std::uint8_t"} + if typeStr == "short": + return {"name": "std::int16_t"} + if typeStr == "unsigned short": + return {"name": "std::uint16_t"} + if typeStr == "int": + return {"name": "std::int32_t"} + if typeStr == "unsigned int": + return {"name": "std::uint32_t"} + if typeStr == "long long": + return {"name": "std::int64_t"} + if typeStr == "unsigned long long": + return {"name": "std::uint64_t"} + if typeStr == "bool": + return {"name": "std::bool"} + if typeStr == "float": + return {"name": "std::float"} + if typeStr == "double": + return {"name": "std::double"} + if typeStr == "std::nullptr_t": + return {"name": "std::nullptr_t"} + if typeStr in std_comparators: + return {"name": typeStr} + + tplParts = typeStr.split("<", 1) + if len(tplParts) > 1: + tplClassName = tplParts[0] + tplParams = tplParts[1][:-1] + if tplClassName == "std::function": + return { + "name": "std::function" + } + if tplClassName == "std::optional": + return { + "name": "std::optional", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::vector": + return { + "name": "std::vector", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::set": + return { + "name": "std::set", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::variant": + variantParts = tplParams.split(", ") + variantTypes = [] + for variantPart in variantParts: + variantTypes.append(parse_type_str(variantPart)) + return { + "name": "std::variant", + "of": variantTypes + } + if tplClassName == "std::array": + variantParts = tplParams.split(", ") + if len(variantParts) != 2: + print("FAILED TO PARSE ARRAY TYPES: " + typeStr) + return {"name": "unknown", "str": typeStr} + return { + "name": "std::array", + "of": parse_type_str(variantParts[0]), + "size": int(variantParts[1]) + } + if tplClassName == "std::map": + variantParts = tplParams.split(", ") + if len(variantParts) < 2 or len(variantParts) > 3: + print("FAILED TO PARSE MAP TYPES: " + typeStr) + return {"name": "unknown", "str": typeStr} + + if len(variantParts) == 2: + return { + "name": "std::map", + "of": parse_type_str(variantParts[0]), + "to": parse_type_str(variantParts[1]) + } + else: + 
return { + "name": "std::map", + "of": parse_type_str(variantParts[0]), + "to": parse_type_str(variantParts[1]), + "comparator": parse_type_str(variantParts[2]) + } + + if tplClassName == "std::shared_ptr": + return { + "name": "std::shared_ptr", + "of": parse_type_str(tplParams) + } + + #return {"name": "unknown", "str": typeStr} + + if 'unnamed struct' in typeStr: + print("WARNING: Found unnamed struct: " + typeStr) + + return {"name": typeStr} + +internal_structs = [] +UNNAMED_STRUCT_DELIM = '::(unnamed struct' + +def traverse(node, namespace, main_file): + # only scan the elements of the file we parsed + + + if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL: + fullStructName = "::".join([*namespace, node.displayname]) + print("#FILE", node.location.file ) + print("REFL_TYPE(" + fullStructName + ")") + + structFields = [] + for child in node.get_children(): + if child.kind == clang.cindex.CursorKind.FIELD_DECL: + struct_type = parse_type(child.type) + type_str = child.type.get_canonical().spelling + print(" REFL_FIELD(" + child.displayname + ")") + if 'unnamed' in type_str: + name_tokens = type_str.split('::') + name_override = '::'.join(name_tokens[:-1] + [child.displayname]) + struct_type['name'] = name_override + internal_structs.append(name_override) + + structFields.append({ + "name": child.displayname, + "type": struct_type, + }) + # replica read changes introduced duplicate get requests + #if any(map(lambda op: op['name'] == fullStructName, opTypes)): + # return + + #opTypes.append({ + # "name": fullStructName, + # "fields": structFields, + #}) + print("REFL_END") + + + if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL: + fullStructName = "::".join([*namespace, node.displayname]) + if is_included_type(fullStructName, with_durability=True): + type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None) + if type_ref: + base_request_name = type_ref.displayname.replace('struct', '').strip() + base_request = next((op for op in opTypes if op['name'] == base_request_name), None) + if base_request: + new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level'] + new_fields.extend([ + {"name":"persist_to", "type":{"name":"couchbase::persist_to"}}, + {"name":"replicate_to", "type":{"name":"couchbase::replicate_to"}} + ]) + + opTypes.append({ + "name": fullStructName, + "fields": new_fields + }) + if node.kind == clang.cindex.CursorKind.ENUM_DECL: + fullEnumName = "::".join([*namespace, node.displayname]) + if is_included_type(fullEnumName): + enumValues = [] + + for child in node.get_children(): + if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL: + enumValues.append({ + "name": child.displayname, + "value": child.enum_value, + }) + opEnums.append({ + "name": fullEnumName, + "type": parse_type(node.enum_type), + "values": enumValues, + }) + + if node.kind == clang.cindex.CursorKind.NAMESPACE: + namespace = [*namespace, node.displayname] + if node.kind == clang.cindex.CursorKind.CLASS_DECL: + namespace = [*namespace, node.displayname] + if node.kind == clang.cindex.CursorKind.STRUCT_DECL: + namespace = [*namespace, node.displayname] + + for child in node.get_children(): + traverse(child, namespace, main_file) + +for headerPath in fullFileList: + print("processing " + headerPath) + index = clang.cindex.Index.create() + args = [ + '-std=c++17', + ] + + try: + translation_unit = index.parse(headerPath, args=args) + except Exception as e: + print(e) + import pdb 
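For reference, binding.py's traverse() only prints text: for every struct or class it visits it emits a refl-cpp registration block. A trimmed illustration of what it produces for the server_params struct that appears later in this diff (the same block that was then pasted into server.cpp, abridged here to two of the five fields):

    REFL_TYPE(server_params)
        REFL_FIELD(hostname)
        REFL_FIELD(port)
    REFL_END
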
+ pdb.set_trace() + raise e + + # output clang compiler diagnostics information (for debugging) + + for diagnostic in translation_unit.diagnostics: + diagnosticMsg = diagnostic.format() + print(diagnostic) + + traverse(translation_unit.cursor, [], headerPath) + +jsonData = json.dumps({ + 'op_structs': opTypes, + 'op_enums': opEnums +}) + +f = open("bindings.json", "w") +f.write(jsonData) +f.close() diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 31ec8cade19be..18d2d03c09196 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -31,6 +31,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#include "print.hpp" + static llama_context ** g_ctx; static llama_model ** g_model; static gpt_params * g_params; @@ -99,6 +101,7 @@ static void sigint_handler(int signo) { } } #endif +using namespace refl; int main(int argc, char ** argv) { gpt_params params; @@ -117,7 +120,8 @@ int main(int argc, char ** argv) { // TODO: Dump params ? //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); - + print_fields(params); + // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); @@ -234,6 +238,8 @@ int main(int argc, char ** argv) { std::vector embd_inp; + print_fields(*model); + if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); if (params.chatml) { @@ -277,7 +283,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } - + print_fields(*ctx); + //print_fields(session_tokens); // debug message about similarity of saved session, if applicable size_t n_matching_session_tokens = 0; if (!session_tokens.empty()) { @@ -365,6 +372,10 @@ int main(int argc, char ** argv) { for (int i = 0; i < (int) guidance_inp.size(); i++) { LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } + + print_fields(*ctx_guidance); + + } if (params.n_keep > 0) { @@ -473,7 +484,8 @@ int main(int argc, char ** argv) { std::vector embd_guidance; struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - + print_fields(*ctx_sampling); + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (!embd.empty()) { @@ -508,6 +520,7 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); + print_fields(*ctx); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); @@ -624,7 +637,7 @@ int main(int argc, char ** argv) { } const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - + //print_fields(id); llama_sampling_accept(ctx_sampling, ctx, id, true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 50f124b13e849..a42bba9b6d54f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -24,6 +24,7 @@ #include #include #include +#include "print.hpp" #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -33,6 +34,9 @@ using json = nlohmann::json; +REFL_TYPE(std::less< ::nlohmann::detail::value_t>) +REFL_END + struct server_params { std::string 
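print.hpp itself is not part of this diff, so the following is only a sketch of the shape such a helper could take on top of the REFL_TYPE/REFL_FIELD registrations; the actual header may differ. Assumed pieces: refl.hpp as the refl-cpp single header, and fields that are std::ostream-printable.

    #include <iostream>
    #include "refl.hpp"   // refl-cpp single header (assumed include name)

    template <typename T>
    void print_fields(const T & obj) {
        // walk every member registered via REFL_TYPE/REFL_FIELD/REFL_END
        refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
            if constexpr (refl::descriptor::is_field(member)) {
                // member(obj) reads the field; this sketch only handles streamable types
                std::cout << member.name << " = " << member(obj) << "\n";
            }
        });
    }
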
hostname = "127.0.0.1"; @@ -41,6 +45,13 @@ struct server_params int32_t read_timeout = 600; int32_t write_timeout = 600; }; +REFL_TYPE(server_params) + REFL_FIELD(hostname) + REFL_FIELD(public_path) + REFL_FIELD(port) + REFL_FIELD(read_timeout) + REFL_FIELD(write_timeout) +REFL_END static bool server_verbose = false; @@ -157,6 +168,15 @@ struct task_server { bool embedding_mode = false; }; +REFL_TYPE(task_server) + REFL_FIELD(id) + REFL_FIELD(target_id) + REFL_FIELD(type) + REFL_FIELD(data) + REFL_FIELD(infill_mode) + REFL_FIELD(embedding_mode) +REFL_END + struct task_result { int id; bool stop; @@ -193,6 +213,18 @@ struct slot_params json input_suffix; }; +REFL_TYPE(slot_params) + REFL_FIELD(stream) + REFL_FIELD(cache_prompt) + REFL_FIELD(seed) + REFL_FIELD(n_keep) + REFL_FIELD(n_predict) + REFL_FIELD(antiprompt) + REFL_FIELD(input_prefix) + REFL_FIELD(input_suffix) +REFL_END + + struct slot_image { int32_t id; @@ -220,6 +252,17 @@ struct completion_token_output std::string text_to_send; }; +REFL_TYPE(completion_token_output) + REFL_FIELD(probs) + REFL_FIELD(tok) + REFL_FIELD(text_to_send) +REFL_END + +REFL_TYPE(completion_token_output::token_prob) + REFL_FIELD(tok) + REFL_FIELD(prob) +REFL_END + static size_t common_part(const std::vector &a, const std::vector &b) { size_t i; @@ -496,6 +539,51 @@ struct llama_client_slot } }; +//REFL_TYPE(llama_client_slot::llama_sampling_params) +//REFL_END + +REFL_TYPE(llama_client_slot) + REFL_FIELD(id) + REFL_FIELD(task_id) + REFL_FIELD(params) + REFL_FIELD(state) + REFL_FIELD(command) + REFL_FIELD(t_last_used) + REFL_FIELD(n_ctx) + REFL_FIELD(n_past) + REFL_FIELD(n_decoded) + REFL_FIELD(n_remaining) + REFL_FIELD(i_batch) + REFL_FIELD(num_prompt_tokens) + REFL_FIELD(num_prompt_tokens_processed) + REFL_FIELD(multibyte_pending) + REFL_FIELD(prompt) + REFL_FIELD(generated_text) + REFL_FIELD(sampled) + REFL_FIELD(cache_tokens) + REFL_FIELD(generated_token_probs) + REFL_FIELD(infill) + REFL_FIELD(embedding) + REFL_FIELD(has_next_token) + REFL_FIELD(truncated) + REFL_FIELD(stopped_eos) + REFL_FIELD(stopped_word) + REFL_FIELD(stopped_limit) + REFL_FIELD(oaicompat) + REFL_FIELD(oaicompat_model) + REFL_FIELD(stopping_word) + REFL_FIELD(sparams) + REFL_FIELD(ctx_sampling) + REFL_FIELD(images) + REFL_FIELD(sent_count) + REFL_FIELD(sent_token_probs_index) + REFL_FIELD(t_start_process_prompt) + REFL_FIELD(t_start_genereration) + REFL_FIELD(t_prompt_processing) + REFL_FIELD(t_token_generation) +REFL_END + + struct llama_server_context { llama_model *model = nullptr; @@ -878,7 +966,7 @@ struct llama_server_context all_slots_are_idle = false; LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); - + print_fields(*slot); return true; } @@ -1787,6 +1875,31 @@ struct llama_server_context } }; +REFL_TYPE(llama_server_context) + REFL_FIELD(model) + REFL_FIELD(ctx) + REFL_FIELD(clp_ctx) + REFL_FIELD(params) + REFL_FIELD(batch) + REFL_FIELD(multimodal) + REFL_FIELD(clean_kv_cache) + REFL_FIELD(all_slots_are_idle) + REFL_FIELD(add_bos_token) + REFL_FIELD(id_gen) + REFL_FIELD(n_ctx) + REFL_FIELD(system_need_update) + REFL_FIELD(system_prompt) + REFL_FIELD(system_tokens) + REFL_FIELD(name_user) + REFL_FIELD(name_assistant) + REFL_FIELD(slots) + REFL_FIELD(queue_tasks) + REFL_FIELD(queue_results) + REFL_FIELD(mutex_tasks) + REFL_FIELD(mutex_results) +REFL_END + + static void server_print_usage(const char *argv0, const gpt_params ¶ms, const server_params &sparams) { @@ -2497,6 +2610,11 @@ struct token_translator std::string operator()(const 
completion_token_output &cto) const { return (*this)(cto.tok); } }; + +REFL_TYPE(token_translator) + REFL_FIELD(ctx) +REFL_END + static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot) { auto & gtps = slot->generated_token_probs; diff --git a/ggml-alloc.c b/ggml-alloc.cpp similarity index 98% rename from ggml-alloc.c rename to ggml-alloc.cpp index cdfe4caf69613..46f4c9bd73d45 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.cpp @@ -386,7 +386,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) { free(galloc->parse_seq); - galloc->parse_seq = malloc(sizeof(int) * n); + galloc->parse_seq = (int*)malloc(sizeof(int) * n); for (int i = 0; i < n; i++) { galloc->parse_seq[i] = list[i]; @@ -646,9 +646,9 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st if (galloc->hash_values != NULL) { free(galloc->hash_values); } - galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size); + galloc->hash_set.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * hash_size); galloc->hash_set.size = hash_size; - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size); } // reset hash table @@ -674,7 +674,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap // alloc hash_values if needed if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) { free(galloc->hash_values); - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size); galloc->hash_values_size = hash_size; } diff --git a/ggml-backend.c b/ggml-backend.cpp similarity index 96% rename from ggml-backend.c rename to ggml-backend.cpp index f6e5fceed0f4d..47b60cb1e284e 100644 --- a/ggml-backend.c +++ b/ggml-backend.cpp @@ -20,7 +20,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init( struct ggml_backend_buffer_i iface, ggml_backend_buffer_context_t context, size_t size) { - ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); + ggml_backend_buffer_t buffer = (ggml_backend_buffer*)malloc(sizeof(struct ggml_backend_buffer)); GGML_ASSERT(iface.get_base != NULL); @@ -195,9 +195,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst // TODO: allow backends to support copy to/from same backend if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) { - ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst); + ggml_get_backend(dst)->iface.cpy_tensor_from((ggml_backend_t)ggml_get_backend(dst)->context, src, dst); } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) { - ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst); + ggml_get_backend(src)->iface.cpy_tensor_to((ggml_backend_t)ggml_get_backend(src)->context, src, dst); } else { // shouldn't be hit when copying from/to CPU #ifndef NDEBUG @@ -316,13 +316,13 @@ struct ggml_backend_plan_cpu { static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); + struct ggml_backend_plan_cpu * cpu_plan = 
(ggml_backend_plan_cpu*)malloc(sizeof(struct ggml_backend_plan_cpu)); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); cpu_plan->cgraph = *cgraph; if (cpu_plan->cplan.work_size > 0) { - cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + cpu_plan->cplan.work_data = (uint8_t*)malloc(cpu_plan->cplan.work_size); } return cpu_plan; @@ -356,7 +356,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c cpu_ctx->work_size = cplan.work_size; } - cplan.work_data = cpu_ctx->work_data; + cplan.work_data = (uint8_t*)cpu_ctx->work_data; ggml_graph_compute(cgraph, &cplan); } @@ -385,13 +385,13 @@ static struct ggml_backend_i cpu_backend_i = { }; ggml_backend_t ggml_backend_cpu_init(void) { - struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + struct ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context*)malloc(sizeof(struct ggml_backend_cpu_context)); ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->work_data = NULL; ctx->work_size = 0; - ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + ggml_backend_t cpu_backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend)); *cpu_backend = (struct ggml_backend) { /* .interface = */ cpu_backend_i, @@ -869,7 +869,7 @@ static void sched_reset(ggml_backend_sched_t sched) { ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) { GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS); - struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched)); + struct ggml_backend_sched * sched = (ggml_backend_sched*)malloc(sizeof(struct ggml_backend_sched)); memset(sched, 0, sizeof(struct ggml_backend_sched)); fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024); @@ -907,9 +907,9 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr // initialize hash tables size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS; sched->hash_set.size = hash_size; - sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size); - sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size); - sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size); + sched->hash_set.keys = (ggml_tensor**)malloc(sizeof(sched->hash_set.keys[0]) * hash_size); + sched->node_talloc = (ggml_tallocr**)malloc(sizeof(sched->node_talloc[0]) * hash_size); + sched->node_copies = (ggml_tensor *(*)[4])malloc(sizeof(sched->node_copies[0]) * hash_size); sched_split_graph(sched, measure_graph); sched_alloc_splits(sched); diff --git a/ggml-impl.h b/ggml-impl.h index 06c07339e9269..1bf20a4af3985 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -22,7 +22,7 @@ extern "C" { #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #define static_assert(cond, msg) _Static_assert(cond, msg) #else -#define static_assert(cond, msg) struct global_scope_noop_trick + //#define static_assert(cond, msg) struct global_scope_noop_trick #endif #endif diff --git a/ggml-internal.hpp b/ggml-internal.hpp new file mode 100644 index 0000000000000..0725451fcbd3e --- /dev/null +++ b/ggml-internal.hpp @@ -0,0 +1,258 @@ +struct ggml_context { + size_t mem_size; + void * mem_buffer; + bool mem_buffer_owned; + bool no_alloc; + bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers + + int n_objects; + + struct ggml_object * objects_begin; + struct ggml_object * objects_end; + + struct ggml_scratch scratch; + struct 
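A side effect of renaming the .c files to .cpp: C++ has no implicit conversion from void * to other object pointer types, which is why every malloc() call in the renamed files gains an explicit cast while the allocation itself stays the same, e.g. from ggml-alloc.cpp above:

    // fine in C, rejected by a C++ compiler ("invalid conversion from 'void*' to 'hash_node*'"):
    //   galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);

    // what the diff does instead:
    galloc->hash_values = (hash_node *) malloc(sizeof(struct hash_node) * hash_size);
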
ggml_scratch scratch_save; + + ggml_context(): + mem_size(0), + mem_buffer(0), + mem_buffer_owned(0), + no_alloc(0), + no_alloc_save(0), + n_objects(0), + objects_begin(0), + objects_end(0), + scratch(), + scratch_save() + { + + } +}; + +struct ggml_context_container { + bool used; + + struct ggml_context context; + + ggml_context_container(): used(0),context(){ + + } +}; + +typedef double ggml_float; +typedef void * thread_ret_t; + +#define MAX_FREE_BLOCKS 256 + +struct free_block { + void * addr; + size_t size; +}; + +struct ggml_tallocr { + struct ggml_backend_buffer * buffer; + bool buffer_owned; + void * base; + size_t alignment; + + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; + + size_t max_size; + + bool measure; + +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; +#endif +}; + + +struct hash_node { + int n_children; + int n_views; +}; + +typedef struct ggml_tallocr * ggml_tallocr_t; +typedef struct ggml_gallocr * ggml_gallocr_t; + +struct ggml_gallocr { + ggml_tallocr_t talloc; + struct ggml_hash_set hash_set; + struct hash_node * hash_values; + size_t hash_values_size; + ggml_tallocr_t * hash_allocs; + int * parse_seq; + int parse_seq_len; +}; + +struct ggml_allocr { + ggml_tallocr_t talloc; + ggml_gallocr_t galloc; +}; + +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 + +struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_numa_nodes { + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system +}; + +struct ggml_state { + struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; + struct ggml_numa_nodes numa; + + ggml_state():contexts(), numa() + { + + } +}; + +struct gguf_str { + uint64_t n; // GGUFv2 + char * data; +}; + +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; +struct hash_map { + struct ggml_hash_set set; + struct ggml_tensor ** vals; +}; + +#if defined(_WIN32) +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; +#else +#include +using namespace std; +#endif + +struct ggml_compute_state_shared { + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; +}; +typedef pthread_t ggml_thread_t; +struct ggml_compute_state { + ggml_thread_t thrd; + int ith; + struct ggml_compute_state_shared * shared; +}; + +union gguf_value { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + uint64_t uint64; + int64_t int64; + double float64; + bool bool_; + + struct gguf_str str; + + struct gguf_array_T { + enum gguf_type type; + + uint64_t n; // GGUFv2 + void * data; + } arr; +}; + +struct ggml_lbfgs_iteration_data { + float alpha; + float ys; + float * s; + float * y; +}; + +struct gguf_kv { + struct gguf_str key; + + enum gguf_type type; + union gguf_value value; +}; + + + +struct gguf_header { + char 
magic[4]; + uint32_t version; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 +}; + +struct gguf_tensor_info { + struct gguf_str name; + + uint32_t n_dims; + uint64_t ne[GGML_MAX_DIMS]; + + enum ggml_type type; + + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` + + // for writing API + const void * data; + size_t size; +}; + +struct gguf_context { + struct gguf_header header; + + struct gguf_kv * kv; + struct gguf_tensor_info * infos; + + size_t alignment; + size_t offset; // offset of `data` from beginning of file + size_t size; // size of `data` in bytes + + //uint8_t * padding; + void * data; +}; + +struct gguf_buf { + void * data; + size_t size; + size_t offset; +}; + + +#include "ggml-backend-impl.h" diff --git a/ggml-mpi.c b/ggml-mpi.cpp similarity index 100% rename from ggml-mpi.c rename to ggml-mpi.cpp diff --git a/ggml-quants.c b/ggml-quants.cpp similarity index 93% rename from ggml-quants.c rename to ggml-quants.cpp index 7285d5f7fbcc0..e49189394e4ac 100644 --- a/ggml-quants.c +++ b/ggml-quants.cpp @@ -5,7 +5,7 @@ #include #include #include - +#include #ifdef __ARM_NEON // if YCM cannot find , make a symbolic link to it, for example: @@ -425,7 +425,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif // reference implementation for deterministic creation of model files -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { +void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -462,11 +462,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict } } -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_0_reference(x, y, k); +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q4_0_reference(x, (block_q4_0*)y, k); } -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { +void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k) { const int qk = QK4_1; assert(k % qk == 0); @@ -503,11 +503,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict } } -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_1_reference(x, y, k); +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q4_1_reference(x, (block_q4_1*)y, k); } -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { +void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -551,11 +551,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict } } -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_0_reference(x, y, k); +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q5_0_reference(x, (block_q5_0*)y, k); } -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { +void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k) { const int qk = QK5_1; assert(k % qk == 0); @@ -599,12 +599,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * 
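The gguf_header struct moved into ggml-internal.hpp mirrors the fixed preamble of a GGUF file: 4-byte magic "GGUF", then version, tensor count and key/value count. A minimal reading sketch, assuming the gguf_header from ggml-internal.hpp above is in scope and the file is little-endian:

    #include <cstdio>

    // Reads the GGUF preamble field by field, avoiding any dependence on struct padding.
    static bool read_gguf_header(FILE * f, gguf_header & h) {
        return fread( h.magic,     1, sizeof(h.magic),     f) == sizeof(h.magic)      // "GGUF"
            && fread(&h.version,   1, sizeof(h.version),   f) == sizeof(h.version)
            && fread(&h.n_tensors, 1, sizeof(h.n_tensors), f) == sizeof(h.n_tensors)
            && fread(&h.n_kv,      1, sizeof(h.n_kv),      f) == sizeof(h.n_kv);
    }
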
restrict } } -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_1_reference(x, y, k); +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q5_1_reference(x, (block_q5_1*)y, k); } // reference implementation for deterministic creation of model files -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { +void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -629,12 +629,12 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict } } -void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0 * restrict y = vy; + block_q8_0 * GGML_RESTRICT y = (block_q8_0*)vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -818,7 +818,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { } // reference implementation for deterministic creation of model files -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { +void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k) { assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -853,11 +853,11 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict } } -void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; - block_q8_1 * restrict y = vy; + block_q8_1 * GGML_RESTRICT y = (block_q8_1*)vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1071,7 +1071,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { #endif } -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -1091,7 +1091,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int } } -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK4_1; assert(k % qk == 0); @@ -1112,7 +1112,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int } } -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -1138,7 +1138,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int } } -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK5_1; assert(k % qk == 0); @@ -1165,7 +1165,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int } } -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q8_0(const 
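restrict is a C-only keyword, so compiling ggml-quants as C++ requires the GGML_RESTRICT spelling used throughout this file. The macro's definition is not shown in this diff; a typical definition (an assumption, not necessarily what this branch uses) would be:

    // Assumed definition: map to the compiler extension under C++ (GCC/Clang spell it
    // __restrict__, MSVC __restrict), keep the standard keyword under C. Expanding to
    // nothing would also compile, at the cost of losing the aliasing hint.
    #ifdef __cplusplus
    #    define GGML_RESTRICT __restrict__
    #else
    #    define GGML_RESTRICT restrict
    #endif
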
block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK8_0; assert(k % qk == 0); @@ -1195,7 +1195,7 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { +static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1259,7 +1259,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1318,7 +1318,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, int ntry, float alpha) { float min = x[0]; float max = x[0]; @@ -1361,8 +1361,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -1443,7 +1443,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f } #if QK_K == 256 -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -1455,7 +1455,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { +void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1532,7 +1532,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1578,15 +1578,15 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int } } -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { - quantize_row_q2_K_reference(x, vy, k); +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { + quantize_row_q2_K_reference(x, (block_q2_K*)vy, k); } -size_t ggml_quantize_q2_K(const float * restrict src, void 
* restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + block_q2_K * GGML_RESTRICT y = (block_q2_K *)dst + j/QK_K; quantize_row_q2_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q2_K)); @@ -1594,7 +1594,7 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n //========================= 3-bit (de)-quantization -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { +void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1708,7 +1708,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } #if QK_K == 256 -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1722,8 +1722,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * GGML_RESTRICT q = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; uint8_t m = 1; memcpy(aux, x[i].scales, 12); @@ -1758,7 +1758,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } } #else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); assert(QK_K == 64); const int nb = k / QK_K; @@ -1767,8 +1767,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * GGML_RESTRICT q = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); @@ -1791,15 +1791,15 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } #endif -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { - quantize_row_q3_K_reference(x, vy, k); +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { + quantize_row_q3_K_reference(x, (block_q3_K*)vy, k); } -size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + block_q3_K * GGML_RESTRICT y = (block_q3_K *)dst + j/QK_K; quantize_row_q3_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q3_K)); @@ -1807,7 +1807,7 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { +void 
quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1914,7 +1914,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1953,18 +1953,18 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int } } -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q4_K * restrict y = vy; + block_q4_K * GGML_RESTRICT y = (block_q4_K*)vy; quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + block_q4_K * GGML_RESTRICT y = (block_q4_K *)dst + j/QK_K; quantize_row_q4_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q4_K)); @@ -1972,7 +1972,7 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { +void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2042,8 +2042,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -2090,8 +2090,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); for (int j = 0; j < 32; ++j) { @@ -2114,7 +2114,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2143,7 +2143,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } #else float d = GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict s = x[i].scales; + const int8_t * GGML_RESTRICT s = x[i].scales; for (int l = 0; l < 8; ++l) { y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 
0 : 16)); @@ -2159,18 +2159,18 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } } -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q5_K * restrict y = vy; + block_q5_K * GGML_RESTRICT y = (block_q5_K*)vy; quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + block_q5_K * GGML_RESTRICT y = (block_q5_K *)dst + j/QK_K; quantize_row_q5_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q5_K)); @@ -2178,7 +2178,7 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { +void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2228,8 +2228,8 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].ql; + uint8_t * GGML_RESTRICT qh = y[i].qh; #if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -2260,7 +2260,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2268,9 +2268,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict ql = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT ql = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT sc = x[i].scales; #if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { @@ -2307,9 +2307,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int } } -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q6_K * restrict y = vy; + block_q6_K * GGML_RESTRICT y = (block_q6_K*)vy; quantize_row_q6_K_reference(x, y, k); } @@ -2318,7 +2318,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + block_q6_K * GGML_RESTRICT y = (block_q6_K *)dst + j/QK_K; quantize_row_q6_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q6_K)); @@ -2326,7 +2326,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * //===================================== Q8_K ============================================== -void quantize_row_q8_K_reference(const float 
* restrict x, block_q8_K * restrict y, int k) { +void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2363,7 +2363,7 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2374,8 +2374,8 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int } } -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { - quantize_row_q8_K_reference(x, y, k); +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q8_K_reference(x, (block_q8_K*)y, k); } //===================================== Dot ptoducts ================================= @@ -2423,14 +2423,15 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { + //fprintf(stderr, "%s: n:%d s:%f vx:%p vy:%p\n", __func__, n,*s, vx, vy); const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q4_0 * GGML_RESTRICT x = (const block_q4_0*)vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2439,10 +2440,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2733,14 +2734,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, #endif } -void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_1_q8_1(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q4_1 * GGML_RESTRICT x = (const block_q4_1*)vx; + const block_q8_1 * GGML_RESTRICT y = (const block_q8_1*)vy; // TODO: add WASM SIMD #if defined(__ARM_NEON) @@ -2752,10 +2753,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i + 0]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q4_1 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q4_1 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i + 
0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[i + 1]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; @@ -2893,15 +2894,15 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_0_q8_0(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_0); - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q5_0 * GGML_RESTRICT x = (const block_q5_0*)vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2916,10 +2917,10 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q5_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q5_0 * GGML_RESTRICT x0 = &x[i]; + const block_q5_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3000,8 +3001,8 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q8_0 * restrict y0 = &y[i]; + const block_q5_0 * GGML_RESTRICT x0 = &x[i]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i]; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3199,15 +3200,15 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_1_q8_1(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_1); - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q5_1 * GGML_RESTRICT x = (const block_q5_1*)vx; + const block_q8_1 * GGML_RESTRICT y = (const block_q8_1*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3225,10 +3226,10 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q5_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q5_1 * GGML_RESTRICT x0 = &x[i]; + const block_q5_1 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i]; + const block_q8_1 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3314,8 +3315,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q8_1 * restrict y0 = &y[i]; + const block_q5_1 * GGML_RESTRICT x0 = &x[i]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s; @@ -3518,14 
+3519,14 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q8_0_q8_0(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q8_0 * GGML_RESTRICT x = (const block_q8_0*)vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3534,10 +3535,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const int8x16_t x0_0 = vld1q_s8(x0->qs); const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); @@ -3642,10 +3643,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri } #if QK_K == 256 -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * GGML_RESTRICT x = (const block_q2_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -3667,9 +3668,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; const uint8x16_t mins_and_scales = vld1q_u8(sc); const uint8x16_t scales = vandq_u8(mins_and_scales, m4); @@ -3746,8 +3747,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); @@ -3813,8 +3814,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // load mins and scales from block_q2_K.scales[QK_K/16] const __m128i mins_and_scales = _mm_loadu_si128((const 
__m128i*)x[i].scales); @@ -4035,10 +4036,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4061,9 +4062,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4114,8 +4115,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * GGML_RESTRICT db = (const uint8_t *)&ud; + const uint8_t * GGML_RESTRICT mb = (const uint8_t *)&um; float summs = 0; @@ -4126,10 +4127,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4166,8 +4167,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * GGML_RESTRICT db = (const uint8_t *)&ud; + const uint8_t * GGML_RESTRICT mb = (const uint8_t *)&um; float summs = 0; @@ -4178,10 +4179,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4227,9 +4228,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint32_t * GGML_RESTRICT 
sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4311,14 +4312,14 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * GGML_RESTRICT x = (const block_q3_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -4346,9 +4347,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -4454,8 +4455,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); @@ -4559,8 +4560,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales aux = (const uint32_t *)x[i].scales; @@ -4694,9 +4695,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(aux, x[i].scales, 12); utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); @@ -4806,11 +4807,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; @@ -4855,11 +4856,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const 
block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4947,8 +4948,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5018,8 +5019,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5098,8 +5099,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5173,10 +5174,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 8; ++l) { a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4); a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 
0 : 4); @@ -5213,11 +5214,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * GGML_RESTRICT x = (const block_q4_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -5262,8 +5263,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -5334,8 +5335,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri utmp[2] = uaux; utmp[0] &= kmask1; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -5393,8 +5394,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -5494,8 +5495,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vl = 32; @@ -5548,10 +5549,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); a += 32; @@ -5594,11 +5595,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif } #else -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -5618,14 +5619,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sum_mins = 0.f; 
uint16_t aux16[2]; - const uint8_t * restrict scales = (const uint8_t *)aux16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)aux16; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint16_t * restrict a = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT a = (const uint16_t *)x[i].scales; aux16[0] = a[0] & 0x0f0f; aux16[1] = (a[0] >> 4) & 0x0f0f; @@ -5698,8 +5699,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m256i q4l = _mm256_and_si256(q4bits, m4); @@ -5744,8 +5745,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0); @@ -5778,16 +5779,16 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #elif defined __riscv_v_intrinsic uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5827,17 +5828,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri memset(sums, 0, 8*sizeof(float)); uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - uint8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + uint8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF; for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5861,11 +5862,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const 
block_q5_K * GGML_RESTRICT x = (const block_q5_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -5911,9 +5912,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -5976,8 +5977,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; #if QK_K == 256 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6065,8 +6066,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -6163,9 +6164,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri vl = 8; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; @@ -6249,11 +6250,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); @@ -6302,11 +6303,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -6328,9 +6329,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT 
q8 = y[i].qs; const uint8x8_t qhbits = vld1_u8(qh); @@ -6387,8 +6388,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6433,8 +6434,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6490,9 +6491,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); @@ -6560,10 +6561,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 32; ++l) { a[l+ 0] = q4[l] & 0xF; a[l+32] = q4[l] >> 4; @@ -6574,7 +6575,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri } const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; + const int8_t * GGML_RESTRICT sc = x[i].scales; for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; @@ -6591,11 +6592,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #if QK_K == 256 -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * GGML_RESTRICT x = (const block_q6_K *)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K *)vy; const int nb = n / QK_K; @@ -6618,11 +6619,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); @@ -6750,9 +6751,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const 
uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6830,9 +6831,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6942,11 +6943,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; size_t vl; @@ -7030,11 +7031,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; @@ -7067,11 +7068,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -7094,11 +7095,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; int32_t isum = 0; @@ -7157,9 +7158,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ 
-7214,9 +7215,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ -7281,11 +7282,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; int32_t isum = 0; @@ -7350,11 +7351,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 16; ++l) { a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; diff --git a/ggml-quants.h b/ggml-quants.h index 70c12c27465e8..2706e36ada7d3 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_ // Quantization -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); - -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); - -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); - -void 
quantize_row_q2_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k); +void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k); +void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k); +void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k); +void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k); +void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k); + +void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k); +void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k); +void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k); +void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k); +void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k); +void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k); + +void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k); + +void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k); // Dequantization -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); -//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); - -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict 
y, int k); -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); +void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k); +//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k); + +void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k); // Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); + +void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q6_K_q8_K(int n, float * 
__restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
diff --git a/ggml.c b/ggml.cpp
similarity index 98%
rename from ggml.c
rename to ggml.cpp
index f92292b39c635..f1a0e5358859a 100644
--- a/ggml.c
+++ b/ggml.cpp
@@ -38,6 +38,14 @@
 #pragma warning(disable: 4996)
 #endif
+// initializers for static data called in the ggml_init function
+static size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {};
+static char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {};
+
+void type_traits_init();
+void GGUF_TYPE_SIZE_init();
+void GGUF_TYPE_NAME_init();
+
 #if defined(_WIN32)
 #include
@@ -86,7 +94,9 @@ static int sched_yield (void) {
 }
 #else
 #include
-#include
+//#include
+#include
+using namespace std;
 typedef void * thread_ret_t;
@@ -96,6 +106,8 @@ typedef void * thread_ret_t;
 #endif
+#include
+
 #ifdef GGML_USE_CPU_HBM
 #include
 #endif
@@ -409,37 +421,39 @@ int64_t ggml_cycles_per_ms(void) {
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
+static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y);
+static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y);
-static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_I8] = {
+
+static ggml_type_traits_t type_traits[GGML_TYPE_COUNT];
+void type_traits_init(){
+    type_traits[GGML_TYPE_I8] = {
         .type_name = "i8",
         .blck_size = 1,
         .type_size = sizeof(int8_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_I16] = {
+    };
+    type_traits[GGML_TYPE_I16] = {
         .type_name = "i16",
         .blck_size = 1,
         .type_size = sizeof(int16_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_I32] = {
+    };
+    type_traits[GGML_TYPE_I32] = {
         .type_name = "i32",
         .blck_size = 1,
         .type_size = sizeof(int32_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_F32] = {
+    };
+    type_traits[GGML_TYPE_F32] = {
         .type_name = "f32",
         .blck_size = 1,
         .type_size = sizeof(float),
         .is_quantized = false,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
-    },
-    [GGML_TYPE_F16] = {
+    };
+    type_traits[GGML_TYPE_F16] = {
         .type_name = "f16",
         .blck_size = 1,
         .type_size = sizeof(ggml_fp16_t),
@@ -449,8 +463,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
-    },
-    [GGML_TYPE_Q4_0] = {
+    };
+    type_traits[GGML_TYPE_Q4_0] = {
         .type_name = "q4_0",
         .blck_size = QK4_0,
         .type_size = sizeof(block_q4_0),
@@ -460,8 +474,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
         .vec_dot = ggml_vec_dot_q4_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q4_1] = {
+    };
+    type_traits[GGML_TYPE_Q4_1] = {
         .type_name = "q4_1",
         .blck_size = QK4_1,
         .type_size = sizeof(block_q4_1),
@@ -471,8 +485,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
         .vec_dot = ggml_vec_dot_q4_1_q8_1,
         .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [4] = { // GGML_TYPE_Q4_2
+    };
+    type_traits[4] = { // GGML_TYPE_Q4_2
         .type_name = "DEPRECATED",
         .blck_size = 0,
         .type_size = 0,
@@ -482,8 +496,8 @@ static
const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, - }, - [5] = { // GGML_TYPE_Q4_3 + }; + type_traits[5] = { // GGML_TYPE_Q4_3 .type_name = "DEPRECATED", .blck_size = 0, .type_size = 0, @@ -493,8 +507,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, - }, - [GGML_TYPE_Q5_0] = { + }; + type_traits[GGML_TYPE_Q5_0] = { .type_name = "q5_0", .blck_size = QK5_0, .type_size = sizeof(block_q5_0), @@ -504,8 +518,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, .vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, - }, - [GGML_TYPE_Q5_1] = { + }; + type_traits[GGML_TYPE_Q5_1] = { .type_name = "q5_1", .blck_size = QK5_1, .type_size = sizeof(block_q5_1), @@ -515,8 +529,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, .vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, - }, - [GGML_TYPE_Q8_0] = { + }; + type_traits[GGML_TYPE_Q8_0] = { .type_name = "q8_0", .blck_size = QK8_0, .type_size = sizeof(block_q8_0), @@ -526,8 +540,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, - }, - [GGML_TYPE_Q8_1] = { + }; + type_traits[GGML_TYPE_Q8_1] = { .type_name = "q8_1", .blck_size = QK8_1, .type_size = sizeof(block_q8_1), @@ -535,8 +549,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_1, .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, - }, - [GGML_TYPE_Q2_K] = { + }; + type_traits[GGML_TYPE_Q2_K] = { .type_name = "q2_K", .blck_size = QK_K, .type_size = sizeof(block_q2_K), @@ -546,8 +560,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, .vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q3_K] = { + }; + type_traits[GGML_TYPE_Q3_K] = { .type_name = "q3_K", .blck_size = QK_K, .type_size = sizeof(block_q3_K), @@ -557,8 +571,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, .vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q4_K] = { + }; + type_traits[GGML_TYPE_Q4_K] = { .type_name = "q4_K", .blck_size = QK_K, .type_size = sizeof(block_q4_K), @@ -568,8 +582,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q5_K] = { + }; + type_traits[GGML_TYPE_Q5_K] = { .type_name = "q5_K", .blck_size = QK_K, .type_size = sizeof(block_q5_K), @@ -579,8 +593,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, .vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q6_K] = { + }; + type_traits[GGML_TYPE_Q6_K] = { .type_name = "q6_K", .blck_size = QK_K, .type_size = sizeof(block_q6_K), @@ -590,15 +604,15 @@ static const ggml_type_traits_t 
type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, .vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q8_K] = { + }; + type_traits[GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, .type_size = sizeof(block_q8_K), .is_quantized = true, .from_float = quantize_row_q8_K, - } -}; + }; +} // For internal test use ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { @@ -1160,7 +1174,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) { #ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1197,7 +1211,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest *s = sumf; } -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { +static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y) { ggml_float sumf = 0.0; #if defined(GGML_SIMD) @@ -1235,10 +1249,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes -inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { +inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) { ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL]; for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); @@ -1288,7 +1302,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re } } -inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1320,10 +1334,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float } // xs and vs are byte strides of x and v -inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) { - const float * restrict x[GGML_VEC_MAD_UNROLL]; - const float * restrict v[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL]; for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { x[i] = (const float *) ((const char *) xv + i*xs); @@ -2175,18 +2189,26 
@@ static inline int ggml_up(int n, int m) {
 ////////////////////////////////////////////////////////////////////////////////
-struct ggml_context * ggml_init(struct ggml_init_params params) {
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    static bool is_first_call = true;
-    if (is_first_call) {
-        // initialize time system (required on Windows)
-        ggml_time_init();
+struct ggml_context * ggml_init(struct ggml_init_params params) {
-        // initialize GELU, Quick GELU, SILU and EXP F32 tables
-        {
+    // initialize the data in the arrays
+    type_traits_init();
+    GGUF_TYPE_SIZE_init();
+    GGUF_TYPE_NAME_init();
+
+    struct ggml_context * ctx = NULL;
+    static bool is_first_call = true;
+    // make this function thread safe
+    ggml_critical_section_start();
+
+
+    if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
+        // initialize GELU, Quick GELU, SILU and EXP F32 tables
+        {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
             ggml_fp16_t ii;
@@ -2238,7 +2260,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     }
     // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
+
     for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
         if (!g_state.contexts[i].used) {
@@ -2402,7 +2424,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
     // align to GGML_MEM_ALIGN
     size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
@@ -2475,7 +2497,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
             return NULL;
         }
-        data = (char * const) ctx->scratch.data + ctx->scratch.offs;
+        data = (void*)(((char *)ctx->scratch.data) + ctx->scratch.offs);
         ctx->scratch.offs += data_size;
     } else {
@@ -2630,7 +2652,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
     const int nc = tensor->ne[0];
     const size_t n1 = tensor->nb[1];
-    char * const data = tensor->data;
+    char * const data = (char*)tensor->data;
     switch (tensor->type) {
         case GGML_TYPE_I8:
@@ -2682,7 +2704,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
     const int nc = tensor->ne[0];
     const size_t n1 = tensor->nb[1];
-    char * const data = tensor->data;
+    char * const data = (char*)tensor->data;
     switch (tensor->type) {
         case GGML_TYPE_I8:
@@ -3063,7 +3085,7 @@ struct ggml_tensor * ggml_view_tensor(
 struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     while (obj != NULL) {
         if (obj->type == GGML_OBJECT_TENSOR) {
@@ -3080,7 +3102,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
     struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
     obj = obj->next;
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     while (obj != NULL) {
         if (obj->type == GGML_OBJECT_TENSOR) {
@@ -3096,7 +3118,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     while
(obj != NULL) { if (obj->type == GGML_OBJECT_TENSOR) { @@ -3292,7 +3314,7 @@ static struct ggml_tensor * ggml_acc_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ACC; @@ -4145,7 +4167,7 @@ static struct ggml_tensor * ggml_set_impl( // make a view of the destination struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + int32_t params[] = { (int32_t)nb1,(int32_t) nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_SET; @@ -5402,7 +5424,7 @@ struct ggml_tensor * ggml_pool_2d( }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; + int32_t params[] = { op, k0, k1, s0, s1, (int32_t)p0, (int32_t)p1 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_2D; @@ -8262,7 +8284,7 @@ static void ggml_compute_forward_repeat_back_f32( GGML_ASSERT(nb00 == sizeof(float)); if (ggml_is_contiguous(dst)) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); } else { for (int k3 = 0; k3 < ne3; k3++) { for (int k2 = 0; k2 < ne2; k2++) { @@ -9390,6 +9412,7 @@ static void ggml_compute_forward_mul_mat( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -9492,7 +9515,7 @@ static void ggml_compute_forward_mul_mat( if (params->type == GGML_TASK_INIT) { if (src1->type != vec_dot_type) { - char * wdata = params->wdata; + char * wdata = (char*)params->wdata; const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); for (int64_t i13 = 0; i13 < ne13; ++i13) { @@ -9646,7 +9669,7 @@ static void ggml_compute_forward_out_prod_f32( return; } #endif - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); return; } @@ -9829,7 +9852,7 @@ static void ggml_compute_forward_out_prod_q_f32( // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (params->type == GGML_TASK_INIT) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); return; } @@ -11843,7 +11866,7 @@ static void ggml_compute_forward_pool_1d( struct ggml_tensor * dst) { const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; + enum ggml_op_pool op = (ggml_op_pool)opts[0]; const int k0 = opts[1]; const int s0 = opts[2]; const int p0 = opts[3]; @@ -11867,7 +11890,7 @@ static void ggml_compute_forward_pool_2d( } const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; + enum ggml_op_pool op = (ggml_op_pool)opts[0]; const int k0 = opts[1]; const int k1 = opts[2]; const int s0 = opts[3]; @@ -14098,7 +14121,7 @@ static struct ggml_hash_set ggml_hash_set_new(size_t size) { size = ggml_hash_size(size); struct ggml_hash_set result; result.size = size; - result.keys = malloc(sizeof(struct ggml_tensor *) * size); + result.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * size); memset(result.keys, 0, 
sizeof(struct ggml_tensor *) * size); return result; } @@ -14113,9 +14136,9 @@ struct hash_map { }; static struct hash_map * ggml_new_hash_map(size_t size) { - struct hash_map * result = malloc(sizeof(struct hash_map)); + struct hash_map * result = (hash_map*)malloc(sizeof(struct hash_map)); result->set = ggml_hash_set_new(size); - result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size); + result->vals = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * result->set.size); memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size); return result; } @@ -16034,7 +16057,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + struct ggml_compute_state * workers = (ggml_compute_state*)alloca(sizeof(struct ggml_compute_state)*n_threads); // create thread pool if (n_threads > 1) { @@ -16631,7 +16654,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { continue; } - GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0); + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name((ggml_op)i), (double) perf_total_per_op_us[i] / 1000.0); } GGML_PRINT("========================================\n"); @@ -16903,11 +16926,11 @@ static enum ggml_opt_result ggml_opt_adam( const int n_accum = MAX(1, params.n_gradient_accumulation); const float accum_norm = 1.0f / (float) n_accum; - float * g = opt->adam.g->data; // gradients - float * m = opt->adam.m->data; // first moment - float * v = opt->adam.v->data; // second moment + float * g = (float*)opt->adam.g->data; // gradients + float * m = (float*)opt->adam.m->data; // first moment + float * v = (float*)opt->adam.v->data; // second moment - float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values + float * pf = params.past > 0 ? 
(float *)opt->adam.pf->data : NULL; // past function values struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); @@ -17175,7 +17198,7 @@ static enum ggml_opt_result linesearch_backtracking( } else { // Armijo condition is satisfied if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { - return count; + return (ggml_opt_result)count; } ggml_vec_dot_f32(nx, &dg, g, d); @@ -17186,14 +17209,14 @@ static enum ggml_opt_result linesearch_backtracking( } else { if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { // regular Wolfe conditions - return count; + return (ggml_opt_result)count; } if(dg > -params->lbfgs.wolfe*dginit) { width = dec; } else { // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) - return count; + return (ggml_opt_result)count; } } } @@ -17258,13 +17281,13 @@ static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - float * x = opt->lbfgs.x->data; // current parameters - float * xp = opt->lbfgs.xp->data; // previous parameters - float * g = opt->lbfgs.g->data; // current gradient - float * gp = opt->lbfgs.gp->data; // previous gradient - float * d = opt->lbfgs.d->data; // search direction + float * x = (float*)opt->lbfgs.x->data; // current parameters + float * xp = (float*)opt->lbfgs.xp->data; // previous parameters + float * g = (float*)opt->lbfgs.g->data; // current gradient + float * gp = (float*)opt->lbfgs.gp->data; // previous gradient + float * d = (float*)opt->lbfgs.d->data; // search direction - float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values + float * pf = params.past > 0 ? 
(float*)opt->lbfgs.pf->data : NULL; // past function values const int n_accum = MAX(1, params.n_gradient_accumulation); const float accum_norm = 1.0f / (float) n_accum; @@ -17277,10 +17300,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_opt_get_params(np, ps, x); // the L-BFGS memory - float * lm_alpha = opt->lbfgs.lmal->data; - float * lm_ys = opt->lbfgs.lmys->data; - float * lm_s = opt->lbfgs.lms->data; - float * lm_y = opt->lbfgs.lmy->data; + float * lm_alpha = (float*)opt->lbfgs.lmal->data; + float * lm_ys = (float*)opt->lbfgs.lmys->data; + float * lm_s = (float*)opt->lbfgs.lms->data; + float * lm_y = (float*)opt->lbfgs.lmy->data; bool cancel = false; @@ -17377,7 +17400,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, x, xp); ggml_vec_cpy_f32(nx, g, gp); - return ls; + return (ggml_opt_result)ls; } opt->loss_after = fx; @@ -17564,7 +17587,7 @@ GGML_API void ggml_opt_init( opt->nx = nx; opt->just_initialized = true; if (opt->ctx == NULL) { - struct ggml_init_params ctx_opt_params; + struct ggml_init_params ctx_opt_params; if (opt->params.type == GGML_OPT_ADAM) { ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; if (opt->params.past > 0) { @@ -17718,7 +17741,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_0; for (int b = 0; b < n; b += k) { - block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; + block_q4_0 * GGML_RESTRICT y = (block_q4_0 *) dst + b/QK4_0; quantize_row_q4_0_reference(src + b, y, k); @@ -17741,7 +17764,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_1; for (int b = 0; b < n; b += k) { - block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; + block_q4_1 * GGML_RESTRICT y = (block_q4_1 *) dst + b/QK4_1; quantize_row_q4_1_reference(src + b, y, k); @@ -17764,7 +17787,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_0; for (int b = 0; b < n; b += k) { - block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; + block_q5_0 * GGML_RESTRICT y = (block_q5_0 *)dst + b/QK5_0; quantize_row_q5_0_reference(src + b, y, k); @@ -17794,7 +17817,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_1; for (int b = 0; b < n; b += k) { - block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; + block_q5_1 * GGML_RESTRICT y = (block_q5_1 *)dst + b/QK5_1; quantize_row_q5_1_reference(src + b, y, k); @@ -17824,7 +17847,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK8_0; for (int b = 0; b < n; b += k) { - block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; + block_q8_0 * GGML_RESTRICT y = (block_q8_0 *)dst + b/QK8_0; quantize_row_q8_0_reference(src + b, y, k); @@ -17928,37 +17951,39 @@ struct gguf_str { char * data; }; -static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = sizeof(uint8_t), - [GGUF_TYPE_INT8] = sizeof(int8_t), - [GGUF_TYPE_UINT16] = sizeof(uint16_t), - [GGUF_TYPE_INT16] = sizeof(int16_t), - [GGUF_TYPE_UINT32] = sizeof(uint32_t), - [GGUF_TYPE_INT32] = sizeof(int32_t), - [GGUF_TYPE_FLOAT32] = sizeof(float), - [GGUF_TYPE_BOOL] = sizeof(bool), - [GGUF_TYPE_STRING] = sizeof(struct gguf_str), - [GGUF_TYPE_UINT64] = sizeof(uint64_t), - [GGUF_TYPE_INT64] = sizeof(int64_t), - [GGUF_TYPE_FLOAT64] = sizeof(double), - [GGUF_TYPE_ARRAY] = 0, // undefined + +void GGUF_TYPE_SIZE_init() { + 
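    // Background: the original C code filled GGUF_TYPE_SIZE (and GGUF_TYPE_NAME below)
    // with C99 array designated initializers such as [GGUF_TYPE_UINT8] = sizeof(uint8_t).
    // C++ does not accept array designators (C++20 only allows in-order member
    // designators), which is presumably why the tables are now filled at runtime by
    // these *_init() functions, called once from the top of ggml_init() together with
    // type_traits_init().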
GGUF_TYPE_SIZE[GGUF_TYPE_UINT8] = sizeof(uint8_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT8] = sizeof(int8_t); + GGUF_TYPE_SIZE[GGUF_TYPE_UINT16] = sizeof(uint16_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT16] = sizeof(int16_t); + GGUF_TYPE_SIZE[GGUF_TYPE_UINT32] = sizeof(uint32_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT32] = sizeof(int32_t); + GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT32] = sizeof(float); + GGUF_TYPE_SIZE[GGUF_TYPE_BOOL] = sizeof(bool); + GGUF_TYPE_SIZE[GGUF_TYPE_STRING] = sizeof(struct gguf_str); + GGUF_TYPE_SIZE[GGUF_TYPE_UINT64] = sizeof(uint64_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT64] = sizeof(int64_t); + GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT64] = sizeof(double); + GGUF_TYPE_SIZE[GGUF_TYPE_ARRAY] = 0; // undefined }; static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); -static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = "u8", - [GGUF_TYPE_INT8] = "i8", - [GGUF_TYPE_UINT16] = "u16", - [GGUF_TYPE_INT16] = "i16", - [GGUF_TYPE_UINT32] = "u32", - [GGUF_TYPE_INT32] = "i32", - [GGUF_TYPE_FLOAT32] = "f32", - [GGUF_TYPE_BOOL] = "bool", - [GGUF_TYPE_STRING] = "str", - [GGUF_TYPE_ARRAY] = "arr", - [GGUF_TYPE_UINT64] = "u64", - [GGUF_TYPE_INT64] = "i64", - [GGUF_TYPE_FLOAT64] = "f64", + +void GGUF_TYPE_NAME_init(){ + GGUF_TYPE_NAME[GGUF_TYPE_UINT8] = "u8"; + GGUF_TYPE_NAME[GGUF_TYPE_INT8] = "i8"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT16] = "u16"; + GGUF_TYPE_NAME[GGUF_TYPE_INT16] = "i16"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT32] = "u32"; + GGUF_TYPE_NAME[GGUF_TYPE_INT32] = "i32"; + GGUF_TYPE_NAME[GGUF_TYPE_FLOAT32] = "f32"; + GGUF_TYPE_NAME[GGUF_TYPE_BOOL] = "bool"; + GGUF_TYPE_NAME[GGUF_TYPE_STRING] = "str"; + GGUF_TYPE_NAME[GGUF_TYPE_ARRAY] = "arr"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT64] = "u64"; + GGUF_TYPE_NAME[GGUF_TYPE_INT64] = "i64"; + GGUF_TYPE_NAME[GGUF_TYPE_FLOAT64] = "f64"; }; static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); @@ -18040,14 +18065,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { bool ok = true; - ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1); ok = ok && gguf_fread_el(file, p->data, p->n, offset); return ok; } struct gguf_context * gguf_init_empty(void) { - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; @@ -18092,7 +18117,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p bool ok = true; - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); // read the header { @@ -18124,7 +18149,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the kv pairs { - ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); + ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; @@ -18199,7 +18224,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the tensor infos { - ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); for 
(uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -18319,10 +18344,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // create the tensors for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { const int64_t ne[GGML_MAX_DIMS] = { - ctx->infos[i].ne[0], - ctx->infos[i].ne[1], - ctx->infos[i].ne[2], - ctx->infos[i].ne[3], + (int64_t)ctx->infos[i].ne[0], + (int64_t)ctx->infos[i].ne[1], + (int64_t)ctx->infos[i].ne[2], + (int64_t)ctx->infos[i].ne[3], }; struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); @@ -18603,7 +18628,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int n_kv = gguf_get_n_kv(ctx); - ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); ctx->kv[n_kv].key.n = strlen(key); ctx->kv[n_kv].key.data = strdup(key); ctx->header.n_kv++; @@ -18739,7 +18764,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { case GGUF_TYPE_ARRAY: { if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { - const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *)); for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; } @@ -18760,7 +18785,7 @@ void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { const int idx = ctx->header.n_tensors; - ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); ctx->infos[idx].name.n = strlen(tensor->name); ctx->infos[idx].name.data = strdup(tensor->name); diff --git a/ggml.h b/ggml.h index f2fce0f22d357..1d69be2b00347 100644 --- a/ggml.h +++ b/ggml.h @@ -285,8 +285,10 @@ GGML_UNUSED(prefix##3); #ifdef __cplusplus +#ifndef CPP_ONLY extern "C" { #endif +#endif #if defined(__ARM_NEON) && defined(__CUDACC__) typedef half ggml_fp16_t; @@ -1859,7 +1861,7 @@ extern "C" { int n_gradient_accumulation; // ADAM parameters - struct { + struct ggml_adam{ int n_iter; float sched; // schedule multiplier (fixed, decay or warmup) @@ -1875,7 +1877,7 @@ extern "C" { } adam; // LBFGS parameters - struct { + struct ggml_lbfgs{ int m; // number of corrections to approximate the inv. 
Hessian int n_iter; int max_linesearch; @@ -1902,7 +1904,7 @@ extern "C" { float loss_before; float loss_after; - struct { + struct ggml_grad{ struct ggml_tensor * g; // current gradient struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment @@ -1912,7 +1914,7 @@ extern "C" { int n_no_improvement; } adam; - struct { + struct ggml_params{ struct ggml_tensor * x; // current parameters struct ggml_tensor * xp; // previous parameters struct ggml_tensor * g; // current gradient @@ -2134,15 +2136,15 @@ extern "C" { #ifdef __cplusplus // restrict not standard in C++ -#define GGML_RESTRICT +#define GGML_RESTRICT #else -#define GGML_RESTRICT restrict +#define GGML_RESTRICT __restrict__ #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); - typedef struct { + typedef struct ggml_something{ const char * type_name; int blck_size; size_t type_size; @@ -2157,5 +2159,7 @@ extern "C" { GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); #ifdef __cplusplus +#ifndef CPP_ONLY } #endif +#endif diff --git a/llama-internal.hpp b/llama-internal.hpp new file mode 100644 index 0000000000000..33cf39e5d4f58 --- /dev/null +++ b/llama-internal.hpp @@ -0,0 +1,896 @@ +#include +#include +enum llm_arch { + LLM_ARCH_LLAMA, + LLM_ARCH_FALCON, + LLM_ARCH_BAICHUAN, + LLM_ARCH_GPT2, + LLM_ARCH_GPTJ, + LLM_ARCH_GPTNEOX, + LLM_ARCH_MPT, + LLM_ARCH_STARCODER, + LLM_ARCH_PERSIMMON, + LLM_ARCH_REFACT, + LLM_ARCH_BLOOM, + LLM_ARCH_STABLELM, + LLM_ARCH_UNKNOWN, +}; + +enum llm_kv { + LLM_KV_GENERAL_ARCHITECTURE, + LLM_KV_GENERAL_QUANTIZATION_VERSION, + LLM_KV_GENERAL_ALIGNMENT, + LLM_KV_GENERAL_NAME, + LLM_KV_GENERAL_AUTHOR, + LLM_KV_GENERAL_URL, + LLM_KV_GENERAL_DESCRIPTION, + LLM_KV_GENERAL_LICENSE, + LLM_KV_GENERAL_SOURCE_URL, + LLM_KV_GENERAL_SOURCE_HF_REPO, + + LLM_KV_CONTEXT_LENGTH, + LLM_KV_EMBEDDING_LENGTH, + LLM_KV_BLOCK_COUNT, + LLM_KV_FEED_FORWARD_LENGTH, + LLM_KV_USE_PARALLEL_RESIDUAL, + LLM_KV_TENSOR_DATA_LAYOUT, + + LLM_KV_ATTENTION_HEAD_COUNT, + LLM_KV_ATTENTION_HEAD_COUNT_KV, + LLM_KV_ATTENTION_MAX_ALIBI_BIAS, + LLM_KV_ATTENTION_CLAMP_KQV, + LLM_KV_ATTENTION_LAYERNORM_EPS, + LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + + LLM_KV_ROPE_DIMENSION_COUNT, + LLM_KV_ROPE_FREQ_BASE, + LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, + + LLM_KV_TOKENIZER_MODEL, + LLM_KV_TOKENIZER_LIST, + LLM_KV_TOKENIZER_TOKEN_TYPE, + LLM_KV_TOKENIZER_SCORES, + LLM_KV_TOKENIZER_MERGES, + LLM_KV_TOKENIZER_BOS_ID, + LLM_KV_TOKENIZER_EOS_ID, + LLM_KV_TOKENIZER_UNK_ID, + LLM_KV_TOKENIZER_SEP_ID, + LLM_KV_TOKENIZER_PAD_ID, + LLM_KV_TOKENIZER_ADD_BOS, + LLM_KV_TOKENIZER_ADD_EOS, + LLM_KV_TOKENIZER_HF_JSON, + LLM_KV_TOKENIZER_RWKV, +}; + +// available llama models +enum e_model { + MODEL_UNKNOWN, + MODEL_1B, + MODEL_3B, + MODEL_7B, + MODEL_8B, + MODEL_13B, + MODEL_15B, + MODEL_30B, + MODEL_34B, + MODEL_40B, + MODEL_65B, + MODEL_70B, +}; + +enum llama_fver { + GGUF_FILE_VERSION_V1 = 1, + GGUF_FILE_VERSION_V2 = 2, + GGUF_FILE_VERSION_V3 = 3, +}; + +struct LLM_KV { + LLM_KV(llm_arch arch) : arch(arch) {} + + llm_arch arch; + + std::string operator()(llm_kv kv) const; // moved to llama.cpp file + +}; + +enum llm_tensor { + 
LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_POS_EMBD, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, +}; + + +struct llama_cparams { + uint32_t n_ctx; // context size used during inference + uint32_t n_batch; + uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads_batch; // number of threads to use for batch processing + + float rope_freq_base; + float rope_freq_scale; + + uint32_t n_yarn_orig_ctx; + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. + float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; + + bool mul_mat_q; +}; + +struct llama_layer { + // normalization + struct ggml_tensor * attn_norm; + struct ggml_tensor * attn_norm_b; + struct ggml_tensor * attn_norm_2; + struct ggml_tensor * attn_norm_2_b; + struct ggml_tensor * attn_q_norm; + struct ggml_tensor * attn_q_norm_b; + struct ggml_tensor * attn_k_norm; + struct ggml_tensor * attn_k_norm_b; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + struct ggml_tensor * wqkv; + + // attention bias + struct ggml_tensor * bo; + struct ggml_tensor * bqkv; + + // normalization + struct ggml_tensor * ffn_norm; + struct ggml_tensor * ffn_norm_b; + + // ff + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 + + // ff bias + struct ggml_tensor * ffn_down_b; // b2 + struct ggml_tensor * ffn_up_b; // b3 +}; + +struct llama_kv_cell { + llama_pos pos = -1; + llama_pos delta = 0; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } +}; + +struct llama_buffer { + void * data = NULL; + size_t size = 0; + + // fallback to malloc / free + // useful in cases where CUDA can try to allocate PINNED memory + bool fallback = false; + + void resize(size_t n) ; + + + ~llama_buffer(); + +}; + +// ring-buffer of cached KV data +struct llama_kv_cache { + bool has_shift = false; + + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. llama_decode_internal also uses it, so it + // cannot be freely changed after a slot has been allocated. 
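    // The fields below, as used elsewhere in llama.cpp: head appears to be the index
    // at which K/V data for newly decoded tokens is written, size the total number of
    // cells in the ring buffer, and n (recomputed before each graph build) the number
    // of cells the attention graph actually considers.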
+ uint32_t head = 0; + uint32_t size = 0; + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + + struct ggml_tensor * k = NULL; + struct ggml_tensor * v = NULL; + + struct ggml_context * ctx = NULL; + + llama_buffer buf; + + ~llama_kv_cache() { + if (ctx) { + ggml_free(ctx); + } + +#ifdef GGML_USE_CUBLAS + if (ggml_cublas_loaded()) { + ggml_cuda_free_data(k); + ggml_cuda_free_data(v); + } +#endif + } +}; + +struct llama_vocab { + using id = int32_t; + using token = std::string; + using ttype = llama_token_type; + + struct token_data { + token text; + float score; + ttype type; + }; + + enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; + + std::unordered_map token_to_id; + std::vector id_to_token; + + std::unordered_map special_tokens_cache; + + std::map, int> bpe_ranks; + + // default LLaMA special tokens + id special_bos_id = 1; + id special_eos_id = 2; + id special_unk_id = 0; + id special_sep_id = -1; + id special_pad_id = -1; + + int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. + int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. + + id linefeed_id = 13; + id special_prefix_id = 32007; + id special_middle_id = 32009; + id special_suffix_id = 32008; + id special_eot_id = 32010; + + int find_bpe_rank(std::string token_left, std::string token_right) const { + GGML_ASSERT(token_left.find(" ") == std::string::npos); + GGML_ASSERT(token_left.find("\n") == std::string::npos); + GGML_ASSERT(token_right.find(" ") == std::string::npos); + GGML_ASSERT(token_right.find("\n") == std::string::npos); + + auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); + if (it == bpe_ranks.end()) { + return -1; + } + + return it->second; + } +}; + +struct llama_mmap { + void * addr; + size_t size; + + llama_mmap(const llama_mmap &) = delete; + + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false); + ~llama_mmap(); + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; +#else + static constexpr bool SUPPORTED = false; +#endif +}; + + +struct llama_hparams { + bool vocab_only; + uint32_t n_vocab; + uint32_t n_ctx_train; // context size the model was trained on + uint32_t n_embd; + uint32_t n_head; + uint32_t n_head_kv; + uint32_t n_layer; + uint32_t n_rot; + uint32_t n_ff; + + float f_norm_eps; + float f_norm_rms_eps; + + float rope_freq_base_train; + float rope_freq_scale_train; + uint32_t n_yarn_orig_ctx; + int8_t rope_scaling_type_train : 3; + bool rope_finetuned : 1; + + float f_clamp_kqv; + float f_max_alibi_bias; + + bool operator!=(const llama_hparams & other) const; + uint32_t n_gqa() const { + return n_head/n_head_kv; + } + + uint32_t n_embd_head() const { + return n_embd/n_head; + } + + uint32_t n_embd_gqa() const { + return n_embd/n_gqa(); + } +}; + +struct llama_mlock { + void * addr = NULL; + size_t size = 0; + bool failed_already = false; + llama_mlock() ; + + llama_mlock(const llama_mlock &) = delete; + ~llama_mlock(); + void init(void * ptr); + void grow_to(size_t target_size); +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + static size_t lock_granularity(); +#ifdef __APPLE__ +#define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. 
Also try increasing RLIMIT_MLOCK (ulimit -l).\n" +#else +#define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" +#endif + bool raw_lock(const void * addr, size_t size) const ; +#undef MLOCK_SUGGESTION + static void raw_unlock(void * addr, size_t size); +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + static size_t lock_granularity(); + bool raw_lock(void * ptr, size_t len) const ; + static void raw_unlock(void * ptr, size_t len); +#else + static constexpr bool SUPPORTED = false; + static size_t lock_granularity(); + bool raw_lock(const void * addr, size_t len) const; + static void raw_unlock(const void * addr, size_t len); +#endif +}; + + +struct llama_model { + e_model type = MODEL_UNKNOWN; + llm_arch arch = LLM_ARCH_UNKNOWN; + llama_ftype ftype = LLAMA_FTYPE_ALL_F32; + + std::string name = "n/a"; + + llama_hparams hparams = {}; + llama_vocab vocab; + + struct ggml_tensor * tok_embd; + struct ggml_tensor * pos_embd; + struct ggml_tensor * tok_norm; + struct ggml_tensor * tok_norm_b; + + struct ggml_tensor * output_norm; + struct ggml_tensor * output_norm_b; + struct ggml_tensor * output; + + std::vector layers; + + int n_gpu_layers; + + // gguf metadata + std::unordered_map gguf_kv; + + // context + struct ggml_context * ctx = NULL; + + // the model memory buffer + llama_buffer buf; + + // model memory mapped file + std::unique_ptr mapping; + + // objects representing data potentially being locked in memory + llama_mlock mlock_buf; + llama_mlock mlock_mmap; + + // for quantize-stats only + std::vector> tensors_by_name; + + int64_t t_load_us = 0; + int64_t t_start_us = 0; + + ~llama_model() { + if (ctx) { + ggml_free(ctx); + } + +#ifdef GGML_USE_CUBLAS + if (ggml_cublas_loaded()) { + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + ggml_cuda_free_data(tensors_by_name[i].second); + } + ggml_cuda_free_scratch(); + } +#endif + +#if defined(GGML_USE_CLBLAST) + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + ggml_cl_free_data(tensors_by_name[i].second); + } +#endif + } +}; + +struct llama_context { + llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} + ~llama_context(); + + llama_cparams cparams; + + const llama_model & model; + + // key + value cache for the self attention + struct llama_kv_cache kv_self; + + std::mt19937 rng; + + bool has_evaluated_once = false; + + int64_t t_start_us; + int64_t t_load_us; + int64_t t_sample_us = 0; + int64_t t_p_eval_us = 0; + int64_t t_eval_us = 0; + + int32_t n_sample = 0; // number of tokens sampled + int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) + int32_t n_eval = 0; // number of eval calls + + // decode output (2-dimensional array: [n_tokens][n_vocab]) + std::vector logits; + bool logits_all = false; + + // input embedding (1-dimensional array: [n_embd]) + std::vector embedding; + + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; + + // memory buffers used to evaluate the model + llama_buffer buf_compute; + + llama_buffer buf_alloc; + ggml_allocr * alloc = NULL; + +#ifdef GGML_USE_METAL + ggml_metal_context * ctx_metal = NULL; +#endif + +#ifdef GGML_USE_MPI + ggml_mpi_context * ctx_mpi = NULL; +#endif +}; + + +struct LLM_TN { + LLM_TN(llm_arch arch) ; + + llm_arch arch; + + std::string operator()(llm_tensor tensor) const; + + std::string operator()(llm_tensor tensor, const std::string & suffix) const ; + + std::string operator()(llm_tensor 
tensor, int bid) const ; + + std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ; + +}; + + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) ; + size_t tell() const; + void seek(size_t offset, int whence) const; + void read_raw(void * ptr, size_t len) const; + uint32_t read_u32() const; + void write_raw(const void * ptr, size_t len) const ; + void write_u32(std::uint32_t val) const; + ~llama_file(); + +}; + + +struct llama_state { + llama_state(); + // We save the log callback globally + ggml_log_callback log_callback; + void * log_callback_user_data = nullptr; +}; + + + +struct llama_model_loader { + int n_kv = 0; + int n_tensors = 0; + int n_created = 0; + + int64_t n_elements = 0; + size_t n_bytes = 0; + + bool use_mmap = false; + + llama_file file; + llama_ftype ftype; + llama_fver fver; + + std::unique_ptr mapping; + + struct gguf_context * ctx_gguf = NULL; + struct ggml_context * ctx_meta = NULL; + + llama_model_loader(const std::string & fname, bool use_mmap) ; + + ~llama_model_loader(); + + std::string get_arch_name() const; + + enum llm_arch get_arch() const ; + const char * get_tensor_name(int i) const; + + struct ggml_tensor * get_tensor_meta(int i) const; + + void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const; + + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) ; + + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend) ; + + void done_getting_tensors() const; + + size_t file_offset(const char * name) const; + + + void load_data_for(struct ggml_tensor * cur) const ; + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) ; +}; + +struct llama_data_context { + virtual void write(const void * src, size_t size) = 0; + virtual size_t get_size_written() = 0; + virtual ~llama_data_context() = default; +}; + +struct llama_data_buffer_context : llama_data_context { + uint8_t * ptr; + size_t size_written = 0; + llama_data_buffer_context(uint8_t * p) ; + void write(const void * src, size_t size) override ; + size_t get_size_written() override ; +}; + +struct llama_data_file_context : llama_data_context { + llama_file * file; + size_t size_written = 0; + llama_data_file_context(llama_file * f); + size_t get_size_written() override ; + void write(const void * src, size_t size); +}; + + +struct llama_beam { + std::vector tokens; + float p; // Cumulative beam probability (renormalized relative to all beams) + bool eob; // Initialize end-of-beam to false. Callback sets this to true. + // Sort beams by probability. In case of ties, prefer beams at eob. + bool operator<(const llama_beam & rhs) const ; + void shift_tokens(const size_t n) ; + llama_beam_view view() const; +}; + +// A struct for calculating logit-related info. 
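// Judging from the sum_exp functor and fields below, max_l holds the largest logit and
// normalizer is presumably 1 / sum_i exp(logit_i - max_l), so that
// probability_from_logit(l) can return normalizer * exp(l - max_l): the numerically
// stable (log-sum-exp) softmax formulation used when ranking beam candidates.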
+struct llama_logit_info { + const float * const logits; + const int n_vocab; + const float max_l; + const float normalizer; + struct sum_exp { + float max_l; + float operator()(float sum, float l) const { return sum + std::exp(l - max_l); } + }; + llama_logit_info(llama_context * ctx); + llama_token_data get_token_data(const llama_token token_id) const ; + std::vector top_k(size_t k) ; + float probability_from_logit(float logit) const ; +}; + + +struct llama_beam_search_data { + llama_context * ctx; + size_t n_beams; + int n_past; + int n_predict; + std::vector beams; + std::vector next_beams; + size_t common_prefix_length; + std::vector beam_views; + llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict); + void collapse_beams(const size_t beam_idx) ; + void fill_next_beams_by_top_probabilities(llama_beam & beam) ; + size_t find_common_prefix_length() ; + llama_beams_state get_beams_state(const bool last_call) ; + void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data); + static void renormalize_beam_probabilities(std::vector & beams) ; + size_t top_beam_index(); + void update_beams_from_beam_views(); +}; + +using llm_build_cb = std::function; + +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; + +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; + +struct llm_build_context { + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_batch & batch; + const llama_kv_cache & kv_self; + + const int64_t n_embd; + const int64_t n_layer; + const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t n_head; + const int64_t n_head_kv; + const int64_t n_embd_head; + const int64_t n_embd_gqa; + + const float freq_base; + const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; + const float norm_eps; + const float norm_rms_eps; + + const int32_t n_tokens; + const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t kv_head; // index of where we store new KV data in the cache + const int32_t n_orig_ctx; + + const bool do_rope_shift; + + const llm_build_cb & cb; + + llama_buffer & buf_compute; + + struct ggml_context * ctx0 = nullptr; + + // TODO: consider making the entire interface noexcept + llm_build_context( + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case); + + void init() ; + void free() ; + struct ggml_cgraph * build_llama() ; + struct ggml_cgraph * build_baichuan() ; + struct ggml_cgraph * build_falcon() ; + struct ggml_cgraph * build_starcoder() ; + struct ggml_cgraph * build_persimmon() ; + struct ggml_cgraph * build_refact() ; + struct ggml_cgraph * build_bloom() ; + struct ggml_cgraph * build_mpt() ; + struct ggml_cgraph * build_stablelm(); +}; + + +enum llm_offload_func_e { + OFFLOAD_FUNC_NOP, + OFFLOAD_FUNC, + OFFLOAD_FUNC_KQ, + OFFLOAD_FUNC_V, + OFFLOAD_FUNC_NR, + OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_OUT, +}; + +struct llm_offload_trie { + struct node { + ~node() ; + node * children[256] = { nullptr }; + llm_offload_func_e func = OFFLOAD_FUNC_NOP; + }; + node * root = nullptr; + llm_offload_trie(); + llm_offload_trie(const std::unordered_map & map) ; + ~llm_offload_trie(); + void 
add(const char * name, llm_offload_func_e func); + llm_offload_func_e find(const char * name) const; + +}; + +struct llm_symbol { + using index = int; + index prev; + index next; + const char * text; + size_t n; +}; + + +struct llm_bigram_spm { + struct comparator { + bool operator()(llm_bigram_spm & l, llm_bigram_spm & r); + }; + using queue_storage = std::vector; + using queue = std::priority_queue; + llm_symbol::index left; + llm_symbol::index right; + float score; + size_t size; +}; + +struct llm_tokenizer_spm { + llm_tokenizer_spm(const llama_vocab & vocab); + void tokenize(const std::string & text, std::vector & output); + + +private: + void resegment(llm_symbol & symbol, std::vector & output) ; + void try_add_bigram(int left, int right) ; + const llama_vocab & vocab; + + std::vector symbols; + llm_bigram_spm::queue work_queue; + + std::map> rev_merge; +}; + +// BPE tokenizer +// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] +// tried to simplify unicode stuff, so most likely does not work 100% correctly! + +// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused + +struct llm_bigram_bpe { + struct comparator { + bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const ; + }; + + using queue_storage = std::vector; + using queue = std::priority_queue; + llm_symbol::index left; + llm_symbol::index right; + std::string text; + int rank; + size_t size; +}; + +struct llm_tokenizer_bpe { + llm_tokenizer_bpe(const llama_vocab & vocab); + + void tokenize(const std::string & text, std::vector & output); + +private: + void add_new_bigram(int left, int right) ; + + std::vector bpe_gpt2_preprocess(const std::string & text) ; + + const llama_vocab & vocab; + + std::vector symbols; + std::vector symbols_final; + + llm_bigram_bpe::queue work_queue; +}; + +typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{ + FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, + FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT +} FRAGMENT_BUFFER_VARIANT_TYPE; + +struct fragment_buffer_variant{ + fragment_buffer_variant(llama_vocab::id _token); + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length); + const FRAGMENT_BUFFER_VARIANT_TYPE type; + const llama_vocab::id token; + const std::string _dummy; + const std::string & raw_text; + const uint64_t offset; + const uint64_t length; +}; + +struct llama_partial_utf8 { + uint32_t value; // bit value so far (unshifted) + int n_remain; // num bytes remaining; -1 indicates invalid sequence +}; + +struct llama_grammar { + const std::vector> rules; + std::vector> stacks; + + // buffer for partially generated UTF-8 sequence from accepted tokens + llama_partial_utf8 partial_utf8; +}; + +struct llama_grammar_candidate { + size_t index; + const uint32_t * code_points; + llama_partial_utf8 partial_utf8; +}; + +struct quantize_state_internal { + const llama_model & model; + const llama_model_quantize_params * params; + + int n_attention_wv = 0; + int n_feed_forward_w2 = 0; + int i_attention_wv = 0; + int i_feed_forward_w2 = 0; + + int n_k_quantized = 0; + int n_fallback = 0; + + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) + : model(model) + , params(params) + {} +}; diff --git a/llama.h b/llama.h index 1a62058d1406b..9a1e7d04e050a 100644 --- a/llama.h +++ b/llama.h @@ -50,7 +50,9 @@ #endif #ifdef __cplusplus +#ifndef CPP_ONLY extern "C" { +#endif #endif // @@ -827,8 +829,10 @@ extern "C" { LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, 
const struct llama_context * ctx); #ifdef __cplusplus +#ifndef CPP_ONLY } #endif +#endif // Internal API to be implemented by llama.cpp and used by tests/benchmarks only #ifdef LLAMA_API_INTERNAL @@ -844,4 +848,8 @@ const std::vector> & llama_internal #endif // LLAMA_API_INTERNAL + + #endif // LLAMA_H + + diff --git a/print.hpp b/print.hpp new file mode 100644 index 0000000000000..6f0dc6bf5fa6c --- /dev/null +++ b/print.hpp @@ -0,0 +1,763 @@ +#include +#include +#include "llama.h" +#include "ggml-internal.hpp" +#include "llama-internal.hpp" + +REFL_TYPE(ggml_init_params ) +REFL_END + +// we use the named data type patch +#define ggml_opt_params_names +#ifdef ggml_opt_params_names +REFL_TYPE(ggml_opt_params::ggml_adam) +REFL_END + +REFL_TYPE(ggml_opt_params::ggml_lbfgs) +REFL_END + + +REFL_TYPE(ggml_opt_context::ggml_grad ) +REFL_END +#endif +REFL_TYPE(gpt_params ) + +REFL_FIELD( seed ) +REFL_FIELD( n_threads) +REFL_FIELD( n_threads_batch) +REFL_FIELD( n_predict ) +REFL_FIELD( n_ctx ) +REFL_FIELD( n_batch) +REFL_FIELD( n_keep ) +REFL_FIELD( n_draft) +REFL_FIELD( n_chunks ) +REFL_FIELD( n_parallel) +REFL_FIELD( n_sequences) +REFL_FIELD( p_accept ) +REFL_FIELD( p_split ) +REFL_FIELD( n_gpu_layers) +REFL_FIELD( n_gpu_layers_draft) +REFL_FIELD( main_gpu ) +REFL_FIELD( tensor_split) +REFL_FIELD( n_beams ) +REFL_FIELD(rope_freq_base) +REFL_FIELD( rope_freq_scale ) +REFL_FIELD( yarn_ext_factor ) +REFL_FIELD( yarn_attn_factor ) +REFL_FIELD( yarn_beta_fast ) +REFL_FIELD( yarn_beta_slow ) +REFL_FIELD( yarn_orig_ctx) +REFL_FIELD( rope_scaling_type) +REFL_FIELD( sparams) +REFL_FIELD(model ) +REFL_FIELD(model_draft ) +REFL_FIELD(model_alias) +REFL_FIELD(prompt ) +REFL_FIELD(prompt_file ) +REFL_FIELD(path_prompt_cache ) +REFL_FIELD(input_prefix ) +REFL_FIELD(input_suffix ) +REFL_FIELD( antiprompt) +REFL_FIELD(logdir ) +REFL_FIELD( lora_adapter) +REFL_FIELD(lora_base ) +REFL_FIELD( ppl_stride ) +REFL_FIELD( ppl_output_type ) +REFL_FIELD( hellaswag ) +REFL_FIELD( hellaswag_tasks ) +REFL_FIELD( mul_mat_q ) +REFL_FIELD( memory_f16) +REFL_FIELD( random_prompt ) +REFL_FIELD( use_color ) +REFL_FIELD( interactive ) +REFL_FIELD( chatml ) +REFL_FIELD( prompt_cache_all ) +REFL_FIELD( prompt_cache_ro ) +REFL_FIELD( embedding ) +REFL_FIELD( escape ) +REFL_FIELD( interactive_first ) +REFL_FIELD( multiline_input ) +REFL_FIELD( simple_io ) +REFL_FIELD( cont_batching ) +REFL_FIELD( input_prefix_bos ) +REFL_FIELD( ignore_eos ) +REFL_FIELD( instruct ) +REFL_FIELD( logits_all ) +REFL_FIELD( use_mmap) +REFL_FIELD( use_mlock ) +REFL_FIELD( numa ) +REFL_FIELD( verbose_prompt ) +REFL_FIELD( infill ) +REFL_FIELD(mmproj ) +REFL_FIELD( image) + +REFL_END + +REFL_TYPE(llama_sampling_params) +REFL_END + +REFL_TYPE(llm_arch) +REFL_END + +REFL_TYPE(llama_sampling_context ) +REFL_FIELD( params) +REFL_FIELD( mirostat_mu) +REFL_FIELD( grammar) +REFL_FIELD( parsed_grammar) +//REFL_FIELD( prev) // TODO fixme has null data +//REFL_FIELD( cur) +REFL_END + +REFL_TYPE(llama_token_data ) +REFL_END + + +REFL_TYPE(llama_token_data_array ) +REFL_END + +REFL_TYPE(llama_batch ) +REFL_END + + +REFL_TYPE(ggml_object) + REFL_FIELD(offs) +REFL_END + +REFL_TYPE(ggml_tensor) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(ggml_cplan) + REFL_FIELD(work_size) +REFL_END + +REFL_TYPE(ggml_hash_set) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(ggml_cgraph) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(ggml_scratch) + REFL_FIELD(offs) +REFL_END + +REFL_TYPE(ggml_compute_params) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(ggml_opt_params) + REFL_FIELD(type) +REFL_END + 
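// REFL_TYPE / REFL_FIELD / REFL_END are refl-cpp registration macros: each block
// declares a type and the subset of members that the runtime2::debug() and
// print_fields() helpers defined at the end of this file may walk; a block with no
// REFL_FIELD entries still makes the type reflectable, just with nothing to print.
// This also appears to be what the "named data type patch" comment above refers to:
// refl-cpp cannot name an anonymous type, hence the newly named nested structs in
// ggml.h (ggml_adam, ggml_lbfgs, ggml_grad, ggml_params, ggml_something).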
+REFL_TYPE(ggml_opt_context) + REFL_FIELD(ctx) +REFL_END + +REFL_TYPE(gguf_init_params) +REFL_END + +REFL_TYPE(ggml_something) + REFL_FIELD(type_name) +REFL_END + +REFL_TYPE(ggml_context) + REFL_FIELD(mem_size) +REFL_FIELD(mem_buffer) +REFL_FIELD(mem_buffer_owned) +REFL_FIELD( no_alloc) +REFL_FIELD( no_alloc_save) +REFL_FIELD( n_objects) +REFL_FIELD( objects_begin) +REFL_FIELD( objects_end) +REFL_FIELD( scratch) +REFL_FIELD( scratch_save) + +REFL_END + +REFL_TYPE(ggml_context_container) + REFL_FIELD(used) + REFL_FIELD(context) +REFL_END + + REFL_TYPE(ggml_numa_node) + REFL_FIELD(cpus) + REFL_FIELD(n_cpus) + REFL_END + + REFL_TYPE(ggml_numa_nodes) + REFL_FIELD(nodes) + REFL_FIELD(n_nodes) + REFL_END + + REFL_TYPE(ggml_state) + REFL_FIELD(contexts) + REFL_FIELD(numa) + REFL_END + + REFL_TYPE(gguf_str) + REFL_FIELD(n) + REFL_FIELD(data) + REFL_END + + REFL_TYPE(ggml_map_custom1_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) + REFL_END + +REFL_TYPE(ggml_map_custom2_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) +REFL_END + +REFL_TYPE(ggml_map_custom3_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) +REFL_END + +REFL_TYPE(hash_map) + REFL_FIELD(set) + REFL_FIELD(vals) +REFL_END +REFL_TYPE(ggml_compute_state_shared) + REFL_FIELD(cgraph) + REFL_FIELD(cplan) +REFL_END +REFL_TYPE(ggml_compute_state) + REFL_FIELD(thrd) + REFL_FIELD(ith) +REFL_END +REFL_TYPE(ggml_lbfgs_iteration_data) + REFL_FIELD(alpha) + REFL_FIELD(ys) +REFL_END + +REFL_TYPE(gguf_kv) + REFL_FIELD(key) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(gguf_header) + REFL_FIELD(magic) + REFL_FIELD(version) +REFL_END + +REFL_TYPE(gguf_tensor_info) + REFL_FIELD(name) + REFL_FIELD(n_dims) +REFL_END + +REFL_TYPE(gguf_context) + REFL_FIELD(header) + REFL_FIELD(kv) +REFL_END + +REFL_TYPE(gguf_buf) + REFL_FIELD(data) + REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_model_params) + REFL_FIELD(n_gpu_layers) +REFL_END +REFL_TYPE(llama_context_params) + REFL_FIELD(seed) +REFL_END +REFL_TYPE(llama_model_quantize_params) + REFL_FIELD(nthread) +REFL_END + +REFL_TYPE(llama_grammar_element) +REFL_END + +REFL_TYPE(llama_timings) + REFL_FIELD(t_start_ms) +REFL_END +REFL_TYPE(llama_beam_view) + REFL_FIELD(tokens) +REFL_END + +REFL_TYPE(llama_beams_state) + REFL_FIELD(beam_views) +REFL_END + +REFL_TYPE(ggml_backend) +REFL_END + +REFL_TYPE(ggml_backend_buffer) +REFL_END + +REFL_TYPE(ggml_allocr) +REFL_END + +REFL_TYPE(ggml_tallocr) +REFL_END + +REFL_TYPE(ggml_gallocr) +REFL_END + + +REFL_TYPE(llama_buffer) +REFL_FIELD(data) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_file) +REFL_FIELD(fp) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_mmap) +REFL_FIELD(addr) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_mlock) + REFL_FIELD(addr) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(llama_state) + REFL_FIELD(log_callback) + REFL_FIELD(log_callback_user_data) + REFL_END + + +REFL_TYPE(llama_hparams) + REFL_FIELD(vocab_only) + REFL_FIELD(n_vocab) + REFL_END + + +REFL_TYPE(llama_cparams) + REFL_FIELD(n_ctx) + REFL_FIELD(n_batch) +REFL_END + +REFL_TYPE(llama_layer) + REFL_FIELD(attn_norm) + REFL_FIELD(attn_norm_b) +REFL_END + +REFL_TYPE(llama_kv_cell) + REFL_FIELD(pos) + REFL_FIELD(delta) +REFL_END + +REFL_TYPE(llama_kv_cache) + REFL_FIELD(has_shift) + REFL_FIELD(head) + REFL_END + +REFL_TYPE(e_model) +REFL_END + +REFL_TYPE(llama_ftype) +REFL_END + +REFL_TYPE(llama_model) + REFL_FIELD(type) + REFL_FIELD(arch) +REFL_FIELD(ftype ) + +REFL_FIELD( name ) + + REFL_FIELD( hparams ) +REFL_FIELD( vocab) + +REFL_FIELD( tok_embd) +REFL_FIELD( pos_embd) +REFL_FIELD( 
tok_norm) +REFL_FIELD( tok_norm_b) + +REFL_FIELD( output_norm) +REFL_FIELD( output_norm_b) +REFL_FIELD( output) + +REFL_FIELD( layers) + +REFL_FIELD( n_gpu_layers) + + REFL_FIELD( gguf_kv) //unordered map + REFL_FIELD( ctx) + REFL_FIELD( buf) + REFL_FIELD( mapping) //std::unique_ptr +REFL_FIELD( mlock_buf) +REFL_FIELD( mlock_mmap) +REFL_FIELD( tensors_by_name) + REFL_FIELD( t_load_us) +REFL_FIELD( t_start_us) + +REFL_END + +REFL_TYPE(llama_vocab) + REFL_END + + REFL_TYPE(grammar_parser::parse_state) + REFL_END + +REFL_TYPE(llama_context) +REFL_FIELD( cparams) +//REFL_FIELD(model) +REFL_FIELD(kv_self) + REFL_FIELD(rng) //random numbers +REFL_FIELD(has_evaluated_once ) +REFL_FIELD( t_start_us) +REFL_FIELD( t_load_us) + REFL_FIELD( t_sample_us ) +REFL_FIELD( t_p_eval_us ) + REFL_FIELD( t_eval_us) +REFL_FIELD( n_sample ) +REFL_FIELD( n_p_eval ) + REFL_FIELD( n_eval ) +//REFL_FIELD( logits) crash +REFL_FIELD( logits_all ) +REFL_FIELD( embedding) +//REFL_FIELD( work_buffer) + REFL_FIELD( buf_compute) + REFL_FIELD( buf_alloc) +REFL_FIELD( alloc ) + +#ifdef GGML_USE_METAL +REFL_FIELD( ctx_metal ) +#endif + +#ifdef GGML_USE_MPI +REFL_FIELD( ctx_mpi ) + +#endif +REFL_END + +REFL_TYPE(llama_model_loader) + REFL_FIELD(n_kv) + REFL_FIELD(n_tensors) +REFL_END + +REFL_TYPE(llm_build_context) +// REFL_FIELD(model) cannot create pointer to reference member ‘llm_build_context::model’ +// REFL_FIELD(hparams) cannot create pointer to reference member ‘llm_build_context::hparams’ +REFL_END + +REFL_TYPE(llm_offload_trie) +REFL_END + +REFL_TYPE(llm_symbol) + REFL_FIELD(prev) +REFL_END + +REFL_TYPE(llm_bigram_spm) +REFL_END + +REFL_TYPE(llm_tokenizer_spm) +REFL_END + +REFL_TYPE(llm_bigram_bpe) +REFL_END + +REFL_TYPE(llm_tokenizer_bpe) +REFL_END + + +REFL_TYPE(fragment_buffer_variant) +REFL_END + + +REFL_TYPE(llama_partial_utf8) + REFL_FIELD(value) + REFL_FIELD(n_remain) +REFL_END + + +REFL_TYPE(llama_grammar) + REFL_FIELD(rules) + REFL_FIELD(stacks) +REFL_END + + +REFL_TYPE(llama_grammar_candidate) + REFL_FIELD(index) + REFL_FIELD(code_points) +REFL_END + + +REFL_TYPE(llama_beam) + REFL_FIELD(tokens) + REFL_FIELD(p) +REFL_END + + +REFL_TYPE(llama_logit_info) +// REFL_FIELD(logits) + REFL_FIELD(n_vocab) +REFL_END + +REFL_TYPE(llama_beam_search_data) + REFL_FIELD(ctx) + REFL_FIELD(n_beams) +REFL_END + + +REFL_TYPE(quantize_state_internal) +// REFL_FIELD(model) + REFL_FIELD(params) +REFL_FIELD( n_attention_wv ) +REFL_FIELD( n_feed_forward_w2 ) + REFL_FIELD( i_attention_wv ) + REFL_FIELD( i_feed_forward_w2 ) +REFL_FIELD( n_k_quantized ) +REFL_FIELD( n_fallback ) + +REFL_END + +REFL_TYPE(llama_data_context) +REFL_END + +REFL_TYPE(llama_data_buffer_context) + REFL_FIELD(ptr) +REFL_END + +REFL_TYPE(llama_data_file_context) + REFL_FIELD(file) +REFL_END + +template +constexpr auto get_value_type_name(const T t) noexcept +{ + return t.value_type; +} + +namespace runtime2 + { + using namespace refl; + using namespace refl::descriptor; + template + void debug(std::basic_ostream& os, const T& value, bool compact = false); + + namespace detail + { + template &>() << std::declval())> + std::true_type is_ostream_printable_test(int); + + template + std::false_type is_ostream_printable_test(...); + + template + constexpr bool is_ostream_printable_v{ decltype(is_ostream_printable_test(0))::value }; + + namespace + { + [[maybe_unused]] int next_depth(int depth) + { + return depth == -1 || depth > 8 + ? 
-1 + : depth + 1; + } + } + + template + void indent(std::basic_ostream& os, int depth) + { + for (int i = 0; i < depth; i++) { + os << " "; + } + } + + template + void debug_impl(std::basic_ostream& os, const T& value, [[maybe_unused]] int depth); + + template + void debug_detailed(std::basic_ostream& os, const T& value, int depth) + { + + using type_descriptor = type_descriptor; + bool compact = depth == -1; + // print type with members enclosed in braces + os << type_descriptor::name << " { "; + if (!compact) os << '\n'; + + constexpr auto readable_members = filter(type_descriptor::members, [](auto member) { return is_readable(member); }); + for_each(readable_members, [&](auto member, [[maybe_unused]] auto index) { + int new_depth = next_depth(depth); + + indent(os, new_depth); + os << get_display_name(member) << " = "; + + if constexpr (util::contains_instance(member.attributes)) { + // use the debug attribute to print + auto debug_attr = util::get_instance(member.attributes); + debug_attr.write(os, value); + } + else { + debug_impl(os, member(value), new_depth); + } + + if (!compact || index + 1 != readable_members.size) { + os << ", "; + } + if (!compact) { + indent(os, depth); + os << '\n'; + } + }); + + if (compact) os << ' '; + indent(os, depth); + os << '}'; + } + + template + void debug_reflectable(std::basic_ostream& os, const T& value, [[maybe_unused]] int depth) + { + using type_descriptor = type_descriptor; + if constexpr (trait::contains_instance_v) { + // use the debug attribute to print + auto debug_attr = util::get_instance(type_descriptor::attributes); + debug_attr.write(os, value); + } + else if constexpr (detail::is_ostream_printable_v) { + // type supports printing natively, just use that + + os << value; + + } + else { + debug_detailed(os, value, depth); + } + } + + template + void debug_container(std::basic_ostream& os, const T& value, int depth) + { + bool compact = depth == -1; + os << "["; + + auto end = value.end(); + for (auto it = value.begin(); it != end; ++it) + { + if (!compact) os << '\n'; + int new_depth = next_depth(depth); + indent(os, new_depth); + + debug_impl(os, *it, new_depth); + if (std::next(it, 1) != end) { + os << ", "; + } + else if (!compact) { + os << '\n'; + } + + } + + indent(os, depth); + os << "]"; + } + + template + void debug_impl(std::basic_ostream& os, const T& value, [[maybe_unused]] int depth) + { + using no_pointer_t = std::remove_pointer_t; + + if constexpr (std::is_same_v) { + os << (value ? "true" : "false"); + } + else if constexpr (std::is_pointer_v && !std::is_void_v && trait::is_reflectable_v) { + if (value == nullptr) { + os << "nullptr"; + } + else { + os << '&'; + debug_impl(os, *value, -1); + } + } + else if constexpr (trait::is_reflectable_v) { + debug_reflectable(os, value, depth); + } + else if constexpr (detail::is_ostream_printable_v) { + os << value; + } + else if constexpr (trait::is_container_v) { + debug_container(os, value, depth); + } + else { + os << "(not printable)"; + } + } + } + + /** + * Writes the debug representation of value to the given std::ostream. + * Calls the function specified by the debug attribute whenever possible, + * before falling back to recursively interating the members and printing them. + * Takes an optional arguments specifying whether to print a compact representation. + * The compact representation contains no newlines. 
+ */ + template + void debug(std::basic_ostream& os, const T& value, [[maybe_unused]] bool compact) + { + static_assert(trait::is_reflectable_v || trait::is_container_v || detail::is_ostream_printable_v, + "Type is not reflectable, not a container of reflectable types and does not support operator<<(std::ostream&, T)!"); + + detail::debug_impl(os, value, compact ? -1 : 0); + } + + /** + * Writes the compact debug representation of the provided values to the given std::ostream. + */ + template + void debug_all(std::basic_ostream& os, const Ts&... values) + { + refl::runtime::debug(os, std::forward_as_tuple(static_cast(values)...), true); + } + + /** + * Writes the debug representation of the provided value to an std::string and returns it. + * Takes an optional arguments specifying whether to print a compact representation. + * The compact representation contains no newlines. + */ + template + std::basic_string debug_str(const T& value, bool compact = false) + { + std::basic_stringstream ss; + debug(ss, value, compact); + return ss.str(); + } + + /** + * Writes the compact debug representation of the provided values to an std::string and returns it. + */ + template + std::basic_string debug_all_str(const Ts&... values) + { + return refl::runtime::debug_str(std::forward_as_tuple(static_cast(values)...), true); + } +} + +// // A generic function to print out the fields of any object +template +void print_fields(const T& t) { + runtime2::debug(std::cout, t); + constexpr auto type = refl::reflect(); + + constexpr auto membertype = refl::member_list(); + + constexpr auto members = get_members(type); + std::cout << "DEBUG Type: " << type.name.c_str() << "\n"; + std::cout << "DEBUG Type2: " << typeid(membertype).name() << "\n"; + std::cout << "DEBUG Type3: " << typeid(members).name() << "\n"; + refl::util::for_each(members, [&](auto member) { + //using member_t = decltype(member::value_type); + //typename type3 = member::value_type; + //typename trait::remove_qualifiers_t::value_type>; + //constexpr auto type2 = refl::reflect(type3); + //std::cout << "Auto:" << foo <<"\n"; + std::cout << "Auto:" << member.name <<"\n"; + //std::cout << "DEBUG Type2: " << typeid(member_t).name() << "\n"; + //std::cout << "DEBUG Type2: " << type2.name.c_str() << "\n"; + }); + std::cout << "\n"; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c8b4bc254f4c6..28f6254630010 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW llama_build_and_test_executable(test-rope.cpp) # dummy executable - not installed -get_filename_component(TEST_TARGET test-c.c NAME_WE) -add_executable(${TEST_TARGET} test-c.c) +get_filename_component(TEST_TARGET test-c.cpp NAME_WE) +add_executable(${TEST_TARGET} test-c.cpp) target_link_libraries(${TEST_TARGET} PRIVATE llama) diff --git a/tests/test-c.c b/tests/test-c.cpp similarity index 100% rename from tests/test-c.c rename to tests/test-c.cpp
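To illustrate how the reflection-based printing added in print.hpp could be exercised, here is a minimal standalone sketch. It assumes refl-cpp is on the include path and that print.hpp builds against the internal headers added above; the file name print_demo.cpp and the choice of ggml_context as the printed type are illustrative only and not part of the patch.

// print_demo.cpp -- illustrative sketch only, not part of the patch
#include <iostream>
#include "print.hpp"   // pulls in llama.h, ggml-internal.hpp, llama-internal.hpp and the REFL_* registrations

int main() {
    // Allocate a small ggml context; ggml_init() now also fills the runtime
    // type tables (type_traits_init, GGUF_TYPE_SIZE_init, GGUF_TYPE_NAME_init).
    struct ggml_init_params params = {
        /* mem_size   */ 16u * 1024u * 1024u,
        /* mem_buffer */ nullptr,
        /* no_alloc   */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // print_fields() walks the members registered with REFL_TYPE/REFL_FIELD
    // (mem_size, mem_buffer, n_objects, ...) and dumps them to stdout.
    print_fields(*ctx);

    ggml_free(ctx);
    return 0;
}

The same print_fields() call can be pointed at any other type registered in print.hpp (for example gpt_params), which is the point of routing all REFL_* registrations through a single header.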