diff --git a/.gitignore b/.gitignore index 41259a12f50cb..f8a2a2dae5902 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,6 @@ tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe +/#llama.cpp# +#* +\\#* diff --git a/CMakeLists.txt b/CMakeLists.txt index f32df5fe52335..839aad003ca32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,7 +104,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" # Compile flags # -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) @@ -230,7 +230,12 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) - add_compile_definitions(GGML_USE_OPENBLAS) + + # from https://github.com/NVIDIA/cutlass + make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp") + set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags}) + + # add_compile_definitions(GGML_USE_OPENBLAS) if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) add_compile_definitions(GGML_BLAS_USE_MKL) endif() @@ -312,7 +317,7 @@ if (LLAMA_MPI) if (MPI_C_FOUND) message(STATUS "MPI found") set(GGML_HEADERS_MPI ggml-mpi.h) - set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h) + set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h) add_compile_definitions(GGML_USE_MPI) add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) if (NOT MSVC) @@ -438,6 +443,9 @@ if (NOT cuda_host_flags STREQUAL "") set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags}) endif() +# +set(cuda_flags --verbose -G ${cuda_flags}) + add_compile_options("$<$:${cuda_flags}>") if (WIN32) @@ -485,8 +493,10 @@ if (NOT MSVC) add_link_options(-static-libgcc -static-libstdc++) endif() endif() + add_link_options("-Wl,-Map=${TARGET}.map") + if (LLAMA_GPROF) - add_compile_options(-pg) + add_compile_options(-pg) endif() endif() @@ -645,13 +655,16 @@ if (GGML_USE_CPU_HBM) endif() add_library(ggml OBJECT - ggml.c + ggml.cpp ggml.h - ggml-alloc.c + print.hpp + ggml-internal.hpp + llama-internal.hpp + ggml-alloc.cpp ggml-alloc.h - ggml-backend.c + ggml-backend.cpp ggml-backend.h - ggml-quants.c + ggml-quants.cpp ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} @@ -683,7 +696,7 @@ add_library(llama ) target_include_directories(llama PUBLIC .) -target_compile_features(llama PUBLIC cxx_std_11) # don't bump +target_compile_features(llama PUBLIC cxx_std_20) # don't bump target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS} diff --git a/Makefile b/Makefile index a6d2c2ec0f380..3fe2af3d8beef 100644 --- a/Makefile +++ b/Makefile @@ -116,7 +116,7 @@ endif # keep standard at C11 and C++11 MK_CPPFLAGS = -I. -Icommon MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++11 -fPIC +MK_CXXFLAGS = -std=c++20 -fPIC -fpermissive -DCPP_ONLY # -Ofast tends to produce faster code, but may not be available for some compilers. 
ifdef LLAMA_FAST @@ -502,7 +502,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h endif # LLAMA_METAL ifdef LLAMA_MPI -ggml-mpi.o: ggml-mpi.c ggml-mpi.h +ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI @@ -537,17 +537,17 @@ $(info ) # Build library # -ggml.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml.o: ggml.cpp ggml.h ggml-cuda.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h + $(CXX) $(CXXFLAGS) -c $< -o $@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o @@ -734,5 +734,5 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-c.o: tests/test-c.c llama.h - $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ +tests/test-c.o: tests/test-c.cpp llama.h + $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ diff --git a/binding.py b/binding.py new file mode 100644 index 0000000000000..217dce6846340 --- /dev/null +++ b/binding.py @@ -0,0 +1,337 @@ +import os +import json +import re +import clang.cindex + +# configurable part + +CLANG_VERSION='13.0.1' +# homebrew installs for llvm (brew info llvm gives details): +# x64: /usr/local/opt/llvm/lib +# arm64: /opt/homebrew/opt/llvm/lib +llvmLibPath = "/usr/lib/llvm-15/lib/" + +cxxClientRoot = "/home/mdupont/experiments/llama.cpp/" + +fileList = [ +# "ggml.cpp", +# "llama.cpp", + "examples/server/server.cpp", +] + +typeList = [ +] + +# end of configurable part + +clang.cindex.Config.set_library_path(llvmLibPath) + + +def list_headers_in_dir(path): + # enumerates a folder but keeps the full pathing for the files returned + # and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx) + + # list all the files in the folder + files = os.listdir(path) + # only include .hxx files + files = list(filter(lambda x: x.endswith('.hxx'), files)) + # add the folder path back on + files = list(map(lambda x: path + x, files)) + return files + + +# parse through the list of files specified and expand wildcards +fullFileList = [] +for filePath in fileList: + if "*" in filePath: + # wildcard path + basePath = filePath[:-1] + if "*" in basePath: + # if there is still a wildcard, we have an issue... + raise NotImplementedError( + "wildcard only supported at end of file path") + files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath)) + fullFileList = fullFileList + files + else: + # normal path + ff = os.path.join(cxxClientRoot, filePath) + fullFileList.append(ff) + print("DBUG",ff) +# exclude _json.hxx files +fullFileList = list( + filter(lambda x: not x.endswith('_json.hxx'), fullFileList)) +# exclude _fmt.hxx files +fullFileList = list( + filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList)) + + +# generate a list of regexps from the type list (for handling wildcards) +typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList)) + + +def is_included_type(name, with_durability=False): + + # TODO(brett19): This should be generalized somehow... 
+ if "is_compound_operation" in name: + return False + + if "replica_context" in name: + return False + + if with_durability is True and '_with_legacy_durability' not in name: + return False + + for x in typeListRe: + if re.fullmatch(x, name): + return True + return False + + +opTypes = [] +opEnums = [] + + +def parse_type(type): + typeStr = type.get_canonical().spelling + return parse_type_str(typeStr) + +std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"] + +def parse_type_str(typeStr): + if typeStr == "std::mutex": + return {"name": "std::mutex"} + if typeStr == "std::string": + return {"name": "std::string"} + if typeStr == "std::chrono::duration": + return {"name": "std::chrono::seconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::milliseconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::microseconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::nanoseconds"} + if typeStr == "std::error_code": + return {"name": "std::error_code"} + if typeStr == "std::monostate": + return {"name": "std::monostate"} + if typeStr == "std::byte": + return {"name": "std::byte"} + if typeStr == "unsigned long": + return {"name": "std::size_t"} + if typeStr == "char": + return {"name": "std::int8_t"} + if typeStr == "unsigned char": + return {"name": "std::uint8_t"} + if typeStr == "short": + return {"name": "std::int16_t"} + if typeStr == "unsigned short": + return {"name": "std::uint16_t"} + if typeStr == "int": + return {"name": "std::int32_t"} + if typeStr == "unsigned int": + return {"name": "std::uint32_t"} + if typeStr == "long long": + return {"name": "std::int64_t"} + if typeStr == "unsigned long long": + return {"name": "std::uint64_t"} + if typeStr == "bool": + return {"name": "std::bool"} + if typeStr == "float": + return {"name": "std::float"} + if typeStr == "double": + return {"name": "std::double"} + if typeStr == "std::nullptr_t": + return {"name": "std::nullptr_t"} + if typeStr in std_comparators: + return {"name": typeStr} + + tplParts = typeStr.split("<", 1) + if len(tplParts) > 1: + tplClassName = tplParts[0] + tplParams = tplParts[1][:-1] + if tplClassName == "std::function": + return { + "name": "std::function" + } + if tplClassName == "std::optional": + return { + "name": "std::optional", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::vector": + return { + "name": "std::vector", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::set": + return { + "name": "std::set", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::variant": + variantParts = tplParams.split(", ") + variantTypes = [] + for variantPart in variantParts: + variantTypes.append(parse_type_str(variantPart)) + return { + "name": "std::variant", + "of": variantTypes + } + if tplClassName == "std::array": + variantParts = tplParams.split(", ") + if len(variantParts) != 2: + print("FAILED TO PARSE ARRAY TYPES: " + typeStr) + return {"name": "unknown", "str": typeStr} + return { + "name": "std::array", + "of": parse_type_str(variantParts[0]), + "size": int(variantParts[1]) + } + if tplClassName == "std::map": + variantParts = tplParams.split(", ") + if len(variantParts) < 2 or len(variantParts) > 3: + print("FAILED TO PARSE MAP TYPES: " + typeStr) + return {"name": "unknown", "str": typeStr} + + if len(variantParts) == 2: + return { + "name": "std::map", + "of": parse_type_str(variantParts[0]), + "to": parse_type_str(variantParts[1]) + } + else: + 
return { + "name": "std::map", + "of": parse_type_str(variantParts[0]), + "to": parse_type_str(variantParts[1]), + "comparator": parse_type_str(variantParts[2]) + } + + if tplClassName == "std::shared_ptr": + return { + "name": "std::shared_ptr", + "of": parse_type_str(tplParams) + } + + #return {"name": "unknown", "str": typeStr} + + if 'unnamed struct' in typeStr: + print("WARNING: Found unnamed struct: " + typeStr) + + return {"name": typeStr} + +internal_structs = [] +UNNAMED_STRUCT_DELIM = '::(unnamed struct' + +def traverse(node, namespace, main_file): + # only scan the elements of the file we parsed + + + if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL: + fullStructName = "::".join([*namespace, node.displayname]) + print("#FILE", node.location.file ) + print("REFL_TYPE(" + fullStructName + ")") + + structFields = [] + for child in node.get_children(): + if child.kind == clang.cindex.CursorKind.FIELD_DECL: + struct_type = parse_type(child.type) + type_str = child.type.get_canonical().spelling + print(" REFL_FIELD(" + child.displayname + ")") + if 'unnamed' in type_str: + name_tokens = type_str.split('::') + name_override = '::'.join(name_tokens[:-1] + [child.displayname]) + struct_type['name'] = name_override + internal_structs.append(name_override) + + structFields.append({ + "name": child.displayname, + "type": struct_type, + }) + # replica read changes introduced duplicate get requests + #if any(map(lambda op: op['name'] == fullStructName, opTypes)): + # return + + #opTypes.append({ + # "name": fullStructName, + # "fields": structFields, + #}) + print("REFL_END") + + + if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL: + fullStructName = "::".join([*namespace, node.displayname]) + if is_included_type(fullStructName, with_durability=True): + type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None) + if type_ref: + base_request_name = type_ref.displayname.replace('struct', '').strip() + base_request = next((op for op in opTypes if op['name'] == base_request_name), None) + if base_request: + new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level'] + new_fields.extend([ + {"name":"persist_to", "type":{"name":"couchbase::persist_to"}}, + {"name":"replicate_to", "type":{"name":"couchbase::replicate_to"}} + ]) + + opTypes.append({ + "name": fullStructName, + "fields": new_fields + }) + if node.kind == clang.cindex.CursorKind.ENUM_DECL: + fullEnumName = "::".join([*namespace, node.displayname]) + if is_included_type(fullEnumName): + enumValues = [] + + for child in node.get_children(): + if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL: + enumValues.append({ + "name": child.displayname, + "value": child.enum_value, + }) + opEnums.append({ + "name": fullEnumName, + "type": parse_type(node.enum_type), + "values": enumValues, + }) + + if node.kind == clang.cindex.CursorKind.NAMESPACE: + namespace = [*namespace, node.displayname] + if node.kind == clang.cindex.CursorKind.CLASS_DECL: + namespace = [*namespace, node.displayname] + if node.kind == clang.cindex.CursorKind.STRUCT_DECL: + namespace = [*namespace, node.displayname] + + for child in node.get_children(): + traverse(child, namespace, main_file) + +for headerPath in fullFileList: + print("processing " + headerPath) + index = clang.cindex.Index.create() + args = [ + '-std=c++17', + ] + + try: + translation_unit = index.parse(headerPath, args=args) + except Exception as e: + print(e) + import pdb 
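For reference, binding.py's traverse() only prints text: for every struct or class it visits it emits a refl-cpp registration block. A trimmed illustration of what it produces for the server_params struct that appears later in this diff (the same block that was then pasted into server.cpp, abridged here to two of the five fields):

    REFL_TYPE(server_params)
        REFL_FIELD(hostname)
        REFL_FIELD(port)
    REFL_END
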
+ pdb.set_trace() + raise e + + # output clang compiler diagnostics information (for debugging) + + for diagnostic in translation_unit.diagnostics: + diagnosticMsg = diagnostic.format() + print(diagnostic) + + traverse(translation_unit.cursor, [], headerPath) + +jsonData = json.dumps({ + 'op_structs': opTypes, + 'op_enums': opEnums +}) + +f = open("bindings.json", "w") +f.write(jsonData) +f.close() diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 31ec8cade19be..18d2d03c09196 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -31,6 +31,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#include "print.hpp" + static llama_context ** g_ctx; static llama_model ** g_model; static gpt_params * g_params; @@ -99,6 +101,7 @@ static void sigint_handler(int signo) { } } #endif +using namespace refl; int main(int argc, char ** argv) { gpt_params params; @@ -117,7 +120,8 @@ int main(int argc, char ** argv) { // TODO: Dump params ? //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); - + print_fields(params); + // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); @@ -234,6 +238,8 @@ int main(int argc, char ** argv) { std::vector embd_inp; + print_fields(*model); + if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); if (params.chatml) { @@ -277,7 +283,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } - + print_fields(*ctx); + //print_fields(session_tokens); // debug message about similarity of saved session, if applicable size_t n_matching_session_tokens = 0; if (!session_tokens.empty()) { @@ -365,6 +372,10 @@ int main(int argc, char ** argv) { for (int i = 0; i < (int) guidance_inp.size(); i++) { LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } + + print_fields(*ctx_guidance); + + } if (params.n_keep > 0) { @@ -473,7 +484,8 @@ int main(int argc, char ** argv) { std::vector embd_guidance; struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - + print_fields(*ctx_sampling); + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (!embd.empty()) { @@ -508,6 +520,7 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); + print_fields(*ctx); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); @@ -624,7 +637,7 @@ int main(int argc, char ** argv) { } const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - + //print_fields(id); llama_sampling_accept(ctx_sampling, ctx, id, true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 50f124b13e849..a42bba9b6d54f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -24,6 +24,7 @@ #include #include #include +#include "print.hpp" #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -33,6 +34,9 @@ using json = nlohmann::json; +REFL_TYPE(std::less< ::nlohmann::detail::value_t>) +REFL_END + struct server_params { std::string 
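print.hpp itself is not part of this diff, so the following is only a sketch of the shape such a helper could take on top of the REFL_TYPE/REFL_FIELD registrations; the actual header may differ. Assumed pieces: refl.hpp as the refl-cpp single header, and fields that are std::ostream-printable.

    #include <iostream>
    #include "refl.hpp"   // refl-cpp single header (assumed include name)

    template <typename T>
    void print_fields(const T & obj) {
        // walk every member registered via REFL_TYPE/REFL_FIELD/REFL_END
        refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
            if constexpr (refl::descriptor::is_field(member)) {
                // member(obj) reads the field; this sketch only handles streamable types
                std::cout << member.name << " = " << member(obj) << "\n";
            }
        });
    }
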
hostname = "127.0.0.1"; @@ -41,6 +45,13 @@ struct server_params int32_t read_timeout = 600; int32_t write_timeout = 600; }; +REFL_TYPE(server_params) + REFL_FIELD(hostname) + REFL_FIELD(public_path) + REFL_FIELD(port) + REFL_FIELD(read_timeout) + REFL_FIELD(write_timeout) +REFL_END static bool server_verbose = false; @@ -157,6 +168,15 @@ struct task_server { bool embedding_mode = false; }; +REFL_TYPE(task_server) + REFL_FIELD(id) + REFL_FIELD(target_id) + REFL_FIELD(type) + REFL_FIELD(data) + REFL_FIELD(infill_mode) + REFL_FIELD(embedding_mode) +REFL_END + struct task_result { int id; bool stop; @@ -193,6 +213,18 @@ struct slot_params json input_suffix; }; +REFL_TYPE(slot_params) + REFL_FIELD(stream) + REFL_FIELD(cache_prompt) + REFL_FIELD(seed) + REFL_FIELD(n_keep) + REFL_FIELD(n_predict) + REFL_FIELD(antiprompt) + REFL_FIELD(input_prefix) + REFL_FIELD(input_suffix) +REFL_END + + struct slot_image { int32_t id; @@ -220,6 +252,17 @@ struct completion_token_output std::string text_to_send; }; +REFL_TYPE(completion_token_output) + REFL_FIELD(probs) + REFL_FIELD(tok) + REFL_FIELD(text_to_send) +REFL_END + +REFL_TYPE(completion_token_output::token_prob) + REFL_FIELD(tok) + REFL_FIELD(prob) +REFL_END + static size_t common_part(const std::vector &a, const std::vector &b) { size_t i; @@ -496,6 +539,51 @@ struct llama_client_slot } }; +//REFL_TYPE(llama_client_slot::llama_sampling_params) +//REFL_END + +REFL_TYPE(llama_client_slot) + REFL_FIELD(id) + REFL_FIELD(task_id) + REFL_FIELD(params) + REFL_FIELD(state) + REFL_FIELD(command) + REFL_FIELD(t_last_used) + REFL_FIELD(n_ctx) + REFL_FIELD(n_past) + REFL_FIELD(n_decoded) + REFL_FIELD(n_remaining) + REFL_FIELD(i_batch) + REFL_FIELD(num_prompt_tokens) + REFL_FIELD(num_prompt_tokens_processed) + REFL_FIELD(multibyte_pending) + REFL_FIELD(prompt) + REFL_FIELD(generated_text) + REFL_FIELD(sampled) + REFL_FIELD(cache_tokens) + REFL_FIELD(generated_token_probs) + REFL_FIELD(infill) + REFL_FIELD(embedding) + REFL_FIELD(has_next_token) + REFL_FIELD(truncated) + REFL_FIELD(stopped_eos) + REFL_FIELD(stopped_word) + REFL_FIELD(stopped_limit) + REFL_FIELD(oaicompat) + REFL_FIELD(oaicompat_model) + REFL_FIELD(stopping_word) + REFL_FIELD(sparams) + REFL_FIELD(ctx_sampling) + REFL_FIELD(images) + REFL_FIELD(sent_count) + REFL_FIELD(sent_token_probs_index) + REFL_FIELD(t_start_process_prompt) + REFL_FIELD(t_start_genereration) + REFL_FIELD(t_prompt_processing) + REFL_FIELD(t_token_generation) +REFL_END + + struct llama_server_context { llama_model *model = nullptr; @@ -878,7 +966,7 @@ struct llama_server_context all_slots_are_idle = false; LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); - + print_fields(*slot); return true; } @@ -1787,6 +1875,31 @@ struct llama_server_context } }; +REFL_TYPE(llama_server_context) + REFL_FIELD(model) + REFL_FIELD(ctx) + REFL_FIELD(clp_ctx) + REFL_FIELD(params) + REFL_FIELD(batch) + REFL_FIELD(multimodal) + REFL_FIELD(clean_kv_cache) + REFL_FIELD(all_slots_are_idle) + REFL_FIELD(add_bos_token) + REFL_FIELD(id_gen) + REFL_FIELD(n_ctx) + REFL_FIELD(system_need_update) + REFL_FIELD(system_prompt) + REFL_FIELD(system_tokens) + REFL_FIELD(name_user) + REFL_FIELD(name_assistant) + REFL_FIELD(slots) + REFL_FIELD(queue_tasks) + REFL_FIELD(queue_results) + REFL_FIELD(mutex_tasks) + REFL_FIELD(mutex_results) +REFL_END + + static void server_print_usage(const char *argv0, const gpt_params ¶ms, const server_params &sparams) { @@ -2497,6 +2610,11 @@ struct token_translator std::string operator()(const 
completion_token_output &cto) const { return (*this)(cto.tok); } }; + +REFL_TYPE(token_translator) + REFL_FIELD(ctx) +REFL_END + static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot) { auto & gtps = slot->generated_token_probs; diff --git a/ggml-alloc.c b/ggml-alloc.cpp similarity index 98% rename from ggml-alloc.c rename to ggml-alloc.cpp index cdfe4caf69613..46f4c9bd73d45 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.cpp @@ -386,7 +386,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) { free(galloc->parse_seq); - galloc->parse_seq = malloc(sizeof(int) * n); + galloc->parse_seq = (int*)malloc(sizeof(int) * n); for (int i = 0; i < n; i++) { galloc->parse_seq[i] = list[i]; @@ -646,9 +646,9 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st if (galloc->hash_values != NULL) { free(galloc->hash_values); } - galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size); + galloc->hash_set.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * hash_size); galloc->hash_set.size = hash_size; - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size); } // reset hash table @@ -674,7 +674,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap // alloc hash_values if needed if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) { free(galloc->hash_values); - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_values = (hash_node*)malloc(sizeof(struct hash_node) * hash_size); galloc->hash_values_size = hash_size; } diff --git a/ggml-backend.c b/ggml-backend.cpp similarity index 96% rename from ggml-backend.c rename to ggml-backend.cpp index f6e5fceed0f4d..47b60cb1e284e 100644 --- a/ggml-backend.c +++ b/ggml-backend.cpp @@ -20,7 +20,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init( struct ggml_backend_buffer_i iface, ggml_backend_buffer_context_t context, size_t size) { - ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); + ggml_backend_buffer_t buffer = (ggml_backend_buffer*)malloc(sizeof(struct ggml_backend_buffer)); GGML_ASSERT(iface.get_base != NULL); @@ -195,9 +195,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst // TODO: allow backends to support copy to/from same backend if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) { - ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst); + ggml_get_backend(dst)->iface.cpy_tensor_from((ggml_backend_t)ggml_get_backend(dst)->context, src, dst); } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) { - ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst); + ggml_get_backend(src)->iface.cpy_tensor_to((ggml_backend_t)ggml_get_backend(src)->context, src, dst); } else { // shouldn't be hit when copying from/to CPU #ifndef NDEBUG @@ -316,13 +316,13 @@ struct ggml_backend_plan_cpu { static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); + struct ggml_backend_plan_cpu * cpu_plan = 
(ggml_backend_plan_cpu*)malloc(sizeof(struct ggml_backend_plan_cpu)); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); cpu_plan->cgraph = *cgraph; if (cpu_plan->cplan.work_size > 0) { - cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + cpu_plan->cplan.work_data = (uint8_t*)malloc(cpu_plan->cplan.work_size); } return cpu_plan; @@ -356,7 +356,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c cpu_ctx->work_size = cplan.work_size; } - cplan.work_data = cpu_ctx->work_data; + cplan.work_data = (uint8_t*)cpu_ctx->work_data; ggml_graph_compute(cgraph, &cplan); } @@ -385,13 +385,13 @@ static struct ggml_backend_i cpu_backend_i = { }; ggml_backend_t ggml_backend_cpu_init(void) { - struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + struct ggml_backend_cpu_context * ctx = (ggml_backend_cpu_context*)malloc(sizeof(struct ggml_backend_cpu_context)); ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->work_data = NULL; ctx->work_size = 0; - ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + ggml_backend_t cpu_backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend)); *cpu_backend = (struct ggml_backend) { /* .interface = */ cpu_backend_i, @@ -869,7 +869,7 @@ static void sched_reset(ggml_backend_sched_t sched) { ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) { GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS); - struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched)); + struct ggml_backend_sched * sched = (ggml_backend_sched*)malloc(sizeof(struct ggml_backend_sched)); memset(sched, 0, sizeof(struct ggml_backend_sched)); fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024); @@ -907,9 +907,9 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr // initialize hash tables size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS; sched->hash_set.size = hash_size; - sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size); - sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size); - sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size); + sched->hash_set.keys = (ggml_tensor**)malloc(sizeof(sched->hash_set.keys[0]) * hash_size); + sched->node_talloc = (ggml_tallocr**)malloc(sizeof(sched->node_talloc[0]) * hash_size); + sched->node_copies = (ggml_tensor *(*)[4])malloc(sizeof(sched->node_copies[0]) * hash_size); sched_split_graph(sched, measure_graph); sched_alloc_splits(sched); diff --git a/ggml-impl.h b/ggml-impl.h index 06c07339e9269..1bf20a4af3985 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -22,7 +22,7 @@ extern "C" { #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #define static_assert(cond, msg) _Static_assert(cond, msg) #else -#define static_assert(cond, msg) struct global_scope_noop_trick + //#define static_assert(cond, msg) struct global_scope_noop_trick #endif #endif diff --git a/ggml-internal.hpp b/ggml-internal.hpp new file mode 100644 index 0000000000000..0725451fcbd3e --- /dev/null +++ b/ggml-internal.hpp @@ -0,0 +1,258 @@ +struct ggml_context { + size_t mem_size; + void * mem_buffer; + bool mem_buffer_owned; + bool no_alloc; + bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers + + int n_objects; + + struct ggml_object * objects_begin; + struct ggml_object * objects_end; + + struct ggml_scratch scratch; + struct 
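A side effect of renaming the .c files to .cpp: C++ has no implicit conversion from void * to other object pointer types, which is why every malloc() call in the renamed files gains an explicit cast while the allocation itself stays the same, e.g. from ggml-alloc.cpp above:

    // fine in C, rejected by a C++ compiler ("invalid conversion from 'void*' to 'hash_node*'"):
    //   galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);

    // what the diff does instead:
    galloc->hash_values = (hash_node *) malloc(sizeof(struct hash_node) * hash_size);
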
ggml_scratch scratch_save; + + ggml_context(): + mem_size(0), + mem_buffer(0), + mem_buffer_owned(0), + no_alloc(0), + no_alloc_save(0), + n_objects(0), + objects_begin(0), + objects_end(0), + scratch(), + scratch_save() + { + + } +}; + +struct ggml_context_container { + bool used; + + struct ggml_context context; + + ggml_context_container(): used(0),context(){ + + } +}; + +typedef double ggml_float; +typedef void * thread_ret_t; + +#define MAX_FREE_BLOCKS 256 + +struct free_block { + void * addr; + size_t size; +}; + +struct ggml_tallocr { + struct ggml_backend_buffer * buffer; + bool buffer_owned; + void * base; + size_t alignment; + + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; + + size_t max_size; + + bool measure; + +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; +#endif +}; + + +struct hash_node { + int n_children; + int n_views; +}; + +typedef struct ggml_tallocr * ggml_tallocr_t; +typedef struct ggml_gallocr * ggml_gallocr_t; + +struct ggml_gallocr { + ggml_tallocr_t talloc; + struct ggml_hash_set hash_set; + struct hash_node * hash_values; + size_t hash_values_size; + ggml_tallocr_t * hash_allocs; + int * parse_seq; + int parse_seq_len; +}; + +struct ggml_allocr { + ggml_tallocr_t talloc; + ggml_gallocr_t galloc; +}; + +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 + +struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_numa_nodes { + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system +}; + +struct ggml_state { + struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; + struct ggml_numa_nodes numa; + + ggml_state():contexts(), numa() + { + + } +}; + +struct gguf_str { + uint64_t n; // GGUFv2 + char * data; +}; + +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; +struct hash_map { + struct ggml_hash_set set; + struct ggml_tensor ** vals; +}; + +#if defined(_WIN32) +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; +#else +#include +using namespace std; +#endif + +struct ggml_compute_state_shared { + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; +}; +typedef pthread_t ggml_thread_t; +struct ggml_compute_state { + ggml_thread_t thrd; + int ith; + struct ggml_compute_state_shared * shared; +}; + +union gguf_value { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + uint64_t uint64; + int64_t int64; + double float64; + bool bool_; + + struct gguf_str str; + + struct gguf_array_T { + enum gguf_type type; + + uint64_t n; // GGUFv2 + void * data; + } arr; +}; + +struct ggml_lbfgs_iteration_data { + float alpha; + float ys; + float * s; + float * y; +}; + +struct gguf_kv { + struct gguf_str key; + + enum gguf_type type; + union gguf_value value; +}; + + + +struct gguf_header { + char 
magic[4]; + uint32_t version; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 +}; + +struct gguf_tensor_info { + struct gguf_str name; + + uint32_t n_dims; + uint64_t ne[GGML_MAX_DIMS]; + + enum ggml_type type; + + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` + + // for writing API + const void * data; + size_t size; +}; + +struct gguf_context { + struct gguf_header header; + + struct gguf_kv * kv; + struct gguf_tensor_info * infos; + + size_t alignment; + size_t offset; // offset of `data` from beginning of file + size_t size; // size of `data` in bytes + + //uint8_t * padding; + void * data; +}; + +struct gguf_buf { + void * data; + size_t size; + size_t offset; +}; + + +#include "ggml-backend-impl.h" diff --git a/ggml-mpi.c b/ggml-mpi.cpp similarity index 100% rename from ggml-mpi.c rename to ggml-mpi.cpp diff --git a/ggml-quants.c b/ggml-quants.cpp similarity index 93% rename from ggml-quants.c rename to ggml-quants.cpp index 7285d5f7fbcc0..e49189394e4ac 100644 --- a/ggml-quants.c +++ b/ggml-quants.cpp @@ -5,7 +5,7 @@ #include #include #include - +#include #ifdef __ARM_NEON // if YCM cannot find , make a symbolic link to it, for example: @@ -425,7 +425,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif // reference implementation for deterministic creation of model files -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { +void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -462,11 +462,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict } } -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_0_reference(x, y, k); +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q4_0_reference(x, (block_q4_0*)y, k); } -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { +void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k) { const int qk = QK4_1; assert(k % qk == 0); @@ -503,11 +503,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict } } -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_1_reference(x, y, k); +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q4_1_reference(x, (block_q4_1*)y, k); } -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { +void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -551,11 +551,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict } } -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_0_reference(x, y, k); +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q5_0_reference(x, (block_q5_0*)y, k); } -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { +void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k) { const int qk = QK5_1; assert(k % qk == 0); @@ -599,12 +599,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * 
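The gguf_header struct moved into ggml-internal.hpp mirrors the fixed preamble of a GGUF file: 4-byte magic "GGUF", then version, tensor count and key/value count. A minimal reading sketch, assuming the gguf_header from ggml-internal.hpp above is in scope and the file is little-endian:

    #include <cstdio>

    // Reads the GGUF preamble field by field, avoiding any dependence on struct padding.
    static bool read_gguf_header(FILE * f, gguf_header & h) {
        return fread( h.magic,     1, sizeof(h.magic),     f) == sizeof(h.magic)      // "GGUF"
            && fread(&h.version,   1, sizeof(h.version),   f) == sizeof(h.version)
            && fread(&h.n_tensors, 1, sizeof(h.n_tensors), f) == sizeof(h.n_tensors)
            && fread(&h.n_kv,      1, sizeof(h.n_kv),      f) == sizeof(h.n_kv);
    }
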
restrict } } -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { - quantize_row_q5_1_reference(x, y, k); +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q5_1_reference(x, (block_q5_1*)y, k); } // reference implementation for deterministic creation of model files -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { +void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -629,12 +629,12 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict } } -void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0 * restrict y = vy; + block_q8_0 * GGML_RESTRICT y = (block_q8_0*)vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -818,7 +818,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { } // reference implementation for deterministic creation of model files -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { +void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k) { assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -853,11 +853,11 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict } } -void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; - block_q8_1 * restrict y = vy; + block_q8_1 * GGML_RESTRICT y = (block_q8_1*)vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1071,7 +1071,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { #endif } -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -1091,7 +1091,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int } } -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK4_1; assert(k % qk == 0); @@ -1112,7 +1112,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int } } -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -1138,7 +1138,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int } } -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK5_1; assert(k % qk == 0); @@ -1165,7 +1165,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int } } -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q8_0(const 
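restrict is a C-only keyword, so compiling ggml-quants as C++ requires the GGML_RESTRICT spelling used throughout this file. The macro's definition is not shown in this diff; a typical definition (an assumption, not necessarily what this branch uses) would be:

    // Assumed definition: map to the compiler extension under C++ (GCC/Clang spell it
    // __restrict__, MSVC __restrict), keep the standard keyword under C. Expanding to
    // nothing would also compile, at the cost of losing the aliasing hint.
    #ifdef __cplusplus
    #    define GGML_RESTRICT __restrict__
    #else
    #    define GGML_RESTRICT restrict
    #endif
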
block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK8_0; assert(k % qk == 0); @@ -1195,7 +1195,7 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { +static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1259,7 +1259,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1318,7 +1318,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, int ntry, float alpha) { float min = x[0]; float max = x[0]; @@ -1361,8 +1361,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -1443,7 +1443,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f } #if QK_K == 256 -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -1455,7 +1455,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { +void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1532,7 +1532,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1578,15 +1578,15 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int } } -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { - quantize_row_q2_K_reference(x, vy, k); +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { + quantize_row_q2_K_reference(x, (block_q2_K*)vy, k); } -size_t ggml_quantize_q2_K(const float * restrict src, void 
* restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + block_q2_K * GGML_RESTRICT y = (block_q2_K *)dst + j/QK_K; quantize_row_q2_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q2_K)); @@ -1594,7 +1594,7 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n //========================= 3-bit (de)-quantization -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { +void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1708,7 +1708,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } #if QK_K == 256 -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1722,8 +1722,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * GGML_RESTRICT q = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; uint8_t m = 1; memcpy(aux, x[i].scales, 12); @@ -1758,7 +1758,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } } #else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); assert(QK_K == 64); const int nb = k / QK_K; @@ -1767,8 +1767,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * GGML_RESTRICT q = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); @@ -1791,15 +1791,15 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } #endif -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { - quantize_row_q3_K_reference(x, vy, k); +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { + quantize_row_q3_K_reference(x, (block_q3_K*)vy, k); } -size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + block_q3_K * GGML_RESTRICT y = (block_q3_K *)dst + j/QK_K; quantize_row_q3_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q3_K)); @@ -1807,7 +1807,7 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { +void 
quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1914,7 +1914,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1953,18 +1953,18 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int } } -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q4_K * restrict y = vy; + block_q4_K * GGML_RESTRICT y = (block_q4_K*)vy; quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + block_q4_K * GGML_RESTRICT y = (block_q4_K *)dst + j/QK_K; quantize_row_q4_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q4_K)); @@ -1972,7 +1972,7 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { +void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2042,8 +2042,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -2090,8 +2090,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); for (int j = 0; j < 32; ++j) { @@ -2114,7 +2114,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2143,7 +2143,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } #else float d = GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict s = x[i].scales; + const int8_t * GGML_RESTRICT s = x[i].scales; for (int l = 0; l < 8; ++l) { y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 
0 : 16)); @@ -2159,18 +2159,18 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } } -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q5_K * restrict y = vy; + block_q5_K * GGML_RESTRICT y = (block_q5_K*)vy; quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + block_q5_K * GGML_RESTRICT y = (block_q5_K *)dst + j/QK_K; quantize_row_q5_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q5_K)); @@ -2178,7 +2178,7 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { +void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2228,8 +2228,8 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].ql; + uint8_t * GGML_RESTRICT qh = y[i].qh; #if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -2260,7 +2260,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2268,9 +2268,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict ql = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT ql = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT sc = x[i].scales; #if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { @@ -2307,9 +2307,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int } } -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q6_K * restrict y = vy; + block_q6_K * GGML_RESTRICT y = (block_q6_K*)vy; quantize_row_q6_K_reference(x, y, k); } @@ -2318,7 +2318,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + block_q6_K * GGML_RESTRICT y = (block_q6_K *)dst + j/QK_K; quantize_row_q6_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q6_K)); @@ -2326,7 +2326,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * //===================================== Q8_K ============================================== -void quantize_row_q8_K_reference(const float 
* restrict x, block_q8_K * restrict y, int k) { +void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2363,7 +2363,7 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2374,8 +2374,8 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int } } -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { - quantize_row_q8_K_reference(x, y, k); +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { + quantize_row_q8_K_reference(x, (block_q8_K*)y, k); } //===================================== Dot ptoducts ================================= @@ -2423,14 +2423,15 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { + //fprintf(stderr, "%s: n:%d s:%f vx:%p vy:%p\n", __func__, n,*s, vx, vy); const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q4_0 * GGML_RESTRICT x = (const block_q4_0*)vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2439,10 +2440,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2733,14 +2734,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, #endif } -void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_1_q8_1(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q4_1 * GGML_RESTRICT x = (const block_q4_1*)vx; + const block_q8_1 * GGML_RESTRICT y = (const block_q8_1*)vy; // TODO: add WASM SIMD #if defined(__ARM_NEON) @@ -2752,10 +2753,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i + 0]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q4_1 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q4_1 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i + 
0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[i + 1]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; @@ -2893,15 +2894,15 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_0_q8_0(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_0); - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q5_0 * GGML_RESTRICT x = (const block_q5_0*)vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2916,10 +2917,10 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q5_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q5_0 * GGML_RESTRICT x0 = &x[i]; + const block_q5_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3000,8 +3001,8 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q8_0 * restrict y0 = &y[i]; + const block_q5_0 * GGML_RESTRICT x0 = &x[i]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i]; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3199,15 +3200,15 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_1_q8_1(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_1); - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q5_1 * GGML_RESTRICT x = (const block_q5_1*)vx; + const block_q8_1 * GGML_RESTRICT y = (const block_q8_1*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3225,10 +3226,10 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q5_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q5_1 * GGML_RESTRICT x0 = &x[i]; + const block_q5_1 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i]; + const block_q8_1 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3314,8 +3315,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q8_1 * restrict y0 = &y[i]; + const block_q5_1 * GGML_RESTRICT x0 = &x[i]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s; @@ -3518,14 
+3519,14 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q8_0_q8_0(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q8_0 * GGML_RESTRICT x = (const block_q8_0*)vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0*)vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3534,10 +3535,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const int8x16_t x0_0 = vld1q_s8(x0->qs); const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); @@ -3642,10 +3643,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri } #if QK_K == 256 -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * GGML_RESTRICT x = (const block_q2_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -3667,9 +3668,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; const uint8x16_t mins_and_scales = vld1q_u8(sc); const uint8x16_t scales = vandq_u8(mins_and_scales, m4); @@ -3746,8 +3747,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); @@ -3813,8 +3814,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // load mins and scales from block_q2_K.scales[QK_K/16] const __m128i mins_and_scales = _mm_loadu_si128((const 
__m128i*)x[i].scales); @@ -4035,10 +4036,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4061,9 +4062,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4114,8 +4115,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * GGML_RESTRICT db = (const uint8_t *)&ud; + const uint8_t * GGML_RESTRICT mb = (const uint8_t *)&um; float summs = 0; @@ -4126,10 +4127,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4166,8 +4167,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * GGML_RESTRICT db = (const uint8_t *)&ud; + const uint8_t * GGML_RESTRICT mb = (const uint8_t *)&um; float summs = 0; @@ -4178,10 +4179,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4227,9 +4228,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint32_t * GGML_RESTRICT 
sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4311,14 +4312,14 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * GGML_RESTRICT x = (const block_q3_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -4346,9 +4347,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -4454,8 +4455,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); @@ -4559,8 +4560,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales aux = (const uint32_t *)x[i].scales; @@ -4694,9 +4695,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(aux, x[i].scales, 12); utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); @@ -4806,11 +4807,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; @@ -4855,11 +4856,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const 
block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4947,8 +4948,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5018,8 +5019,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5098,8 +5099,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5173,10 +5174,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 8; ++l) { a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4); a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 
0 : 4); @@ -5213,11 +5214,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * GGML_RESTRICT x = (const block_q4_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -5262,8 +5263,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -5334,8 +5335,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri utmp[2] = uaux; utmp[0] &= kmask1; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -5393,8 +5394,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -5494,8 +5495,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vl = 32; @@ -5548,10 +5549,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); a += 32; @@ -5594,11 +5595,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif } #else -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -5618,14 +5619,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sum_mins = 0.f; 
uint16_t aux16[2]; - const uint8_t * restrict scales = (const uint8_t *)aux16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)aux16; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint16_t * restrict a = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT a = (const uint16_t *)x[i].scales; aux16[0] = a[0] & 0x0f0f; aux16[1] = (a[0] >> 4) & 0x0f0f; @@ -5698,8 +5699,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m256i q4l = _mm256_and_si256(q4bits, m4); @@ -5744,8 +5745,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0); @@ -5778,16 +5779,16 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #elif defined __riscv_v_intrinsic uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5827,17 +5828,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri memset(sums, 0, 8*sizeof(float)); uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - uint8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + uint8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF; for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5861,11 +5862,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const 
block_q5_K * GGML_RESTRICT x = (const block_q5_K*)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K*)vy; const int nb = n / QK_K; @@ -5911,9 +5912,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -5976,8 +5977,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; #if QK_K == 256 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6065,8 +6066,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -6163,9 +6164,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri vl = 8; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; @@ -6249,11 +6250,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); @@ -6302,11 +6303,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -6328,9 +6329,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT 
q8 = y[i].qs; const uint8x8_t qhbits = vld1_u8(qh); @@ -6387,8 +6388,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6433,8 +6434,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6490,9 +6491,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); @@ -6560,10 +6561,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 32; ++l) { a[l+ 0] = q4[l] & 0xF; a[l+32] = q4[l] >> 4; @@ -6574,7 +6575,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri } const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; + const int8_t * GGML_RESTRICT sc = x[i].scales; for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; @@ -6591,11 +6592,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #if QK_K == 256 -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * GGML_RESTRICT x = (const block_q6_K *)vx; + const block_q8_K * GGML_RESTRICT y = (const block_q8_K *)vy; const int nb = n / QK_K; @@ -6618,11 +6619,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); @@ -6750,9 +6751,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const 
uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6830,9 +6831,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6942,11 +6943,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; size_t vl; @@ -7030,11 +7031,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; @@ -7067,11 +7068,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -7094,11 +7095,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; int32_t isum = 0; @@ -7157,9 +7158,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ 
-7214,9 +7215,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ -7281,11 +7282,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; int32_t isum = 0; @@ -7350,11 +7351,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 16; ++l) { a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; diff --git a/ggml-quants.h b/ggml-quants.h index 70c12c27465e8..2706e36ada7d3 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_ // Quantization -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); - -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); - -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); - -void 
quantize_row_q2_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k); +void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k); +void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k); +void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k); +void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k); +void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k); + +void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k); +void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k); +void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k); +void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k); +void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k); +void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k); + +void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k); + +void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k); // Dequantization -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); -//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); - -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict 
y, int k); -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); +void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k); +//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k); + +void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k); // Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); + +void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q6_K_q8_K(int n, float * 
__restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
diff --git a/ggml.c b/ggml.cpp
similarity index 98%
rename from ggml.c
rename to ggml.cpp
index f92292b39c635..f1a0e5358859a 100644
--- a/ggml.c
+++ b/ggml.cpp
@@ -38,6 +38,14 @@
 #pragma warning(disable: 4996)
 #endif
+// initializers for static data called in the ggml_init function
+static size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {};
+static char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {};
+
+void type_traits_init();
+void GGUF_TYPE_SIZE_init();
+void GGUF_TYPE_NAME_init();
+
 #if defined(_WIN32)
 #include
@@ -86,7 +94,9 @@ static int sched_yield (void) {
 }
 #else
 #include
-#include
+//#include
+#include
+using namespace std;
 typedef void * thread_ret_t;
@@ -96,6 +106,8 @@ typedef void * thread_ret_t;
 #endif
+#include
+
 #ifdef GGML_USE_CPU_HBM
 #include
 #endif
@@ -409,37 +421,39 @@ int64_t ggml_cycles_per_ms(void) {
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
+static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y);
+static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y);
-static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_I8] = {
+
+static ggml_type_traits_t type_traits[GGML_TYPE_COUNT];
+void type_traits_init(){
+    type_traits[GGML_TYPE_I8] = {
         .type_name = "i8",
         .blck_size = 1,
         .type_size = sizeof(int8_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_I16] = {
+    };
+    type_traits[GGML_TYPE_I16] = {
         .type_name = "i16",
         .blck_size = 1,
         .type_size = sizeof(int16_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_I32] = {
+    };
+    type_traits[GGML_TYPE_I32] = {
         .type_name = "i32",
         .blck_size = 1,
         .type_size = sizeof(int32_t),
         .is_quantized = false,
-    },
-    [GGML_TYPE_F32] = {
+    };
+    type_traits[GGML_TYPE_F32] = {
         .type_name = "f32",
         .blck_size = 1,
         .type_size = sizeof(float),
         .is_quantized = false,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
-    },
-    [GGML_TYPE_F16] = {
+    };
+    type_traits[GGML_TYPE_F16] = {
         .type_name = "f16",
         .blck_size = 1,
         .type_size = sizeof(ggml_fp16_t),
@@ -449,8 +463,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
-    },
-    [GGML_TYPE_Q4_0] = {
+    };
+    type_traits[GGML_TYPE_Q4_0] = {
         .type_name = "q4_0",
         .blck_size = QK4_0,
         .type_size = sizeof(block_q4_0),
@@ -460,8 +474,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
         .vec_dot = ggml_vec_dot_q4_0_q8_0,
         .vec_dot_type = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q4_1] = {
+    };
+    type_traits[GGML_TYPE_Q4_1] = {
         .type_name = "q4_1",
         .blck_size = QK4_1,
         .type_size = sizeof(block_q4_1),
@@ -471,8 +485,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
         .vec_dot = ggml_vec_dot_q4_1_q8_1,
         .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [4] = { // GGML_TYPE_Q4_2
+    };
+    type_traits[4] = { // GGML_TYPE_Q4_2
         .type_name = "DEPRECATED",
         .blck_size = 0,
         .type_size = 0,
@@ -482,8 +496,8 @@ static
const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, - }, - [5] = { // GGML_TYPE_Q4_3 + }; + type_traits[5] = { // GGML_TYPE_Q4_3 .type_name = "DEPRECATED", .blck_size = 0, .type_size = 0, @@ -493,8 +507,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, - }, - [GGML_TYPE_Q5_0] = { + }; + type_traits[GGML_TYPE_Q5_0] = { .type_name = "q5_0", .blck_size = QK5_0, .type_size = sizeof(block_q5_0), @@ -504,8 +518,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, .vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, - }, - [GGML_TYPE_Q5_1] = { + }; + type_traits[GGML_TYPE_Q5_1] = { .type_name = "q5_1", .blck_size = QK5_1, .type_size = sizeof(block_q5_1), @@ -515,8 +529,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, .vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, - }, - [GGML_TYPE_Q8_0] = { + }; + type_traits[GGML_TYPE_Q8_0] = { .type_name = "q8_0", .blck_size = QK8_0, .type_size = sizeof(block_q8_0), @@ -526,8 +540,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, - }, - [GGML_TYPE_Q8_1] = { + }; + type_traits[GGML_TYPE_Q8_1] = { .type_name = "q8_1", .blck_size = QK8_1, .type_size = sizeof(block_q8_1), @@ -535,8 +549,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_1, .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, - }, - [GGML_TYPE_Q2_K] = { + }; + type_traits[GGML_TYPE_Q2_K] = { .type_name = "q2_K", .blck_size = QK_K, .type_size = sizeof(block_q2_K), @@ -546,8 +560,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, .vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q3_K] = { + }; + type_traits[GGML_TYPE_Q3_K] = { .type_name = "q3_K", .blck_size = QK_K, .type_size = sizeof(block_q3_K), @@ -557,8 +571,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, .vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q4_K] = { + }; + type_traits[GGML_TYPE_Q4_K] = { .type_name = "q4_K", .blck_size = QK_K, .type_size = sizeof(block_q4_K), @@ -568,8 +582,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q5_K] = { + }; + type_traits[GGML_TYPE_Q5_K] = { .type_name = "q5_K", .blck_size = QK_K, .type_size = sizeof(block_q5_K), @@ -579,8 +593,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, .vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q6_K] = { + }; + type_traits[GGML_TYPE_Q6_K] = { .type_name = "q6_K", .blck_size = QK_K, .type_size = sizeof(block_q6_K), @@ -590,15 +604,15 @@ static const ggml_type_traits_t 
type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, .vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q8_K] = { + }; + type_traits[GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, .type_size = sizeof(block_q8_K), .is_quantized = true, .from_float = quantize_row_q8_K, - } -}; + }; +} // For internal test use ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { @@ -1160,7 +1174,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) { #ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1197,7 +1211,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest *s = sumf; } -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { +static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y) { ggml_float sumf = 0.0; #if defined(GGML_SIMD) @@ -1235,10 +1249,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes -inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { +inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) { ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL]; for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); @@ -1288,7 +1302,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re } } -inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1320,10 +1334,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float } // xs and vs are byte strides of x and v -inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) { - const float * restrict x[GGML_VEC_MAD_UNROLL]; - const float * restrict v[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL]; for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { x[i] = (const float *) ((const char *) xv + i*xs); @@ -2175,18 +2189,26 
@@ static inline int ggml_up(int n, int m) {
 ////////////////////////////////////////////////////////////////////////////////
-struct ggml_context * ggml_init(struct ggml_init_params params) {
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    static bool is_first_call = true;
-    if (is_first_call) {
-        // initialize time system (required on Windows)
-        ggml_time_init();
+struct ggml_context * ggml_init(struct ggml_init_params params) {
-        // initialize GELU, Quick GELU, SILU and EXP F32 tables
-        {
+    // initialize the data in the arrays
+    type_traits_init();
+    GGUF_TYPE_SIZE_init();
+    GGUF_TYPE_NAME_init();
+
+    struct ggml_context * ctx = NULL;
+    static bool is_first_call = true;
+    // make this function thread safe
+    ggml_critical_section_start();
+
+
+    if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
+        // initialize GELU, Quick GELU, SILU and EXP F32 tables
+        {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
             ggml_fp16_t ii;
@@ -2238,7 +2260,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     }
     // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
+
     for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
         if (!g_state.contexts[i].used) {
@@ -2402,7 +2424,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
     // align to GGML_MEM_ALIGN
     size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
@@ -2475,7 +2497,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
             return NULL;
         }
-        data = (char * const) ctx->scratch.data + ctx->scratch.offs;
+        data = (void*)(((char *)ctx->scratch.data) + ctx->scratch.offs);
         ctx->scratch.offs += data_size;
     } else {
@@ -2630,7 +2652,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
     const int nc = tensor->ne[0];
     const size_t n1 = tensor->nb[1];
-    char * const data = tensor->data;
+    char * const data = (char*)tensor->data;
     switch (tensor->type) {
         case GGML_TYPE_I8:
@@ -2682,7 +2704,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
     const int nc = tensor->ne[0];
     const size_t n1 = tensor->nb[1];
-    char * const data = tensor->data;
+    char * const data = (char*)tensor->data;
     switch (tensor->type) {
         case GGML_TYPE_I8:
@@ -3063,7 +3085,7 @@ struct ggml_tensor * ggml_view_tensor(
 struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
     struct ggml_object * obj = ctx->objects_begin;
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     while (obj != NULL) {
         if (obj->type == GGML_OBJECT_TENSOR) {
@@ -3080,7 +3102,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml
     struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
     obj = obj->next;
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     while (obj != NULL) {
         if (obj->type == GGML_OBJECT_TENSOR) {
@@ -3096,7 +3118,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;
-    char * const mem_buffer = ctx->mem_buffer;
+    char * const mem_buffer = (char*)ctx->mem_buffer;
     while
(obj != NULL) { if (obj->type == GGML_OBJECT_TENSOR) { @@ -3292,7 +3314,7 @@ static struct ggml_tensor * ggml_acc_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ACC; @@ -4145,7 +4167,7 @@ static struct ggml_tensor * ggml_set_impl( // make a view of the destination struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + int32_t params[] = { (int32_t)nb1,(int32_t) nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_SET; @@ -5402,7 +5424,7 @@ struct ggml_tensor * ggml_pool_2d( }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; + int32_t params[] = { op, k0, k1, s0, s1, (int32_t)p0, (int32_t)p1 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_2D; @@ -8262,7 +8284,7 @@ static void ggml_compute_forward_repeat_back_f32( GGML_ASSERT(nb00 == sizeof(float)); if (ggml_is_contiguous(dst)) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); } else { for (int k3 = 0; k3 < ne3; k3++) { for (int k2 = 0; k2 < ne2; k2++) { @@ -9390,6 +9412,7 @@ static void ggml_compute_forward_mul_mat( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -9492,7 +9515,7 @@ static void ggml_compute_forward_mul_mat( if (params->type == GGML_TASK_INIT) { if (src1->type != vec_dot_type) { - char * wdata = params->wdata; + char * wdata = (char*)params->wdata; const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); for (int64_t i13 = 0; i13 < ne13; ++i13) { @@ -9646,7 +9669,7 @@ static void ggml_compute_forward_out_prod_f32( return; } #endif - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); return; } @@ -9829,7 +9852,7 @@ static void ggml_compute_forward_out_prod_q_f32( // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (params->type == GGML_TASK_INIT) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); return; } @@ -11843,7 +11866,7 @@ static void ggml_compute_forward_pool_1d( struct ggml_tensor * dst) { const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; + enum ggml_op_pool op = (ggml_op_pool)opts[0]; const int k0 = opts[1]; const int s0 = opts[2]; const int p0 = opts[3]; @@ -11867,7 +11890,7 @@ static void ggml_compute_forward_pool_2d( } const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; + enum ggml_op_pool op = (ggml_op_pool)opts[0]; const int k0 = opts[1]; const int k1 = opts[2]; const int s0 = opts[3]; @@ -14098,7 +14121,7 @@ static struct ggml_hash_set ggml_hash_set_new(size_t size) { size = ggml_hash_size(size); struct ggml_hash_set result; result.size = size; - result.keys = malloc(sizeof(struct ggml_tensor *) * size); + result.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * size); memset(result.keys, 0, 
sizeof(struct ggml_tensor *) * size); return result; } @@ -14113,9 +14136,9 @@ struct hash_map { }; static struct hash_map * ggml_new_hash_map(size_t size) { - struct hash_map * result = malloc(sizeof(struct hash_map)); + struct hash_map * result = (hash_map*)malloc(sizeof(struct hash_map)); result->set = ggml_hash_set_new(size); - result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size); + result->vals = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * result->set.size); memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size); return result; } @@ -16034,7 +16057,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + struct ggml_compute_state * workers = (ggml_compute_state*)alloca(sizeof(struct ggml_compute_state)*n_threads); // create thread pool if (n_threads > 1) { @@ -16631,7 +16654,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { continue; } - GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0); + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name((ggml_op)i), (double) perf_total_per_op_us[i] / 1000.0); } GGML_PRINT("========================================\n"); @@ -16903,11 +16926,11 @@ static enum ggml_opt_result ggml_opt_adam( const int n_accum = MAX(1, params.n_gradient_accumulation); const float accum_norm = 1.0f / (float) n_accum; - float * g = opt->adam.g->data; // gradients - float * m = opt->adam.m->data; // first moment - float * v = opt->adam.v->data; // second moment + float * g = (float*)opt->adam.g->data; // gradients + float * m = (float*)opt->adam.m->data; // first moment + float * v = (float*)opt->adam.v->data; // second moment - float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values + float * pf = params.past > 0 ? 
(float *)opt->adam.pf->data : NULL; // past function values struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); @@ -17175,7 +17198,7 @@ static enum ggml_opt_result linesearch_backtracking( } else { // Armijo condition is satisfied if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { - return count; + return (ggml_opt_result)count; } ggml_vec_dot_f32(nx, &dg, g, d); @@ -17186,14 +17209,14 @@ static enum ggml_opt_result linesearch_backtracking( } else { if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { // regular Wolfe conditions - return count; + return (ggml_opt_result)count; } if(dg > -params->lbfgs.wolfe*dginit) { width = dec; } else { // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) - return count; + return (ggml_opt_result)count; } } } @@ -17258,13 +17281,13 @@ static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - float * x = opt->lbfgs.x->data; // current parameters - float * xp = opt->lbfgs.xp->data; // previous parameters - float * g = opt->lbfgs.g->data; // current gradient - float * gp = opt->lbfgs.gp->data; // previous gradient - float * d = opt->lbfgs.d->data; // search direction + float * x = (float*)opt->lbfgs.x->data; // current parameters + float * xp = (float*)opt->lbfgs.xp->data; // previous parameters + float * g = (float*)opt->lbfgs.g->data; // current gradient + float * gp = (float*)opt->lbfgs.gp->data; // previous gradient + float * d = (float*)opt->lbfgs.d->data; // search direction - float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values + float * pf = params.past > 0 ? 
(float*)opt->lbfgs.pf->data : NULL; // past function values const int n_accum = MAX(1, params.n_gradient_accumulation); const float accum_norm = 1.0f / (float) n_accum; @@ -17277,10 +17300,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_opt_get_params(np, ps, x); // the L-BFGS memory - float * lm_alpha = opt->lbfgs.lmal->data; - float * lm_ys = opt->lbfgs.lmys->data; - float * lm_s = opt->lbfgs.lms->data; - float * lm_y = opt->lbfgs.lmy->data; + float * lm_alpha = (float*)opt->lbfgs.lmal->data; + float * lm_ys = (float*)opt->lbfgs.lmys->data; + float * lm_s = (float*)opt->lbfgs.lms->data; + float * lm_y = (float*)opt->lbfgs.lmy->data; bool cancel = false; @@ -17377,7 +17400,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, x, xp); ggml_vec_cpy_f32(nx, g, gp); - return ls; + return (ggml_opt_result)ls; } opt->loss_after = fx; @@ -17564,7 +17587,7 @@ GGML_API void ggml_opt_init( opt->nx = nx; opt->just_initialized = true; if (opt->ctx == NULL) { - struct ggml_init_params ctx_opt_params; + struct ggml_init_params ctx_opt_params; if (opt->params.type == GGML_OPT_ADAM) { ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; if (opt->params.past > 0) { @@ -17718,7 +17741,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_0; for (int b = 0; b < n; b += k) { - block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; + block_q4_0 * GGML_RESTRICT y = (block_q4_0 *) dst + b/QK4_0; quantize_row_q4_0_reference(src + b, y, k); @@ -17741,7 +17764,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_1; for (int b = 0; b < n; b += k) { - block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; + block_q4_1 * GGML_RESTRICT y = (block_q4_1 *) dst + b/QK4_1; quantize_row_q4_1_reference(src + b, y, k); @@ -17764,7 +17787,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_0; for (int b = 0; b < n; b += k) { - block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; + block_q5_0 * GGML_RESTRICT y = (block_q5_0 *)dst + b/QK5_0; quantize_row_q5_0_reference(src + b, y, k); @@ -17794,7 +17817,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_1; for (int b = 0; b < n; b += k) { - block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; + block_q5_1 * GGML_RESTRICT y = (block_q5_1 *)dst + b/QK5_1; quantize_row_q5_1_reference(src + b, y, k); @@ -17824,7 +17847,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK8_0; for (int b = 0; b < n; b += k) { - block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; + block_q8_0 * GGML_RESTRICT y = (block_q8_0 *)dst + b/QK8_0; quantize_row_q8_0_reference(src + b, y, k); @@ -17928,37 +17951,39 @@ struct gguf_str { char * data; }; -static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = sizeof(uint8_t), - [GGUF_TYPE_INT8] = sizeof(int8_t), - [GGUF_TYPE_UINT16] = sizeof(uint16_t), - [GGUF_TYPE_INT16] = sizeof(int16_t), - [GGUF_TYPE_UINT32] = sizeof(uint32_t), - [GGUF_TYPE_INT32] = sizeof(int32_t), - [GGUF_TYPE_FLOAT32] = sizeof(float), - [GGUF_TYPE_BOOL] = sizeof(bool), - [GGUF_TYPE_STRING] = sizeof(struct gguf_str), - [GGUF_TYPE_UINT64] = sizeof(uint64_t), - [GGUF_TYPE_INT64] = sizeof(int64_t), - [GGUF_TYPE_FLOAT64] = sizeof(double), - [GGUF_TYPE_ARRAY] = 0, // undefined + +void GGUF_TYPE_SIZE_init() { + 
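    // Background: the original C code filled GGUF_TYPE_SIZE (and GGUF_TYPE_NAME below)
    // with C99 array designated initializers such as [GGUF_TYPE_UINT8] = sizeof(uint8_t).
    // C++ does not accept array designators (C++20 only allows in-order member
    // designators), which is presumably why the tables are now filled at runtime by
    // these *_init() functions, called once from the top of ggml_init() together with
    // type_traits_init().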
GGUF_TYPE_SIZE[GGUF_TYPE_UINT8] = sizeof(uint8_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT8] = sizeof(int8_t); + GGUF_TYPE_SIZE[GGUF_TYPE_UINT16] = sizeof(uint16_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT16] = sizeof(int16_t); + GGUF_TYPE_SIZE[GGUF_TYPE_UINT32] = sizeof(uint32_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT32] = sizeof(int32_t); + GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT32] = sizeof(float); + GGUF_TYPE_SIZE[GGUF_TYPE_BOOL] = sizeof(bool); + GGUF_TYPE_SIZE[GGUF_TYPE_STRING] = sizeof(struct gguf_str); + GGUF_TYPE_SIZE[GGUF_TYPE_UINT64] = sizeof(uint64_t); + GGUF_TYPE_SIZE[GGUF_TYPE_INT64] = sizeof(int64_t); + GGUF_TYPE_SIZE[GGUF_TYPE_FLOAT64] = sizeof(double); + GGUF_TYPE_SIZE[GGUF_TYPE_ARRAY] = 0; // undefined }; static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); -static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = "u8", - [GGUF_TYPE_INT8] = "i8", - [GGUF_TYPE_UINT16] = "u16", - [GGUF_TYPE_INT16] = "i16", - [GGUF_TYPE_UINT32] = "u32", - [GGUF_TYPE_INT32] = "i32", - [GGUF_TYPE_FLOAT32] = "f32", - [GGUF_TYPE_BOOL] = "bool", - [GGUF_TYPE_STRING] = "str", - [GGUF_TYPE_ARRAY] = "arr", - [GGUF_TYPE_UINT64] = "u64", - [GGUF_TYPE_INT64] = "i64", - [GGUF_TYPE_FLOAT64] = "f64", + +void GGUF_TYPE_NAME_init(){ + GGUF_TYPE_NAME[GGUF_TYPE_UINT8] = "u8"; + GGUF_TYPE_NAME[GGUF_TYPE_INT8] = "i8"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT16] = "u16"; + GGUF_TYPE_NAME[GGUF_TYPE_INT16] = "i16"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT32] = "u32"; + GGUF_TYPE_NAME[GGUF_TYPE_INT32] = "i32"; + GGUF_TYPE_NAME[GGUF_TYPE_FLOAT32] = "f32"; + GGUF_TYPE_NAME[GGUF_TYPE_BOOL] = "bool"; + GGUF_TYPE_NAME[GGUF_TYPE_STRING] = "str"; + GGUF_TYPE_NAME[GGUF_TYPE_ARRAY] = "arr"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT64] = "u64"; + GGUF_TYPE_NAME[GGUF_TYPE_INT64] = "i64"; + GGUF_TYPE_NAME[GGUF_TYPE_FLOAT64] = "f64"; }; static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); @@ -18040,14 +18065,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { bool ok = true; - ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1); ok = ok && gguf_fread_el(file, p->data, p->n, offset); return ok; } struct gguf_context * gguf_init_empty(void) { - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; @@ -18092,7 +18117,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p bool ok = true; - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); // read the header { @@ -18124,7 +18149,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the kv pairs { - ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); + ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; @@ -18199,7 +18224,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the tensor infos { - ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); for 
(uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -18319,10 +18344,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // create the tensors for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { const int64_t ne[GGML_MAX_DIMS] = { - ctx->infos[i].ne[0], - ctx->infos[i].ne[1], - ctx->infos[i].ne[2], - ctx->infos[i].ne[3], + (int64_t)ctx->infos[i].ne[0], + (int64_t)ctx->infos[i].ne[1], + (int64_t)ctx->infos[i].ne[2], + (int64_t)ctx->infos[i].ne[3], }; struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); @@ -18603,7 +18628,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int n_kv = gguf_get_n_kv(ctx); - ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); ctx->kv[n_kv].key.n = strlen(key); ctx->kv[n_kv].key.data = strdup(key); ctx->header.n_kv++; @@ -18739,7 +18764,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { case GGUF_TYPE_ARRAY: { if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { - const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *)); for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; } @@ -18760,7 +18785,7 @@ void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { const int idx = ctx->header.n_tensors; - ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); ctx->infos[idx].name.n = strlen(tensor->name); ctx->infos[idx].name.data = strdup(tensor->name); diff --git a/ggml.h b/ggml.h index f2fce0f22d357..1d69be2b00347 100644 --- a/ggml.h +++ b/ggml.h @@ -285,8 +285,10 @@ GGML_UNUSED(prefix##3); #ifdef __cplusplus +#ifndef CPP_ONLY extern "C" { #endif +#endif #if defined(__ARM_NEON) && defined(__CUDACC__) typedef half ggml_fp16_t; @@ -1859,7 +1861,7 @@ extern "C" { int n_gradient_accumulation; // ADAM parameters - struct { + struct ggml_adam{ int n_iter; float sched; // schedule multiplier (fixed, decay or warmup) @@ -1875,7 +1877,7 @@ extern "C" { } adam; // LBFGS parameters - struct { + struct ggml_lbfgs{ int m; // number of corrections to approximate the inv. 
Hessian int n_iter; int max_linesearch; @@ -1902,7 +1904,7 @@ extern "C" { float loss_before; float loss_after; - struct { + struct ggml_grad{ struct ggml_tensor * g; // current gradient struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment @@ -1912,7 +1914,7 @@ extern "C" { int n_no_improvement; } adam; - struct { + struct ggml_params{ struct ggml_tensor * x; // current parameters struct ggml_tensor * xp; // previous parameters struct ggml_tensor * g; // current gradient @@ -2134,15 +2136,15 @@ extern "C" { #ifdef __cplusplus // restrict not standard in C++ -#define GGML_RESTRICT +#define GGML_RESTRICT #else -#define GGML_RESTRICT restrict +#define GGML_RESTRICT __restrict__ #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); - typedef struct { + typedef struct ggml_something{ const char * type_name; int blck_size; size_t type_size; @@ -2157,5 +2159,7 @@ extern "C" { GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); #ifdef __cplusplus +#ifndef CPP_ONLY } #endif +#endif diff --git a/llama-internal.hpp b/llama-internal.hpp new file mode 100644 index 0000000000000..33cf39e5d4f58 --- /dev/null +++ b/llama-internal.hpp @@ -0,0 +1,896 @@ +#include +#include +enum llm_arch { + LLM_ARCH_LLAMA, + LLM_ARCH_FALCON, + LLM_ARCH_BAICHUAN, + LLM_ARCH_GPT2, + LLM_ARCH_GPTJ, + LLM_ARCH_GPTNEOX, + LLM_ARCH_MPT, + LLM_ARCH_STARCODER, + LLM_ARCH_PERSIMMON, + LLM_ARCH_REFACT, + LLM_ARCH_BLOOM, + LLM_ARCH_STABLELM, + LLM_ARCH_UNKNOWN, +}; + +enum llm_kv { + LLM_KV_GENERAL_ARCHITECTURE, + LLM_KV_GENERAL_QUANTIZATION_VERSION, + LLM_KV_GENERAL_ALIGNMENT, + LLM_KV_GENERAL_NAME, + LLM_KV_GENERAL_AUTHOR, + LLM_KV_GENERAL_URL, + LLM_KV_GENERAL_DESCRIPTION, + LLM_KV_GENERAL_LICENSE, + LLM_KV_GENERAL_SOURCE_URL, + LLM_KV_GENERAL_SOURCE_HF_REPO, + + LLM_KV_CONTEXT_LENGTH, + LLM_KV_EMBEDDING_LENGTH, + LLM_KV_BLOCK_COUNT, + LLM_KV_FEED_FORWARD_LENGTH, + LLM_KV_USE_PARALLEL_RESIDUAL, + LLM_KV_TENSOR_DATA_LAYOUT, + + LLM_KV_ATTENTION_HEAD_COUNT, + LLM_KV_ATTENTION_HEAD_COUNT_KV, + LLM_KV_ATTENTION_MAX_ALIBI_BIAS, + LLM_KV_ATTENTION_CLAMP_KQV, + LLM_KV_ATTENTION_LAYERNORM_EPS, + LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + + LLM_KV_ROPE_DIMENSION_COUNT, + LLM_KV_ROPE_FREQ_BASE, + LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, + + LLM_KV_TOKENIZER_MODEL, + LLM_KV_TOKENIZER_LIST, + LLM_KV_TOKENIZER_TOKEN_TYPE, + LLM_KV_TOKENIZER_SCORES, + LLM_KV_TOKENIZER_MERGES, + LLM_KV_TOKENIZER_BOS_ID, + LLM_KV_TOKENIZER_EOS_ID, + LLM_KV_TOKENIZER_UNK_ID, + LLM_KV_TOKENIZER_SEP_ID, + LLM_KV_TOKENIZER_PAD_ID, + LLM_KV_TOKENIZER_ADD_BOS, + LLM_KV_TOKENIZER_ADD_EOS, + LLM_KV_TOKENIZER_HF_JSON, + LLM_KV_TOKENIZER_RWKV, +}; + +// available llama models +enum e_model { + MODEL_UNKNOWN, + MODEL_1B, + MODEL_3B, + MODEL_7B, + MODEL_8B, + MODEL_13B, + MODEL_15B, + MODEL_30B, + MODEL_34B, + MODEL_40B, + MODEL_65B, + MODEL_70B, +}; + +enum llama_fver { + GGUF_FILE_VERSION_V1 = 1, + GGUF_FILE_VERSION_V2 = 2, + GGUF_FILE_VERSION_V3 = 3, +}; + +struct LLM_KV { + LLM_KV(llm_arch arch) : arch(arch) {} + + llm_arch arch; + + std::string operator()(llm_kv kv) const; // moved to llama.cpp file + +}; + +enum llm_tensor { + 
LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_POS_EMBD, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, +}; + + +struct llama_cparams { + uint32_t n_ctx; // context size used during inference + uint32_t n_batch; + uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads_batch; // number of threads to use for batch processing + + float rope_freq_base; + float rope_freq_scale; + + uint32_t n_yarn_orig_ctx; + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. + float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; + + bool mul_mat_q; +}; + +struct llama_layer { + // normalization + struct ggml_tensor * attn_norm; + struct ggml_tensor * attn_norm_b; + struct ggml_tensor * attn_norm_2; + struct ggml_tensor * attn_norm_2_b; + struct ggml_tensor * attn_q_norm; + struct ggml_tensor * attn_q_norm_b; + struct ggml_tensor * attn_k_norm; + struct ggml_tensor * attn_k_norm_b; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + struct ggml_tensor * wqkv; + + // attention bias + struct ggml_tensor * bo; + struct ggml_tensor * bqkv; + + // normalization + struct ggml_tensor * ffn_norm; + struct ggml_tensor * ffn_norm_b; + + // ff + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 + + // ff bias + struct ggml_tensor * ffn_down_b; // b2 + struct ggml_tensor * ffn_up_b; // b3 +}; + +struct llama_kv_cell { + llama_pos pos = -1; + llama_pos delta = 0; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } +}; + +struct llama_buffer { + void * data = NULL; + size_t size = 0; + + // fallback to malloc / free + // useful in cases where CUDA can try to allocate PINNED memory + bool fallback = false; + + void resize(size_t n) ; + + + ~llama_buffer(); + +}; + +// ring-buffer of cached KV data +struct llama_kv_cache { + bool has_shift = false; + + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. llama_decode_internal also uses it, so it + // cannot be freely changed after a slot has been allocated. 
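    // The fields below, as used elsewhere in llama.cpp: head appears to be the index
    // at which K/V data for newly decoded tokens is written, size the total number of
    // cells in the ring buffer, and n (recomputed before each graph build) the number
    // of cells the attention graph actually considers.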
+ uint32_t head = 0; + uint32_t size = 0; + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + + struct ggml_tensor * k = NULL; + struct ggml_tensor * v = NULL; + + struct ggml_context * ctx = NULL; + + llama_buffer buf; + + ~llama_kv_cache() { + if (ctx) { + ggml_free(ctx); + } + +#ifdef GGML_USE_CUBLAS + if (ggml_cublas_loaded()) { + ggml_cuda_free_data(k); + ggml_cuda_free_data(v); + } +#endif + } +}; + +struct llama_vocab { + using id = int32_t; + using token = std::string; + using ttype = llama_token_type; + + struct token_data { + token text; + float score; + ttype type; + }; + + enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; + + std::unordered_map token_to_id; + std::vector id_to_token; + + std::unordered_map special_tokens_cache; + + std::map, int> bpe_ranks; + + // default LLaMA special tokens + id special_bos_id = 1; + id special_eos_id = 2; + id special_unk_id = 0; + id special_sep_id = -1; + id special_pad_id = -1; + + int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. + int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. + + id linefeed_id = 13; + id special_prefix_id = 32007; + id special_middle_id = 32009; + id special_suffix_id = 32008; + id special_eot_id = 32010; + + int find_bpe_rank(std::string token_left, std::string token_right) const { + GGML_ASSERT(token_left.find(" ") == std::string::npos); + GGML_ASSERT(token_left.find("\n") == std::string::npos); + GGML_ASSERT(token_right.find(" ") == std::string::npos); + GGML_ASSERT(token_right.find("\n") == std::string::npos); + + auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); + if (it == bpe_ranks.end()) { + return -1; + } + + return it->second; + } +}; + +struct llama_mmap { + void * addr; + size_t size; + + llama_mmap(const llama_mmap &) = delete; + + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false); + ~llama_mmap(); + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; +#else + static constexpr bool SUPPORTED = false; +#endif +}; + + +struct llama_hparams { + bool vocab_only; + uint32_t n_vocab; + uint32_t n_ctx_train; // context size the model was trained on + uint32_t n_embd; + uint32_t n_head; + uint32_t n_head_kv; + uint32_t n_layer; + uint32_t n_rot; + uint32_t n_ff; + + float f_norm_eps; + float f_norm_rms_eps; + + float rope_freq_base_train; + float rope_freq_scale_train; + uint32_t n_yarn_orig_ctx; + int8_t rope_scaling_type_train : 3; + bool rope_finetuned : 1; + + float f_clamp_kqv; + float f_max_alibi_bias; + + bool operator!=(const llama_hparams & other) const; + uint32_t n_gqa() const { + return n_head/n_head_kv; + } + + uint32_t n_embd_head() const { + return n_embd/n_head; + } + + uint32_t n_embd_gqa() const { + return n_embd/n_gqa(); + } +}; + +struct llama_mlock { + void * addr = NULL; + size_t size = 0; + bool failed_already = false; + llama_mlock() ; + + llama_mlock(const llama_mlock &) = delete; + ~llama_mlock(); + void init(void * ptr); + void grow_to(size_t target_size); +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + static size_t lock_granularity(); +#ifdef __APPLE__ +#define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. 
Also try increasing RLIMIT_MLOCK (ulimit -l).\n" +#else +#define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" +#endif + bool raw_lock(const void * addr, size_t size) const ; +#undef MLOCK_SUGGESTION + static void raw_unlock(void * addr, size_t size); +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + static size_t lock_granularity(); + bool raw_lock(void * ptr, size_t len) const ; + static void raw_unlock(void * ptr, size_t len); +#else + static constexpr bool SUPPORTED = false; + static size_t lock_granularity(); + bool raw_lock(const void * addr, size_t len) const; + static void raw_unlock(const void * addr, size_t len); +#endif +}; + + +struct llama_model { + e_model type = MODEL_UNKNOWN; + llm_arch arch = LLM_ARCH_UNKNOWN; + llama_ftype ftype = LLAMA_FTYPE_ALL_F32; + + std::string name = "n/a"; + + llama_hparams hparams = {}; + llama_vocab vocab; + + struct ggml_tensor * tok_embd; + struct ggml_tensor * pos_embd; + struct ggml_tensor * tok_norm; + struct ggml_tensor * tok_norm_b; + + struct ggml_tensor * output_norm; + struct ggml_tensor * output_norm_b; + struct ggml_tensor * output; + + std::vector layers; + + int n_gpu_layers; + + // gguf metadata + std::unordered_map gguf_kv; + + // context + struct ggml_context * ctx = NULL; + + // the model memory buffer + llama_buffer buf; + + // model memory mapped file + std::unique_ptr mapping; + + // objects representing data potentially being locked in memory + llama_mlock mlock_buf; + llama_mlock mlock_mmap; + + // for quantize-stats only + std::vector> tensors_by_name; + + int64_t t_load_us = 0; + int64_t t_start_us = 0; + + ~llama_model() { + if (ctx) { + ggml_free(ctx); + } + +#ifdef GGML_USE_CUBLAS + if (ggml_cublas_loaded()) { + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + ggml_cuda_free_data(tensors_by_name[i].second); + } + ggml_cuda_free_scratch(); + } +#endif + +#if defined(GGML_USE_CLBLAST) + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + ggml_cl_free_data(tensors_by_name[i].second); + } +#endif + } +}; + +struct llama_context { + llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} + ~llama_context(); + + llama_cparams cparams; + + const llama_model & model; + + // key + value cache for the self attention + struct llama_kv_cache kv_self; + + std::mt19937 rng; + + bool has_evaluated_once = false; + + int64_t t_start_us; + int64_t t_load_us; + int64_t t_sample_us = 0; + int64_t t_p_eval_us = 0; + int64_t t_eval_us = 0; + + int32_t n_sample = 0; // number of tokens sampled + int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) + int32_t n_eval = 0; // number of eval calls + + // decode output (2-dimensional array: [n_tokens][n_vocab]) + std::vector logits; + bool logits_all = false; + + // input embedding (1-dimensional array: [n_embd]) + std::vector embedding; + + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; + + // memory buffers used to evaluate the model + llama_buffer buf_compute; + + llama_buffer buf_alloc; + ggml_allocr * alloc = NULL; + +#ifdef GGML_USE_METAL + ggml_metal_context * ctx_metal = NULL; +#endif + +#ifdef GGML_USE_MPI + ggml_mpi_context * ctx_mpi = NULL; +#endif +}; + + +struct LLM_TN { + LLM_TN(llm_arch arch) ; + + llm_arch arch; + + std::string operator()(llm_tensor tensor) const; + + std::string operator()(llm_tensor tensor, const std::string & suffix) const ; + + std::string operator()(llm_tensor 
tensor, int bid) const ; + + std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ; + +}; + + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) ; + size_t tell() const; + void seek(size_t offset, int whence) const; + void read_raw(void * ptr, size_t len) const; + uint32_t read_u32() const; + void write_raw(const void * ptr, size_t len) const ; + void write_u32(std::uint32_t val) const; + ~llama_file(); + +}; + + +struct llama_state { + llama_state(); + // We save the log callback globally + ggml_log_callback log_callback; + void * log_callback_user_data = nullptr; +}; + + + +struct llama_model_loader { + int n_kv = 0; + int n_tensors = 0; + int n_created = 0; + + int64_t n_elements = 0; + size_t n_bytes = 0; + + bool use_mmap = false; + + llama_file file; + llama_ftype ftype; + llama_fver fver; + + std::unique_ptr mapping; + + struct gguf_context * ctx_gguf = NULL; + struct ggml_context * ctx_meta = NULL; + + llama_model_loader(const std::string & fname, bool use_mmap) ; + + ~llama_model_loader(); + + std::string get_arch_name() const; + + enum llm_arch get_arch() const ; + const char * get_tensor_name(int i) const; + + struct ggml_tensor * get_tensor_meta(int i) const; + + void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const; + + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) ; + + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend) ; + + void done_getting_tensors() const; + + size_t file_offset(const char * name) const; + + + void load_data_for(struct ggml_tensor * cur) const ; + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) ; +}; + +struct llama_data_context { + virtual void write(const void * src, size_t size) = 0; + virtual size_t get_size_written() = 0; + virtual ~llama_data_context() = default; +}; + +struct llama_data_buffer_context : llama_data_context { + uint8_t * ptr; + size_t size_written = 0; + llama_data_buffer_context(uint8_t * p) ; + void write(const void * src, size_t size) override ; + size_t get_size_written() override ; +}; + +struct llama_data_file_context : llama_data_context { + llama_file * file; + size_t size_written = 0; + llama_data_file_context(llama_file * f); + size_t get_size_written() override ; + void write(const void * src, size_t size); +}; + + +struct llama_beam { + std::vector tokens; + float p; // Cumulative beam probability (renormalized relative to all beams) + bool eob; // Initialize end-of-beam to false. Callback sets this to true. + // Sort beams by probability. In case of ties, prefer beams at eob. + bool operator<(const llama_beam & rhs) const ; + void shift_tokens(const size_t n) ; + llama_beam_view view() const; +}; + +// A struct for calculating logit-related info. 
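// Judging from the sum_exp functor and fields below, max_l holds the largest logit and
// normalizer is presumably 1 / sum_i exp(logit_i - max_l), so that
// probability_from_logit(l) can return normalizer * exp(l - max_l): the numerically
// stable (log-sum-exp) softmax formulation used when ranking beam candidates.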
+struct llama_logit_info { + const float * const logits; + const int n_vocab; + const float max_l; + const float normalizer; + struct sum_exp { + float max_l; + float operator()(float sum, float l) const { return sum + std::exp(l - max_l); } + }; + llama_logit_info(llama_context * ctx); + llama_token_data get_token_data(const llama_token token_id) const ; + std::vector top_k(size_t k) ; + float probability_from_logit(float logit) const ; +}; + + +struct llama_beam_search_data { + llama_context * ctx; + size_t n_beams; + int n_past; + int n_predict; + std::vector beams; + std::vector next_beams; + size_t common_prefix_length; + std::vector beam_views; + llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict); + void collapse_beams(const size_t beam_idx) ; + void fill_next_beams_by_top_probabilities(llama_beam & beam) ; + size_t find_common_prefix_length() ; + llama_beams_state get_beams_state(const bool last_call) ; + void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data); + static void renormalize_beam_probabilities(std::vector & beams) ; + size_t top_beam_index(); + void update_beams_from_beam_views(); +}; + +using llm_build_cb = std::function; + +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; + +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; + +struct llm_build_context { + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_batch & batch; + const llama_kv_cache & kv_self; + + const int64_t n_embd; + const int64_t n_layer; + const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t n_head; + const int64_t n_head_kv; + const int64_t n_embd_head; + const int64_t n_embd_gqa; + + const float freq_base; + const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; + const float norm_eps; + const float norm_rms_eps; + + const int32_t n_tokens; + const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t kv_head; // index of where we store new KV data in the cache + const int32_t n_orig_ctx; + + const bool do_rope_shift; + + const llm_build_cb & cb; + + llama_buffer & buf_compute; + + struct ggml_context * ctx0 = nullptr; + + // TODO: consider making the entire interface noexcept + llm_build_context( + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case); + + void init() ; + void free() ; + struct ggml_cgraph * build_llama() ; + struct ggml_cgraph * build_baichuan() ; + struct ggml_cgraph * build_falcon() ; + struct ggml_cgraph * build_starcoder() ; + struct ggml_cgraph * build_persimmon() ; + struct ggml_cgraph * build_refact() ; + struct ggml_cgraph * build_bloom() ; + struct ggml_cgraph * build_mpt() ; + struct ggml_cgraph * build_stablelm(); +}; + + +enum llm_offload_func_e { + OFFLOAD_FUNC_NOP, + OFFLOAD_FUNC, + OFFLOAD_FUNC_KQ, + OFFLOAD_FUNC_V, + OFFLOAD_FUNC_NR, + OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_OUT, +}; + +struct llm_offload_trie { + struct node { + ~node() ; + node * children[256] = { nullptr }; + llm_offload_func_e func = OFFLOAD_FUNC_NOP; + }; + node * root = nullptr; + llm_offload_trie(); + llm_offload_trie(const std::unordered_map & map) ; + ~llm_offload_trie(); + void 
add(const char * name, llm_offload_func_e func); + llm_offload_func_e find(const char * name) const; + +}; + +struct llm_symbol { + using index = int; + index prev; + index next; + const char * text; + size_t n; +}; + + +struct llm_bigram_spm { + struct comparator { + bool operator()(llm_bigram_spm & l, llm_bigram_spm & r); + }; + using queue_storage = std::vector; + using queue = std::priority_queue; + llm_symbol::index left; + llm_symbol::index right; + float score; + size_t size; +}; + +struct llm_tokenizer_spm { + llm_tokenizer_spm(const llama_vocab & vocab); + void tokenize(const std::string & text, std::vector & output); + + +private: + void resegment(llm_symbol & symbol, std::vector & output) ; + void try_add_bigram(int left, int right) ; + const llama_vocab & vocab; + + std::vector symbols; + llm_bigram_spm::queue work_queue; + + std::map> rev_merge; +}; + +// BPE tokenizer +// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] +// tried to simplify unicode stuff, so most likely does not work 100% correctly! + +// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused + +struct llm_bigram_bpe { + struct comparator { + bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const ; + }; + + using queue_storage = std::vector; + using queue = std::priority_queue; + llm_symbol::index left; + llm_symbol::index right; + std::string text; + int rank; + size_t size; +}; + +struct llm_tokenizer_bpe { + llm_tokenizer_bpe(const llama_vocab & vocab); + + void tokenize(const std::string & text, std::vector & output); + +private: + void add_new_bigram(int left, int right) ; + + std::vector bpe_gpt2_preprocess(const std::string & text) ; + + const llama_vocab & vocab; + + std::vector symbols; + std::vector symbols_final; + + llm_bigram_bpe::queue work_queue; +}; + +typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{ + FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, + FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT +} FRAGMENT_BUFFER_VARIANT_TYPE; + +struct fragment_buffer_variant{ + fragment_buffer_variant(llama_vocab::id _token); + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length); + const FRAGMENT_BUFFER_VARIANT_TYPE type; + const llama_vocab::id token; + const std::string _dummy; + const std::string & raw_text; + const uint64_t offset; + const uint64_t length; +}; + +struct llama_partial_utf8 { + uint32_t value; // bit value so far (unshifted) + int n_remain; // num bytes remaining; -1 indicates invalid sequence +}; + +struct llama_grammar { + const std::vector> rules; + std::vector> stacks; + + // buffer for partially generated UTF-8 sequence from accepted tokens + llama_partial_utf8 partial_utf8; +}; + +struct llama_grammar_candidate { + size_t index; + const uint32_t * code_points; + llama_partial_utf8 partial_utf8; +}; + +struct quantize_state_internal { + const llama_model & model; + const llama_model_quantize_params * params; + + int n_attention_wv = 0; + int n_feed_forward_w2 = 0; + int i_attention_wv = 0; + int i_feed_forward_w2 = 0; + + int n_k_quantized = 0; + int n_fallback = 0; + + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) + : model(model) + , params(params) + {} +}; diff --git a/llama.h b/llama.h index 1a62058d1406b..9a1e7d04e050a 100644 --- a/llama.h +++ b/llama.h @@ -50,7 +50,9 @@ #endif #ifdef __cplusplus +#ifndef CPP_ONLY extern "C" { +#endif #endif // @@ -827,8 +829,10 @@ extern "C" { LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, 
const struct llama_context * ctx); #ifdef __cplusplus +#ifndef CPP_ONLY } #endif +#endif // Internal API to be implemented by llama.cpp and used by tests/benchmarks only #ifdef LLAMA_API_INTERNAL @@ -844,4 +848,8 @@ const std::vector> & llama_internal #endif // LLAMA_API_INTERNAL + + #endif // LLAMA_H + + diff --git a/print.hpp b/print.hpp new file mode 100644 index 0000000000000..6f0dc6bf5fa6c --- /dev/null +++ b/print.hpp @@ -0,0 +1,763 @@ +#include +#include +#include "llama.h" +#include "ggml-internal.hpp" +#include "llama-internal.hpp" + +REFL_TYPE(ggml_init_params ) +REFL_END + +// we use the named data type patch +#define ggml_opt_params_names +#ifdef ggml_opt_params_names +REFL_TYPE(ggml_opt_params::ggml_adam) +REFL_END + +REFL_TYPE(ggml_opt_params::ggml_lbfgs) +REFL_END + + +REFL_TYPE(ggml_opt_context::ggml_grad ) +REFL_END +#endif +REFL_TYPE(gpt_params ) + +REFL_FIELD( seed ) +REFL_FIELD( n_threads) +REFL_FIELD( n_threads_batch) +REFL_FIELD( n_predict ) +REFL_FIELD( n_ctx ) +REFL_FIELD( n_batch) +REFL_FIELD( n_keep ) +REFL_FIELD( n_draft) +REFL_FIELD( n_chunks ) +REFL_FIELD( n_parallel) +REFL_FIELD( n_sequences) +REFL_FIELD( p_accept ) +REFL_FIELD( p_split ) +REFL_FIELD( n_gpu_layers) +REFL_FIELD( n_gpu_layers_draft) +REFL_FIELD( main_gpu ) +REFL_FIELD( tensor_split) +REFL_FIELD( n_beams ) +REFL_FIELD(rope_freq_base) +REFL_FIELD( rope_freq_scale ) +REFL_FIELD( yarn_ext_factor ) +REFL_FIELD( yarn_attn_factor ) +REFL_FIELD( yarn_beta_fast ) +REFL_FIELD( yarn_beta_slow ) +REFL_FIELD( yarn_orig_ctx) +REFL_FIELD( rope_scaling_type) +REFL_FIELD( sparams) +REFL_FIELD(model ) +REFL_FIELD(model_draft ) +REFL_FIELD(model_alias) +REFL_FIELD(prompt ) +REFL_FIELD(prompt_file ) +REFL_FIELD(path_prompt_cache ) +REFL_FIELD(input_prefix ) +REFL_FIELD(input_suffix ) +REFL_FIELD( antiprompt) +REFL_FIELD(logdir ) +REFL_FIELD( lora_adapter) +REFL_FIELD(lora_base ) +REFL_FIELD( ppl_stride ) +REFL_FIELD( ppl_output_type ) +REFL_FIELD( hellaswag ) +REFL_FIELD( hellaswag_tasks ) +REFL_FIELD( mul_mat_q ) +REFL_FIELD( memory_f16) +REFL_FIELD( random_prompt ) +REFL_FIELD( use_color ) +REFL_FIELD( interactive ) +REFL_FIELD( chatml ) +REFL_FIELD( prompt_cache_all ) +REFL_FIELD( prompt_cache_ro ) +REFL_FIELD( embedding ) +REFL_FIELD( escape ) +REFL_FIELD( interactive_first ) +REFL_FIELD( multiline_input ) +REFL_FIELD( simple_io ) +REFL_FIELD( cont_batching ) +REFL_FIELD( input_prefix_bos ) +REFL_FIELD( ignore_eos ) +REFL_FIELD( instruct ) +REFL_FIELD( logits_all ) +REFL_FIELD( use_mmap) +REFL_FIELD( use_mlock ) +REFL_FIELD( numa ) +REFL_FIELD( verbose_prompt ) +REFL_FIELD( infill ) +REFL_FIELD(mmproj ) +REFL_FIELD( image) + +REFL_END + +REFL_TYPE(llama_sampling_params) +REFL_END + +REFL_TYPE(llm_arch) +REFL_END + +REFL_TYPE(llama_sampling_context ) +REFL_FIELD( params) +REFL_FIELD( mirostat_mu) +REFL_FIELD( grammar) +REFL_FIELD( parsed_grammar) +//REFL_FIELD( prev) // TODO fixme has null data +//REFL_FIELD( cur) +REFL_END + +REFL_TYPE(llama_token_data ) +REFL_END + + +REFL_TYPE(llama_token_data_array ) +REFL_END + +REFL_TYPE(llama_batch ) +REFL_END + + +REFL_TYPE(ggml_object) + REFL_FIELD(offs) +REFL_END + +REFL_TYPE(ggml_tensor) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(ggml_cplan) + REFL_FIELD(work_size) +REFL_END + +REFL_TYPE(ggml_hash_set) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(ggml_cgraph) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(ggml_scratch) + REFL_FIELD(offs) +REFL_END + +REFL_TYPE(ggml_compute_params) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(ggml_opt_params) + REFL_FIELD(type) +REFL_END + 
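// REFL_TYPE / REFL_FIELD / REFL_END are refl-cpp registration macros: each block
// declares a type and the subset of members that the runtime2::debug() and
// print_fields() helpers defined at the end of this file may walk; a block with no
// REFL_FIELD entries still makes the type reflectable, just with nothing to print.
// This also appears to be what the "named data type patch" comment above refers to:
// refl-cpp cannot name an anonymous type, hence the newly named nested structs in
// ggml.h (ggml_adam, ggml_lbfgs, ggml_grad, ggml_params, ggml_something).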
+REFL_TYPE(ggml_opt_context) + REFL_FIELD(ctx) +REFL_END + +REFL_TYPE(gguf_init_params) +REFL_END + +REFL_TYPE(ggml_something) + REFL_FIELD(type_name) +REFL_END + +REFL_TYPE(ggml_context) + REFL_FIELD(mem_size) +REFL_FIELD(mem_buffer) +REFL_FIELD(mem_buffer_owned) +REFL_FIELD( no_alloc) +REFL_FIELD( no_alloc_save) +REFL_FIELD( n_objects) +REFL_FIELD( objects_begin) +REFL_FIELD( objects_end) +REFL_FIELD( scratch) +REFL_FIELD( scratch_save) + +REFL_END + +REFL_TYPE(ggml_context_container) + REFL_FIELD(used) + REFL_FIELD(context) +REFL_END + + REFL_TYPE(ggml_numa_node) + REFL_FIELD(cpus) + REFL_FIELD(n_cpus) + REFL_END + + REFL_TYPE(ggml_numa_nodes) + REFL_FIELD(nodes) + REFL_FIELD(n_nodes) + REFL_END + + REFL_TYPE(ggml_state) + REFL_FIELD(contexts) + REFL_FIELD(numa) + REFL_END + + REFL_TYPE(gguf_str) + REFL_FIELD(n) + REFL_FIELD(data) + REFL_END + + REFL_TYPE(ggml_map_custom1_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) + REFL_END + +REFL_TYPE(ggml_map_custom2_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) +REFL_END + +REFL_TYPE(ggml_map_custom3_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) +REFL_END + +REFL_TYPE(hash_map) + REFL_FIELD(set) + REFL_FIELD(vals) +REFL_END +REFL_TYPE(ggml_compute_state_shared) + REFL_FIELD(cgraph) + REFL_FIELD(cplan) +REFL_END +REFL_TYPE(ggml_compute_state) + REFL_FIELD(thrd) + REFL_FIELD(ith) +REFL_END +REFL_TYPE(ggml_lbfgs_iteration_data) + REFL_FIELD(alpha) + REFL_FIELD(ys) +REFL_END + +REFL_TYPE(gguf_kv) + REFL_FIELD(key) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(gguf_header) + REFL_FIELD(magic) + REFL_FIELD(version) +REFL_END + +REFL_TYPE(gguf_tensor_info) + REFL_FIELD(name) + REFL_FIELD(n_dims) +REFL_END + +REFL_TYPE(gguf_context) + REFL_FIELD(header) + REFL_FIELD(kv) +REFL_END + +REFL_TYPE(gguf_buf) + REFL_FIELD(data) + REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_model_params) + REFL_FIELD(n_gpu_layers) +REFL_END +REFL_TYPE(llama_context_params) + REFL_FIELD(seed) +REFL_END +REFL_TYPE(llama_model_quantize_params) + REFL_FIELD(nthread) +REFL_END + +REFL_TYPE(llama_grammar_element) +REFL_END + +REFL_TYPE(llama_timings) + REFL_FIELD(t_start_ms) +REFL_END +REFL_TYPE(llama_beam_view) + REFL_FIELD(tokens) +REFL_END + +REFL_TYPE(llama_beams_state) + REFL_FIELD(beam_views) +REFL_END + +REFL_TYPE(ggml_backend) +REFL_END + +REFL_TYPE(ggml_backend_buffer) +REFL_END + +REFL_TYPE(ggml_allocr) +REFL_END + +REFL_TYPE(ggml_tallocr) +REFL_END + +REFL_TYPE(ggml_gallocr) +REFL_END + + +REFL_TYPE(llama_buffer) +REFL_FIELD(data) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_file) +REFL_FIELD(fp) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_mmap) +REFL_FIELD(addr) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_mlock) + REFL_FIELD(addr) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(llama_state) + REFL_FIELD(log_callback) + REFL_FIELD(log_callback_user_data) + REFL_END + + +REFL_TYPE(llama_hparams) + REFL_FIELD(vocab_only) + REFL_FIELD(n_vocab) + REFL_END + + +REFL_TYPE(llama_cparams) + REFL_FIELD(n_ctx) + REFL_FIELD(n_batch) +REFL_END + +REFL_TYPE(llama_layer) + REFL_FIELD(attn_norm) + REFL_FIELD(attn_norm_b) +REFL_END + +REFL_TYPE(llama_kv_cell) + REFL_FIELD(pos) + REFL_FIELD(delta) +REFL_END + +REFL_TYPE(llama_kv_cache) + REFL_FIELD(has_shift) + REFL_FIELD(head) + REFL_END + +REFL_TYPE(e_model) +REFL_END + +REFL_TYPE(llama_ftype) +REFL_END + +REFL_TYPE(llama_model) + REFL_FIELD(type) + REFL_FIELD(arch) +REFL_FIELD(ftype ) + +REFL_FIELD( name ) + + REFL_FIELD( hparams ) +REFL_FIELD( vocab) + +REFL_FIELD( tok_embd) +REFL_FIELD( pos_embd) +REFL_FIELD( 
tok_norm) +REFL_FIELD( tok_norm_b) + +REFL_FIELD( output_norm) +REFL_FIELD( output_norm_b) +REFL_FIELD( output) + +REFL_FIELD( layers) + +REFL_FIELD( n_gpu_layers) + + REFL_FIELD( gguf_kv) //unordered map + REFL_FIELD( ctx) + REFL_FIELD( buf) + REFL_FIELD( mapping) //std::unique_ptr +REFL_FIELD( mlock_buf) +REFL_FIELD( mlock_mmap) +REFL_FIELD( tensors_by_name) + REFL_FIELD( t_load_us) +REFL_FIELD( t_start_us) + +REFL_END + +REFL_TYPE(llama_vocab) + REFL_END + + REFL_TYPE(grammar_parser::parse_state) + REFL_END + +REFL_TYPE(llama_context) +REFL_FIELD( cparams) +//REFL_FIELD(model) +REFL_FIELD(kv_self) + REFL_FIELD(rng) //random numbers +REFL_FIELD(has_evaluated_once ) +REFL_FIELD( t_start_us) +REFL_FIELD( t_load_us) + REFL_FIELD( t_sample_us ) +REFL_FIELD( t_p_eval_us ) + REFL_FIELD( t_eval_us) +REFL_FIELD( n_sample ) +REFL_FIELD( n_p_eval ) + REFL_FIELD( n_eval ) +//REFL_FIELD( logits) crash +REFL_FIELD( logits_all ) +REFL_FIELD( embedding) +//REFL_FIELD( work_buffer) + REFL_FIELD( buf_compute) + REFL_FIELD( buf_alloc) +REFL_FIELD( alloc ) + +#ifdef GGML_USE_METAL +REFL_FIELD( ctx_metal ) +#endif + +#ifdef GGML_USE_MPI +REFL_FIELD( ctx_mpi ) + +#endif +REFL_END + +REFL_TYPE(llama_model_loader) + REFL_FIELD(n_kv) + REFL_FIELD(n_tensors) +REFL_END + +REFL_TYPE(llm_build_context) +// REFL_FIELD(model) cannot create pointer to reference member ‘llm_build_context::model’ +// REFL_FIELD(hparams) cannot create pointer to reference member ‘llm_build_context::hparams’ +REFL_END + +REFL_TYPE(llm_offload_trie) +REFL_END + +REFL_TYPE(llm_symbol) + REFL_FIELD(prev) +REFL_END + +REFL_TYPE(llm_bigram_spm) +REFL_END + +REFL_TYPE(llm_tokenizer_spm) +REFL_END + +REFL_TYPE(llm_bigram_bpe) +REFL_END + +REFL_TYPE(llm_tokenizer_bpe) +REFL_END + + +REFL_TYPE(fragment_buffer_variant) +REFL_END + + +REFL_TYPE(llama_partial_utf8) + REFL_FIELD(value) + REFL_FIELD(n_remain) +REFL_END + + +REFL_TYPE(llama_grammar) + REFL_FIELD(rules) + REFL_FIELD(stacks) +REFL_END + + +REFL_TYPE(llama_grammar_candidate) + REFL_FIELD(index) + REFL_FIELD(code_points) +REFL_END + + +REFL_TYPE(llama_beam) + REFL_FIELD(tokens) + REFL_FIELD(p) +REFL_END + + +REFL_TYPE(llama_logit_info) +// REFL_FIELD(logits) + REFL_FIELD(n_vocab) +REFL_END + +REFL_TYPE(llama_beam_search_data) + REFL_FIELD(ctx) + REFL_FIELD(n_beams) +REFL_END + + +REFL_TYPE(quantize_state_internal) +// REFL_FIELD(model) + REFL_FIELD(params) +REFL_FIELD( n_attention_wv ) +REFL_FIELD( n_feed_forward_w2 ) + REFL_FIELD( i_attention_wv ) + REFL_FIELD( i_feed_forward_w2 ) +REFL_FIELD( n_k_quantized ) +REFL_FIELD( n_fallback ) + +REFL_END + +REFL_TYPE(llama_data_context) +REFL_END + +REFL_TYPE(llama_data_buffer_context) + REFL_FIELD(ptr) +REFL_END + +REFL_TYPE(llama_data_file_context) + REFL_FIELD(file) +REFL_END + +template +constexpr auto get_value_type_name(const T t) noexcept +{ + return t.value_type; +} + +namespace runtime2 + { + using namespace refl; + using namespace refl::descriptor; + template + void debug(std::basic_ostream& os, const T& value, bool compact = false); + + namespace detail + { + template &>() << std::declval())> + std::true_type is_ostream_printable_test(int); + + template + std::false_type is_ostream_printable_test(...); + + template + constexpr bool is_ostream_printable_v{ decltype(is_ostream_printable_test(0))::value }; + + namespace + { + [[maybe_unused]] int next_depth(int depth) + { + return depth == -1 || depth > 8 + ? 
-1 + : depth + 1; + } + } + + template + void indent(std::basic_ostream& os, int depth) + { + for (int i = 0; i < depth; i++) { + os << " "; + } + } + + template + void debug_impl(std::basic_ostream& os, const T& value, [[maybe_unused]] int depth); + + template + void debug_detailed(std::basic_ostream& os, const T& value, int depth) + { + + using type_descriptor = type_descriptor; + bool compact = depth == -1; + // print type with members enclosed in braces + os << type_descriptor::name << " { "; + if (!compact) os << '\n'; + + constexpr auto readable_members = filter(type_descriptor::members, [](auto member) { return is_readable(member); }); + for_each(readable_members, [&](auto member, [[maybe_unused]] auto index) { + int new_depth = next_depth(depth); + + indent(os, new_depth); + os << get_display_name(member) << " = "; + + if constexpr (util::contains_instance(member.attributes)) { + // use the debug attribute to print + auto debug_attr = util::get_instance(member.attributes); + debug_attr.write(os, value); + } + else { + debug_impl(os, member(value), new_depth); + } + + if (!compact || index + 1 != readable_members.size) { + os << ", "; + } + if (!compact) { + indent(os, depth); + os << '\n'; + } + }); + + if (compact) os << ' '; + indent(os, depth); + os << '}'; + } + + template + void debug_reflectable(std::basic_ostream& os, const T& value, [[maybe_unused]] int depth) + { + using type_descriptor = type_descriptor; + if constexpr (trait::contains_instance_v) { + // use the debug attribute to print + auto debug_attr = util::get_instance(type_descriptor::attributes); + debug_attr.write(os, value); + } + else if constexpr (detail::is_ostream_printable_v) { + // type supports printing natively, just use that + + os << value; + + } + else { + debug_detailed(os, value, depth); + } + } + + template + void debug_container(std::basic_ostream& os, const T& value, int depth) + { + bool compact = depth == -1; + os << "["; + + auto end = value.end(); + for (auto it = value.begin(); it != end; ++it) + { + if (!compact) os << '\n'; + int new_depth = next_depth(depth); + indent(os, new_depth); + + debug_impl(os, *it, new_depth); + if (std::next(it, 1) != end) { + os << ", "; + } + else if (!compact) { + os << '\n'; + } + + } + + indent(os, depth); + os << "]"; + } + + template + void debug_impl(std::basic_ostream& os, const T& value, [[maybe_unused]] int depth) + { + using no_pointer_t = std::remove_pointer_t; + + if constexpr (std::is_same_v) { + os << (value ? "true" : "false"); + } + else if constexpr (std::is_pointer_v && !std::is_void_v && trait::is_reflectable_v) { + if (value == nullptr) { + os << "nullptr"; + } + else { + os << '&'; + debug_impl(os, *value, -1); + } + } + else if constexpr (trait::is_reflectable_v) { + debug_reflectable(os, value, depth); + } + else if constexpr (detail::is_ostream_printable_v) { + os << value; + } + else if constexpr (trait::is_container_v) { + debug_container(os, value, depth); + } + else { + os << "(not printable)"; + } + } + } + + /** + * Writes the debug representation of value to the given std::ostream. + * Calls the function specified by the debug attribute whenever possible, + * before falling back to recursively interating the members and printing them. + * Takes an optional arguments specifying whether to print a compact representation. + * The compact representation contains no newlines. 
+ */ + template + void debug(std::basic_ostream& os, const T& value, [[maybe_unused]] bool compact) + { + static_assert(trait::is_reflectable_v || trait::is_container_v || detail::is_ostream_printable_v, + "Type is not reflectable, not a container of reflectable types and does not support operator<<(std::ostream&, T)!"); + + detail::debug_impl(os, value, compact ? -1 : 0); + } + + /** + * Writes the compact debug representation of the provided values to the given std::ostream. + */ + template + void debug_all(std::basic_ostream& os, const Ts&... values) + { + refl::runtime::debug(os, std::forward_as_tuple(static_cast(values)...), true); + } + + /** + * Writes the debug representation of the provided value to an std::string and returns it. + * Takes an optional arguments specifying whether to print a compact representation. + * The compact representation contains no newlines. + */ + template + std::basic_string debug_str(const T& value, bool compact = false) + { + std::basic_stringstream ss; + debug(ss, value, compact); + return ss.str(); + } + + /** + * Writes the compact debug representation of the provided values to an std::string and returns it. + */ + template + std::basic_string debug_all_str(const Ts&... values) + { + return refl::runtime::debug_str(std::forward_as_tuple(static_cast(values)...), true); + } +} + +// // A generic function to print out the fields of any object +template +void print_fields(const T& t) { + runtime2::debug(std::cout, t); + constexpr auto type = refl::reflect(); + + constexpr auto membertype = refl::member_list(); + + constexpr auto members = get_members(type); + std::cout << "DEBUG Type: " << type.name.c_str() << "\n"; + std::cout << "DEBUG Type2: " << typeid(membertype).name() << "\n"; + std::cout << "DEBUG Type3: " << typeid(members).name() << "\n"; + refl::util::for_each(members, [&](auto member) { + //using member_t = decltype(member::value_type); + //typename type3 = member::value_type; + //typename trait::remove_qualifiers_t::value_type>; + //constexpr auto type2 = refl::reflect(type3); + //std::cout << "Auto:" << foo <<"\n"; + std::cout << "Auto:" << member.name <<"\n"; + //std::cout << "DEBUG Type2: " << typeid(member_t).name() << "\n"; + //std::cout << "DEBUG Type2: " << type2.name.c_str() << "\n"; + }); + std::cout << "\n"; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c8b4bc254f4c6..28f6254630010 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW llama_build_and_test_executable(test-rope.cpp) # dummy executable - not installed -get_filename_component(TEST_TARGET test-c.c NAME_WE) -add_executable(${TEST_TARGET} test-c.c) +get_filename_component(TEST_TARGET test-c.cpp NAME_WE) +add_executable(${TEST_TARGET} test-c.cpp) target_link_libraries(${TEST_TARGET} PRIVATE llama) diff --git a/tests/test-c.c b/tests/test-c.cpp similarity index 100% rename from tests/test-c.c rename to tests/test-c.cpp
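To illustrate how the reflection-based printing added in print.hpp could be exercised, here is a minimal standalone sketch. It assumes refl-cpp is on the include path and that print.hpp builds against the internal headers added above; the file name print_demo.cpp and the choice of ggml_context as the printed type are illustrative only and not part of the patch.

// print_demo.cpp -- illustrative sketch only, not part of the patch
#include <iostream>
#include "print.hpp"   // pulls in llama.h, ggml-internal.hpp, llama-internal.hpp and the REFL_* registrations

int main() {
    // Allocate a small ggml context; ggml_init() now also fills the runtime
    // type tables (type_traits_init, GGUF_TYPE_SIZE_init, GGUF_TYPE_NAME_init).
    struct ggml_init_params params = {
        /* mem_size   */ 16u * 1024u * 1024u,
        /* mem_buffer */ nullptr,
        /* no_alloc   */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // print_fields() walks the members registered with REFL_TYPE/REFL_FIELD
    // (mem_size, mem_buffer, n_objects, ...) and dumps them to stdout.
    print_fields(*ctx);

    ggml_free(ctx);
    return 0;
}

The same print_fields() call can be pointed at any other type registered in print.hpp (for example gpt_params), which is the point of routing all REFL_* registrations through a single header.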