diff --git a/.gitignore b/.gitignore index 41259a12f50cb..f8a2a2dae5902 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,6 @@ tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe +/#llama.cpp# +#* +\\#* diff --git a/CMakeLists.txt b/CMakeLists.txt index f32df5fe52335..7a9cc630a91f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,34 @@ cmake_minimum_required(VERSION 3.13) # for add_link_options project("llama.cpp" C CXX) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if (NOT MSVC) + set(cuda_flags -Wno-pedantic) +endif() +set(LLAMA_CUBLAS ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(LLAMA_CUDA_F16 ON) +set(LLAMA_ACCELERATE ON) +set(LLAMA_K_QUANTS ON) + +#-DLLAMA_NATIVE=off +set(LLAMA_AVX ON) +set(LLAMA_AVX2 OFF) +set(LLAMA_AVX512 OFF) +set(LLAMA_FMA OFF) +set(LLAMA_F16C OFF) +set(CMAKE_CUDA_FLAGS "--verbose") # +set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics +set(CUDACXX /usr/local/cuda-12.3/bin/nvcc) +set(CMAKE_CUDA_COMPILER /usr/local/cuda-12.3/bin/nvcc) +set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-12.3) +#GGML_USE_CUBLAS + +#set(CMAKE_EXE_LINKER_FLAGS -pg) +#set(CMAKE_SHARED_LINKER_FLAGS -pg) + +set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE) + if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") @@ -44,7 +70,7 @@ endif() # general option(LLAMA_STATIC "llama: static link libraries" OFF) -option(LLAMA_NATIVE "llama: enable -march=native flag" ON) +option(LLAMA_NATIVE "llama: enable -march=native flag" OFF) option(LLAMA_LTO "llama: enable link time optimization" OFF) # debug @@ -77,9 +103,9 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) -option(LLAMA_BLAS "llama: use BLAS" OFF) +option(LLAMA_BLAS "llama: use BLAS" ON) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") -option(LLAMA_CUBLAS "llama: use CUDA" OFF) +option(LLAMA_CUBLAS "llama: use CUDA" ON) #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) @@ -104,7 +130,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" # Compile flags # -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) @@ -230,7 +256,12 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) - add_compile_definitions(GGML_USE_OPENBLAS) + + # from https://github.com/NVIDIA/cutlass + make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp") + set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags}) + + # add_compile_definitions(GGML_USE_OPENBLAS) if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) add_compile_definitions(GGML_BLAS_USE_MKL) endif() @@ -272,6 +303,7 @@ if (LLAMA_CUBLAS) endif() add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) + if (DEFINED LLAMA_CUDA_DMMV_Y) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility endif() @@ -312,7 +344,7 @@ if (LLAMA_MPI) if 
(MPI_C_FOUND) message(STATUS "MPI found") set(GGML_HEADERS_MPI ggml-mpi.h) - set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h) + set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h) add_compile_definitions(GGML_USE_MPI) add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) if (NOT MSVC) @@ -390,14 +422,15 @@ endif() if (LLAMA_ALL_WARNINGS) if (NOT MSVC) - set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + # -Wpedantic + set(warning_flags -Wall -Wextra -Wcast-qual -Wno-unused-function) set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration) - set(cxx_flags -Wmissing-declarations -Wmissing-noreturn) + set(cxx_flags -Wmissing-declarations -Wmissing-noreturn -fpermissive) set(host_cxx_flags "") if (CMAKE_C_COMPILER_ID MATCHES "Clang") set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return) - set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi) + set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi -fpermissive) if ( (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR @@ -407,30 +440,27 @@ if (LLAMA_ALL_WARNINGS) endif() elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU") set(c_flags ${c_flags} -Wdouble-promotion) - set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds) + set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds -fpermissive) if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0) - set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation) + set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation -fpermissive) endif() if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0) - set(host_cxx_flags ${host_cxx_flags} -Wextra-semi) + set(host_cxx_flags ${host_cxx_flags} -Wextra-semi -fpermissive) endif() endif() else() # todo : msvc endif() - set(c_flags ${c_flags} ${warning_flags}) - set(cxx_flags ${cxx_flags} ${warning_flags}) + set(c_flags ${c_flags} -save-temps --verbose ${warning_flags}) + set(cxx_flags ${cxx_flags} -fpermissive -save-temps --verbose ${warning_flags}) add_compile_options("$<$:${c_flags}>" "$<$:${cxx_flags}>" "$<$:${host_cxx_flags}>") endif() -if (NOT MSVC) - set(cuda_flags -Wno-pedantic) -endif() set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags}) list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument @@ -438,6 +468,9 @@ if (NOT cuda_host_flags STREQUAL "") set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags}) endif() +# +set(cuda_flags --verbose -G ${cuda_flags}) + add_compile_options("$<$:${cuda_flags}>") if (WIN32) @@ -485,8 +518,10 @@ if (NOT MSVC) add_link_options(-static-libgcc -static-libstdc++) endif() endif() + add_link_options("-Wl,-Map=${TARGET}.map") + if (LLAMA_GPROF) - add_compile_options(-pg) + add_compile_options(-pg) endif() endif() @@ -645,13 +680,16 @@ if (GGML_USE_CPU_HBM) endif() add_library(ggml OBJECT - ggml.c + ggml.cpp ggml.h - ggml-alloc.c + print.hpp + ggml-internal.hpp + llama-internal.hpp + ggml-alloc.cpp ggml-alloc.h - ggml-backend.c + ggml-backend.cpp ggml-backend.h - ggml-quants.c + ggml-quants.cpp ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} @@ -683,7 +721,7 @@ add_library(llama ) target_include_directories(llama PUBLIC .) 
-target_compile_features(llama PUBLIC cxx_std_11) # don't bump +target_compile_features(llama PUBLIC cxx_std_20) # don't bump target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS} diff --git a/Makefile b/Makefile index a6d2c2ec0f380..e2d28e2ee6eb2 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,4 @@ + # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ @@ -116,7 +117,7 @@ endif # keep standard at C11 and C++11 MK_CPPFLAGS = -I. -Icommon MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++11 -fPIC +MK_CXXFLAGS = -std=c++17 -fPIC -fpermissive # -Ofast tends to produce faster code, but may not be available for some compilers. ifdef LLAMA_FAST @@ -502,7 +503,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h endif # LLAMA_METAL ifdef LLAMA_MPI -ggml-mpi.o: ggml-mpi.c ggml-mpi.h +ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI @@ -537,17 +538,17 @@ $(info ) # Build library # -ggml.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml.o: ggml.cpp ggml.h ggml-cuda.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h - $(CC) $(CFLAGS) -c $< -o $@ +ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h + $(CXX) $(CXXFLAGS) -c $< -o $@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o @@ -582,7 +583,7 @@ clean: # Examples # -main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @@ -678,6 +679,9 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh build-info.o: common/build-info.cpp $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ +#print.o: print.cpp # print.hpp +# $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ + # # Tests # @@ -734,5 +738,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-c.o: tests/test-c.c llama.h - $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ +tests/test-c.o: tests/test-c.cpp llama.h + $(CXX) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ +tt: + clang++ -std=c++17 ggml.cpp diff --git a/README.md b/README.md index e14886737121b..d3b92a75a7373 100644 --- a/README.md +++ b/README.md @@ -696,7 +696,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). -For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. 
Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. +For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. ### Instruction mode with Alpaca diff --git a/README.org b/README.org new file mode 100644 index 0000000000000..2b80ae5f9dc8f --- /dev/null +++ b/README.org @@ -0,0 +1,1097 @@ +This readme shows how to run Mistral with llama.cpp and use the CUDA profiler nsys to collect data. + +#+begin_src sh :results verbatim :exports both + /home/mdupont/2023/11/07/nvidia-cuda-toolkit-11.5.1/amd64/cuda_cuobjdump/bin/cuobjdump --dump-ptx ./build/bin/main > ./build/bin/main.ptx +#+end_src + + Now, to run llama.cpp with a model downloaded from ollama, we can do it like this + +#+begin_src sh :results verbatim :exports both + sudo /opt/nvidia/nsight-systems/2023.2.3/bin/nsys profile --show-output=true --trace=cuda,nvtx,cublas,cublas-verbose,cusparse,cusparse-verbose,mpi,oshmem,ucx,osrt,cudnn,opengl,opengl-annotations,openacc,openmp,nvvideo --sample=process-tree --cudabacktrace=all ./build/bin/main -m ~/.ollama/models/blobs/sha256:6ae28029995007a3ee8d0b8556d50f3b59b831074cf19c84de87acf51fb54054 -f prompt.org +#+end_src + +#+RESULTS: +#+begin_example +This readme is showing how to use mistral using llama.cpp and cuda profiling nsys to collect data. + +,#+begin_src sh :results verbatim :exports both + /home/mdupont/2023/11/07/nvidia-cuda-toolkit-11.5.1/amd64/cuda_cuobjdump/bin/cuobjdump --dump-ptx ./build/bin/main > ./build/bin/main.ptx +#end_example + + Now to run llama.cpp with model downloaded from ollama we can do it like this + +,#+begin_src sh :results verbatim :exports both + sudo /opt/nvidia/nsight-systems/2023.2.3/bin/nsys profile --show-output=true --trace=cuda,nvtx,cublas,cublas-verbose,cusparse,cusparse-verbose,mpi,oshmem,ucx,osrt,cudnn,opengl,opengl-annotations,openacc,openmp,nvvideo --sample=process-tree --cudabacktrace=all ./build/bin/main -m ~/.ollama/models/blobs/sha256:6ae28029995007a3ee8d0b8556d50f3b59b831074cf19c84de87acf51fb54054 -f README.org +,#+end_src + + Here we can see the data collected by nsys: + + ,#+begin_example data + ===nsys=== + ====/path/to/bin/main=== + + ===Profile Summary===== + Total Samples = 30956 + Sample Rate = 16.102757 Hz + + CPU Samples: + Instructions Executed = 6469108233 + Flops Executed = 6145482438.736761 + Floats Executed = 20133734308.689648 + Memory Accesses = 309559 + Register Accesses = 102771 + Branch Taken = 149 + Branch Missed = 378 + Static Branchs Executed = 17 + Dynamic Branchs Executed = 5 + GPU Samples: + Instructions Executed = 163111268848 + Flops Executed = 15056925654.22184 + Floats Executed = 20133734308.689648 + Memory Accesses = 172190 + Register Accesses = 43252 + Branch Taken = 29 + Branch Missed = 393 + Static Branchs Executed = 2 + Dynamic Branchs Executed = 6 + ===Profile Details===== + ====/path/to/bin/main=== + ====Total Samples===== + Instructions Executed = 179422513688 + Flops Executed = 30190359948.90951 + Floats Executed = 20133734308.689648 + Memory Accesses = 481749 + Register Accesses = 146023 + Branch Taken = 162 + Branch Missed = 415 + Static Branchs 
Executed = 17 + Dynamic Branchs Executed = 5 + ====Instruction Details===== + + ====Memory Access Details===== + + ====Register Access Details===== + + ====Branching Details===== + + ====/path/to/bin/main=== + ====Function Calls===== + Function Name | Samples | Flops Executed + + ====Function Returns===== + Function Name | Samples | Flops Executed + + ====Code Coverage===== + + ====Heap Usage===== + + ====Stack Usage===== + +#include +#include +#include "gtest/gtest.h" +using namespace testing; +class TestMyCode : public Test { +protected: + // Set up any needed data or environment variables before each test case. +}; +TEST_F(TestMyCode, TestCase1) { + // Test code for TestCase1 goes here. +} +TEST_F(TestMyCode, TestCase2) { + // Test code for TestCase2 goes here. +} +int main() { + InitGoogleTest(); + RunAllTests(new MySuite()); + CleanUpGoogleTest(); + return EXIT_SUCCESS; +}Generating '/tmp/nsys-report-d862.qdstrm' + [1/1] [0% ] report7.nsys-rep [1/1] [0% ] report7.nsys-rep [1/1] [===========50% ] report7.nsys-rep [1/1] [========================100%] report7.nsys-rep [1/1] [0% ] report7.nsys-rep [1/1] [5% ] report7.nsys-rep [1/1] [7% ] report7.nsys-rep [1/1] [9% ] report7.nsys-rep [1/1] [10% ] report7.nsys-rep [1/1] [12% ] report7.nsys-rep [1/1] [14% ] report7.nsys-rep [1/1] [=15% ] report7.nsys-rep [1/1] [=17% ] report7.nsys-rep [1/1] [==19% ] report7.nsys-rep [1/1] [==21% ] report7.nsys-rep [1/1] [===22% ] report7.nsys-rep [1/1] [===24% ] report7.nsys-rep [1/1] [====26% ] report7.nsys-rep [1/1] [====27% ] report7.nsys-rep [1/1] [=====29% ] report7.nsys-rep [1/1] [=====31% ] report7.nsys-rep [1/1] [=====32% ] report7.nsys-rep [1/1] [======34% ] report7.nsys-rep [1/1] [=======36% ] report7.nsys-rep [1/1] [=======37% ] report7.nsys-rep [1/1] [=======39% ] report7.nsys-rep [1/1] [========41% ] report7.nsys-rep [1/1] [========42% ] report7.nsys-rep [1/1] [=========44% ] report7.nsys-rep [1/1] [=========45% ] report7.nsys-rep [1/1] [==========47% ] report7.nsys-rep [1/1] [==========48% ] report7.nsys-rep [1/1] [==========49% ] report7.nsys-rep [1/1] [===========50% ] report7.nsys-rep [1/1] [===========51% ] report7.nsys-rep [1/1] [===========52% ] report7.nsys-rep [1/1] [===========53% ] report7.nsys-rep [1/1] [============54% ] report7.nsys-rep [1/1] [============55% ] report7.nsys-rep [1/1] [============56% ] report7.nsys-rep [1/1] [============57% ] report7.nsys-rep [1/1] [=============58% ] report7.nsys-rep [1/1] [=============59% ] report7.nsys-rep [1/1] [=============60% ] report7.nsys-rep [1/1] [==============61% ] report7.nsys-rep [1/1] [==============62% ] report7.nsys-rep [1/1] [==============63% ] report7.nsys-rep [1/1] [==============64% ] report7.nsys-rep [1/1] [===============65% ] report7.nsys-rep [1/1] [===============66% ] report7.nsys-rep [1/1] [===============67% ] report7.nsys-rep [1/1] [================68% ] report7.nsys-rep [1/1] [================69% ] report7.nsys-rep [1/1] [================70% ] report7.nsys-rep [1/1] [================71% ] report7.nsys-rep [1/1] [=================72% ] report7.nsys-rep [1/1] [=================73% ] report7.nsys-rep [1/1] [=================74% ] report7.nsys-rep [1/1] [==================75% ] report7.nsys-rep [1/1] [==================76% ] report7.nsys-rep [1/1] [==================77% ] report7.nsys-rep [1/1] [==================78% ] report7.nsys-rep [1/1] [===================79% ] report7.nsys-rep [1/1] [===================80% ] report7.nsys-rep [1/1] [===================81% ] report7.nsys-rep [1/1] [===================82% ] 
report7.nsys-rep [1/1] [====================83% ] report7.nsys-rep [1/1] [====================84% ] report7.nsys-rep [1/1] [====================85% ] report7.nsys-rep [1/1] [=====================86% ] report7.nsys-rep [1/1] [=====================87% ] report7.nsys-rep [1/1] [=====================88% ] report7.nsys-rep [1/1] [=====================89% ] report7.nsys-rep [1/1] [======================90% ] report7.nsys-rep [1/1] [======================91% ] report7.nsys-rep [1/1] [======================92% ] report7.nsys-rep [1/1] [=======================93% ] report7.nsys-rep [1/1] [=======================94% ] report7.nsys-rep [1/1] [=======================95% ] report7.nsys-rep [1/1] [=======================96% ] report7.nsys-rep [1/1] [========================97% ] report7.nsys-rep [1/1] [========================98% ] report7.nsys-rep [1/1] [========================99% ] report7.nsys-rep [1/1] [========================100%] report7.nsys-rep [1/1] [========================100%] report7.nsys-rep +Generated: + /mnt/data1/2023/11/09/llama.cpp/report7.nsys-rep +#+end_example +Log start +main: build = 1503 (5519834) +main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu +main: seed = 1699536977 +ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no +ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes +ggml_init_cublas: found 1 CUDA devices: + Device 0: NVIDIA GeForce RTX 3080 Ti, compute capability 8.6 +llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/mdupont/.ollama/models/blobs/sha256:6ae28029995007a3ee8d0b8556d50f3b59b831074cf19c84de87acf51fb54054 (version GGUF V2) +llama_model_loader: - tensor 0: token_embd.weight q4_0 [ 4096, 32000, 1, 1 ] +llama_model_loader: - tensor 1: blk.0.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 2: blk.0.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 3: blk.0.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 4: blk.0.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 7: blk.0.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 10: blk.1.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 11: blk.1.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 12: blk.1.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 13: blk.1.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 14: blk.1.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 15: blk.1.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 16: blk.1.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 19: blk.2.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 20: blk.2.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 21: blk.2.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 22: blk.2.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 23: blk.2.ffn_gate.weight q4_0 [ 4096, 
14336, 1, 1 ] +llama_model_loader: - tensor 24: blk.2.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 25: blk.2.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 28: blk.3.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 29: blk.3.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 30: blk.3.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 31: blk.3.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 32: blk.3.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 33: blk.3.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 34: blk.3.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 37: blk.4.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 38: blk.4.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 39: blk.4.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 40: blk.4.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 41: blk.4.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 42: blk.4.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 43: blk.4.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 46: blk.5.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 47: blk.5.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 48: blk.5.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 49: blk.5.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 50: blk.5.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 51: blk.5.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 52: blk.5.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 55: blk.6.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 56: blk.6.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 57: blk.6.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 58: blk.6.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 59: blk.6.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 60: blk.6.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 61: blk.6.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 64: blk.7.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 65: blk.7.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 66: blk.7.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 67: blk.7.attn_output.weight q4_0 
[ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 68: blk.7.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 69: blk.7.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 70: blk.7.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 73: blk.8.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 74: blk.8.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 75: blk.8.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 76: blk.8.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 77: blk.8.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 78: blk.8.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 79: blk.8.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 82: blk.9.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 83: blk.9.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 84: blk.9.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 85: blk.9.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 86: blk.9.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 87: blk.9.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 88: blk.9.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 91: blk.10.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 92: blk.10.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 93: blk.10.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 94: blk.10.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 95: blk.10.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 96: blk.10.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 97: blk.10.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 100: blk.11.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 101: blk.11.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 102: blk.11.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 103: blk.11.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 104: blk.11.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 105: blk.11.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 106: blk.11.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 109: blk.12.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 110: blk.12.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - 
tensor 111: blk.12.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 112: blk.12.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 113: blk.12.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 114: blk.12.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 115: blk.12.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 118: blk.13.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 119: blk.13.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 120: blk.13.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 121: blk.13.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 122: blk.13.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 123: blk.13.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 124: blk.13.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 127: blk.14.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 128: blk.14.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 129: blk.14.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 130: blk.14.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 131: blk.14.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 132: blk.14.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 133: blk.14.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 136: blk.15.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 137: blk.15.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 138: blk.15.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 139: blk.15.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 140: blk.15.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 141: blk.15.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 142: blk.15.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 145: blk.16.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 146: blk.16.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 147: blk.16.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 148: blk.16.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 149: blk.16.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 150: blk.16.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 151: blk.16.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] 
+llama_model_loader: - tensor 154: blk.17.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 155: blk.17.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 156: blk.17.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 157: blk.17.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 158: blk.17.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 159: blk.17.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 160: blk.17.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 163: blk.18.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 164: blk.18.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 165: blk.18.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 166: blk.18.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 167: blk.18.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 168: blk.18.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 169: blk.18.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 172: blk.19.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 173: blk.19.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 174: blk.19.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 175: blk.19.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 176: blk.19.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 177: blk.19.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 178: blk.19.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 181: blk.20.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 182: blk.20.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 183: blk.20.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 184: blk.20.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 185: blk.20.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 186: blk.20.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 187: blk.20.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 190: blk.21.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 191: blk.21.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 192: blk.21.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 193: blk.21.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 194: blk.21.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 195: blk.21.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 196: blk.21.ffn_down.weight q4_0 [ 
14336, 4096, 1, 1 ] +llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 199: blk.22.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 200: blk.22.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 201: blk.22.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 202: blk.22.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 203: blk.22.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 204: blk.22.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 205: blk.22.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 208: blk.23.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 209: blk.23.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 210: blk.23.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 211: blk.23.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 212: blk.23.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 213: blk.23.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 214: blk.23.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 217: blk.24.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 218: blk.24.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 219: blk.24.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 220: blk.24.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 221: blk.24.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 222: blk.24.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 223: blk.24.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 226: blk.25.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 227: blk.25.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 228: blk.25.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 229: blk.25.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 230: blk.25.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 231: blk.25.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 232: blk.25.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 235: blk.26.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 236: blk.26.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 237: blk.26.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 238: blk.26.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 239: 
blk.26.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 240: blk.26.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 241: blk.26.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 244: blk.27.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 245: blk.27.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 246: blk.27.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 247: blk.27.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 248: blk.27.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 249: blk.27.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 250: blk.27.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 253: blk.28.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 254: blk.28.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 255: blk.28.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 256: blk.28.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 257: blk.28.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 258: blk.28.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 259: blk.28.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 262: blk.29.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 263: blk.29.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 264: blk.29.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 265: blk.29.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 266: blk.29.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 267: blk.29.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 268: blk.29.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 271: blk.30.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 272: blk.30.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 273: blk.30.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 275: blk.30.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 276: blk.30.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 277: blk.30.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 280: blk.31.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 281: blk.31.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - 
tensor 282: blk.31.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 283: blk.31.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 284: blk.31.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 285: blk.31.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 286: blk.31.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 290: output.weight q6_K [ 4096, 32000, 1, 1 ] +llama_model_loader: - kv 0: general.architecture str +llama_model_loader: - kv 1: general.name str +llama_model_loader: - kv 2: llama.context_length u32 +llama_model_loader: - kv 3: llama.embedding_length u32 +llama_model_loader: - kv 4: llama.block_count u32 +llama_model_loader: - kv 5: llama.feed_forward_length u32 +llama_model_loader: - kv 6: llama.rope.dimension_count u32 +llama_model_loader: - kv 7: llama.attention.head_count u32 +llama_model_loader: - kv 8: llama.attention.head_count_kv u32 +llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 +llama_model_loader: - kv 10: llama.rope.freq_base f32 +llama_model_loader: - kv 11: general.file_type u32 +llama_model_loader: - kv 12: tokenizer.ggml.model str +llama_model_loader: - kv 13: tokenizer.ggml.tokens arr +llama_model_loader: - kv 14: tokenizer.ggml.scores arr +llama_model_loader: - kv 15: tokenizer.ggml.token_type arr +llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 +llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 +llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 +llama_model_loader: - kv 19: general.quantization_version u32 +llama_model_loader: - type f32: 65 tensors +llama_model_loader: - type q4_0: 225 tensors +llama_model_loader: - type q6_K: 1 tensors +llm_load_vocab: special tokens definition check successful ( 259/32000 ). 
+llm_load_print_meta: format = GGUF V2 +llm_load_print_meta: arch = llama +llm_load_print_meta: vocab type = SPM +llm_load_print_meta: n_vocab = 32000 +llm_load_print_meta: n_merges = 0 +llm_load_print_meta: n_ctx_train = 32768 +llm_load_print_meta: n_embd = 4096 +llm_load_print_meta: n_head = 32 +llm_load_print_meta: n_head_kv = 8 +llm_load_print_meta: n_layer = 32 +llm_load_print_meta: n_rot = 128 +llm_load_print_meta: n_gqa = 4 +llm_load_print_meta: f_norm_eps = 0.0e+00 +llm_load_print_meta: f_norm_rms_eps = 1.0e-05 +llm_load_print_meta: f_clamp_kqv = 0.0e+00 +llm_load_print_meta: f_max_alibi_bias = 0.0e+00 +llm_load_print_meta: n_ff = 14336 +llm_load_print_meta: rope scaling = linear +llm_load_print_meta: freq_base_train = 10000.0 +llm_load_print_meta: freq_scale_train = 1 +llm_load_print_meta: n_yarn_orig_ctx = 32768 +llm_load_print_meta: rope_finetuned = unknown +llm_load_print_meta: model type = 7B +llm_load_print_meta: model ftype = mostly Q4_0 +llm_load_print_meta: model params = 7.24 B +llm_load_print_meta: model size = 3.83 GiB (4.54 BPW) +llm_load_print_meta: general.name = mistralai +llm_load_print_meta: BOS token = 1 '' +llm_load_print_meta: EOS token = 2 '' +llm_load_print_meta: UNK token = 0 '' +llm_load_print_meta: LF token = 13 '<0x0A>' +llm_load_tensors: ggml ctx size = 0.11 MB +llm_load_tensors: using CUDA for GPU acceleration +llm_load_tensors: mem required = 3917.97 MB +llm_load_tensors: offloading 0 repeating layers to GPU +llm_load_tensors: offloaded 0/35 layers to GPU +llm_load_tensors: VRAM used: 0.00 MB +.................................................................................................. +llama_new_context_with_model: n_ctx = 512 +llama_new_context_with_model: freq_base = 10000.0 +llama_new_context_with_model: freq_scale = 1 +llama_new_context_with_model: kv self size = 64.00 MB +llama_build_graph: non-view tensors processed: 740/740 +llama_new_context_with_model: compute buffer total size = 79.63 MB +llama_new_context_with_model: VRAM scratch buffer: 73.00 MB +llama_new_context_with_model: total VRAM used: 73.00 MB (model: 0.00 MB, context: 73.00 MB) + +system_info: n_threads = 12 / 24 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | +sampling: + repeat_last_n = 64, repeat_penalty = 1.100, frequency_penalty = 0.000, presence_penalty = 0.000 + top_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +generate: n_ctx = 512, n_batch = 512, n_predict = -1, n_keep = 0 + + + [end of text] + +llama_print_timings: load time = 245.80 ms +llama_print_timings: sample time = 6.71 ms / 52 runs ( 0.13 ms per token, 7748.47 tokens per second) +llama_print_timings: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_print_timings: eval time = 5098.77 ms / 52 runs ( 98.05 ms per token, 10.20 tokens per second) +llama_print_timings: total time = 5161.43 ms +Log end +[ Babel evaluation exited with code 0 ] + + +#+begin_src sh :results verbatim :exports both + /opt/nvidia/nsight-systems/2023.2.3/bin/nsys stats report7.nsys-rep +#+end_src + +#+RESULTS: +#+begin_example +Generating SQLite file report7.sqlite from report7.nsys-rep +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/nvtx_sum.py]... 
+ + ,** NVTX Range Summary (nvtx_sum): + + Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Style Range + -------- --------------- --------- ----------- ----------- --------- ---------- ----------- ------- ------------------------- + 71.3 91,261,248 2,048 44,561.2 34,700.0 33,179 17,628,931 388,774.9 PushPop cuBLAS:cublasSgemm_v2 + 21.8 27,939,877 225 124,177.2 53,143.0 27,935 15,965,566 1,060,852.9 PushPop cuBLAS:cublasGemmEx + 6.3 8,036,669 1 8,036,669.0 8,036,669.0 8,036,669 8,036,669 0.0 PushPop cuBLAS:cublasCreate_v2 + 0.6 742,488 2,273 326.7 221.0 150 18,693 509.1 PushPop cuBLAS:cublasSetStream_v2 + 0.0 7,419 2 3,709.5 3,709.5 142 7,277 5,045.2 PushPop cuBLAS:cublasGetProperty + 0.0 207 1 207.0 207.0 207 207 0.0 PushPop cuBLAS:cublasSetMathMode + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/osrt_sum.py]... + + ,** OS Runtime Summary (osrt_sum): + + Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + -------- --------------- --------- ---------------- ---------------- -------------- -------------- ------------ ---------------------- + 49.8 98,748,705,227 995 99,244,929.9 100,207,029.0 3,076 145,062,709 9,535,006.2 poll + 38.9 77,113,391,701 1 77,113,391,701.0 77,113,391,701.0 77,113,391,701 77,113,391,701 0.0 pthread_cond_wait + 10.8 21,505,984,622 43 500,139,177.3 500,139,962.0 500,071,147 500,199,879 31,487.9 pthread_cond_timedwait + 0.2 408,111,147 5,966 68,406.2 1,002.5 19 66,331,209 1,803,864.3 fflush + 0.2 371,330,137 585 634,752.4 4,055.0 202 106,687,209 7,290,173.5 ioctl + 0.1 100,181,277 29 3,454,526.8 6,438.0 1,135 93,195,838 17,278,903.4 mmap + 0.0 58,243,121 12 4,853,593.4 8,691.5 2,231 58,158,033 16,786,545.6 munmap + 0.0 2,653,253 4 663,313.3 354,810.5 157 1,943,475 915,833.7 fwrite + 0.0 2,281,929 66,070 34.5 22.0 21 648,878 2,531.0 fread + 0.0 831,597 27 30,799.9 6,749.0 3,478 474,236 89,505.1 mmap64 + 0.0 599,699 9 66,633.2 38,958.0 4,556 206,867 71,500.9 sem_timedwait + 0.0 235,180 37 6,356.2 1,564.0 689 114,711 18,945.1 fopen + 0.0 134,278 466 288.2 217.0 155 10,542 532.5 fputs + 0.0 132,740 3 44,246.7 45,080.0 41,640 46,020 2,305.8 pthread_create + 0.0 88,594 44 2,013.5 1,668.5 861 3,993 920.3 open64 + 0.0 26,380 29 909.7 524.0 385 3,325 826.9 fclose + 0.0 21,411 56 382.3 24.0 22 20,033 2,673.7 fgets + 0.0 16,310 62 263.1 120.0 80 2,821 481.5 fcntl + 0.0 15,596 16 974.8 764.0 145 5,352 1,249.5 read + 0.0 12,287 6 2,047.8 1,692.5 618 4,230 1,338.0 open + 0.0 9,178 11 834.4 570.0 301 1,485 475.1 write + 0.0 7,860 2 3,930.0 3,930.0 2,653 5,207 1,806.0 socket + 0.0 7,589 3 2,529.7 2,328.0 775 4,486 1,863.7 pipe2 + 0.0 6,039 1 6,039.0 6,039.0 6,039 6,039 0.0 connect + 0.0 4,874 2 2,437.0 2,437.0 1,626 3,248 1,146.9 fopen64 + 0.0 1,674 1 1,674.0 1,674.0 1,674 1,674 0.0 pthread_cond_signal + 0.0 1,026 7 146.6 164.0 89 212 53.8 dup + 0.0 871 1 871.0 871.0 871 871 0.0 bind + 0.0 415 1 415.0 415.0 415 415 0.0 listen + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_api_sum.py]... 
+ + ,** CUDA API Summary (cuda_api_sum): + + Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + -------- --------------- --------- ------------ ----------- --------- ------------- ------------ --------------------------------------------- + 33.3 3,915,363,238 289 13,547,969.7 9,484,112.0 19,820 32,587,408 13,784,976.3 cudaDeviceSynchronize + 33.3 3,915,338,614 289 13,547,884.5 9,484,033.0 19,749 32,587,319 13,784,970.8 cudaDeviceSynchronize + 11.0 1,289,319,560 7,108 181,389.9 4,874.0 1,971 1,248,737,939 14,811,400.1 cudaLaunchKernel + 10.9 1,288,680,251 7,108 181,300.0 4,784.0 1,922 1,248,737,696 14,811,398.3 cudaLaunchKernel + 4.3 504,516,347 3,747 134,645.4 4,250.0 2,925 11,642,362 664,161.4 cudaMemcpyAsync + 4.3 504,111,303 3,747 134,537.3 4,161.0 2,862 11,641,970 664,125.5 cudaMemcpyAsync + 2.0 237,836,979 8 29,729,622.4 1,076.0 972 237,827,936 84,084,416.4 cudaStreamCreateWithFlags + 0.2 24,762,935 4 6,190,733.8 5,975,786.0 463,322 12,348,041 6,245,573.4 cudaMallocHost + 0.2 24,762,567 4 6,190,641.8 5,975,703.0 463,182 12,347,979 6,245,578.8 cudaMallocHost + 0.1 9,415,273 8 1,176,909.1 147,189.5 1,509 4,594,906 1,935,033.5 cudaFreeHost + 0.1 9,410,395 8 1,176,299.4 146,459.0 1,278 4,592,920 1,934,725.0 cudaFreeHost + 0.1 7,195,101 2 3,597,550.5 3,597,550.5 1,072,705 6,122,396 3,570,670.7 cudaFree + 0.1 7,194,827 2 3,597,413.5 3,597,413.5 1,072,563 6,122,264 3,570,677.8 cudaFree + 0.1 7,147,578 1,536 4,653.4 4,177.0 3,552 58,008 2,635.3 cudaMemcpy2DAsync + 0.1 6,938,748 1,536 4,517.4 4,042.0 3,425 57,847 2,634.2 cudaMemcpy2DAsync + 0.0 4,765,427 13,477 353.6 256.0 150 7,184 215.8 cudaStreamGetCaptureInfo_v2_v11030 + 0.0 2,473,305 17 145,488.5 72,327.0 2,246 539,857 166,286.6 cudaMalloc + 0.0 2,470,534 17 145,325.5 72,203.0 2,181 539,649 166,184.6 cudaMalloc + 0.0 2,469,464 2,273 1,086.4 946.0 841 4,801 417.9 cudaEventRecord + 0.0 2,304,122 2,273 1,013.7 873.0 771 4,723 417.2 cudaEventRecord + 0.0 1,179,270 161 7,324.7 7,423.0 5,556 11,078 902.4 cudaMemsetAsync + 0.0 1,157,594 161 7,190.0 7,289.0 5,437 10,922 896.7 cudaMemsetAsync + 0.0 363,729 166 2,191.1 2,186.0 730 6,634 535.8 cudaOccupancyMaxActiveBlocksPerMultiprocessor + 0.0 93,899 766 122.6 102.0 63 553 63.3 cuGetProcAddress_v2 + 0.0 30,972 1 30,972.0 30,972.0 30,972 30,972 0.0 cudaGetDeviceProperties_v2_v12000 + 0.0 9,674 18 537.4 224.0 203 4,209 947.6 cudaEventCreateWithFlags + 0.0 6,163 2 3,081.5 3,081.5 2,878 3,285 287.8 cudaEventQuery + 0.0 5,973 2 2,986.5 2,986.5 2,776 3,197 297.7 cudaEventQuery + 0.0 1,239 3 413.0 152.0 76 1,011 519.3 cuModuleGetLoadingMode + 0.0 1,162 2 581.0 581.0 400 762 256.0 cudaGetDriverEntryPoint_v11030 + 0.0 960 2 480.0 480.0 360 600 169.7 cuInit + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 
+ + ,** CUDA GPU Kernel Summary (cuda_gpu_kern_sum): + + Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + -------- --------------- --------- ------------ ----------- --------- ---------- ------------ ---------------------------------------------------------------------------------------------------- + 94.3 3,661,170,403 224 16,344,510.7 8,861,904.0 2,199,256 30,836,845 12,771,357.3 void dequantize_block<(int)32, (int)2, &dequantize_q4_0, __half>(const void *, T4 *, int) + 2.7 103,018,305 225 457,859.1 346,527.0 333,855 1,230,427 271,927.9 void dequantize_block<(int)1, (int)1, &convert_f32, __half>(const void *, T4 *, int) + 1.1 44,414,363 161 275,865.6 345,439.0 110,432 804,285 138,253.6 ampere_h16816gemm_256x128_ldg8_stages_32x3_tn + 1.1 43,348,510 2,273 19,071.1 6,944.0 6,784 619,070 49,609.4 void dequantize_block<(int)1, (int)1, &convert_f16, float>(const void *, T4 *, int) + 0.4 16,973,438 2,048 8,287.8 8,671.5 7,360 10,304 693.3 void cutlass::Kernel(T1::Params) + 0.1 5,584,460 1 5,584,460.0 5,584,460.0 5,584,460 5,584,460 0.0 void dequantize_block_q6_K<__half>(const void *, T1 *) + 0.1 4,481,001 2,048 2,188.0 2,271.5 1,663 3,360 484.2 void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, float, float, float, (bool)1, (boo… + 0.1 1,946,648 64 30,416.4 30,176.0 29,664 34,720 977.1 ampere_h16816gemm_128x128_ldg8_stages_64x3_tn + 0.0 340,796 64 5,324.9 5,312.0 5,184 6,048 162.5 void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __half, __half, __half, __half, (bool)1, … + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_mem_time_sum.py]... + + ,** CUDA GPU MemOps Summary (by Time) (cuda_gpu_mem_time_sum): + + Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation + -------- --------------- ----- --------- -------- -------- ---------- ----------- ------------------ + 82.7 538,012,483 3,010 178,741.7 13,488.0 5,120 11,313,305 646,615.9 [CUDA memcpy HtoD] + 17.2 112,106,788 2,273 49,321.1 22,495.0 7,999 1,823,129 143,689.5 [CUDA memcpy DtoH] + 0.0 66,112 161 410.6 384.0 352 1,152 82.8 [CUDA memset] + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_mem_size_sum.py]... + + ,** CUDA GPU MemOps Summary (by Size) (cuda_gpu_mem_size_sum): + + Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation + ---------- ----- -------- -------- -------- -------- ----------- ------------------ + 6,729.069 3,010 2.236 0.192 0.096 107.520 6.567 [CUDA memcpy HtoD] + 2,884.992 2,273 1.269 0.562 0.192 48.000 3.775 [CUDA memcpy DtoH] + 0.063 161 0.000 0.000 0.000 0.002 0.000 [CUDA memset] + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/openmp_sum.py]... +SKIPPED: report7.sqlite does not contain OpenMP event data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/opengl_khr_range_sum.py]... +SKIPPED: report7.sqlite does not contain KHR Extension (KHR_DEBUG) data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/opengl_khr_gpu_range_sum.py]... +SKIPPED: report7.sqlite does not contain GPU KHR Extension (KHR_DEBUG) data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/vulkan_marker_sum.py]... +SKIPPED: report7.sqlite does not contain Vulkan Debug Extension (Vulkan Debug Util) data. 
+ +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/vulkan_gpu_marker_sum.py]... +SKIPPED: report7.sqlite does not contain GPU Vulkan Debug Extension (GPU Vulkan Debug markers) data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/dx11_pix_sum.py]... +SKIPPED: report7.sqlite does not contain DX11 CPU debug markers. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/dx12_gpu_marker_sum.py]... +SKIPPED: report7.sqlite does not contain DX12 GPU debug markers. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/dx12_pix_sum.py]... +SKIPPED: report7.sqlite does not contain DX12 CPU debug markers. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/wddm_queue_sum.py]... +SKIPPED: report7.sqlite does not contain WDDM context data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/um_sum.py]... +SKIPPED: report7.sqlite does not contain CUDA Unified Memory CPU page faults data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/um_total_sum.py]... +SKIPPED: report7.sqlite does not contain CUDA Unified Memory CPU page faults data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/um_cpu_page_faults_sum.py]... +SKIPPED: report7.sqlite does not contain CUDA Unified Memory CPU page faults data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/openacc_sum.py]... +SKIPPED: report7.sqlite does not contain OpenACC event data. + +#+end_example + +#+begin_src sh + /opt/nvidia/nsight-systems/2023.2.3/bin/nsys export -t json report7.nsys-rep +#+end_src + +#+RESULTS: + +#+begin_src sh + /opt/nvidia/nsight-systems/2023.2.3/bin/nsys export -t hdf report7.nsys-rep + /opt/nvidia/nsight-systems/2023.2.3/bin/nsys export -t json report7.nsys-rep + # jq . 
./report12.json > report12.jq +#+end_src + +#+RESULTS: + + +#+begin_src sh :results verbatim :exports both +python ./reporthd5_callchains.py ./report7.h5 +#+end_src + +#+RESULTS: +#+begin_example +./report2.h5 +./report2.h5 +('0x7f70ac50663f|721|MOD:321/opt/nvidia/nsight-systems/2023.2.3/target-linux-x64/libcupti.so.12.2|DEP:0', 17) +('0x7f70ac508958|717|MOD:321/opt/nvidia/nsight-systems/2023.2.3/target-linux-x64/libcupti.so.12.2|DEP:1', 17) +('0x7f70af680966|722|MOD:235/usr/lib/x86_64-linux-gnu/libcuda.so.545.23.06|DEP:2', 17) +('cudaFreeHost|636|MOD:206/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 8) +('ggml_cuda_host_free|637|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 8) +('llama_new_context_with_model|647|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 6) +('llama_init_from_gpt_params(gpt_params&)|521|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 6) +('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 6) +('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 6) +('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 6) +('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 6) +('cudaMallocHost|778|MOD:206/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 4) +('ggml_cuda_host_malloc|779|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 4) +('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 4) +('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:7', 4) +('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 4) +('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 4) +('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 3) +('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 3) +('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:10', 3) +('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 3) +('0x7f70d54421b0|728|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 3) +('0x7f70d50aa9bd|729|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 3) +('llama_free|848|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 3) +('cublasCreate_v2|499|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 2) +('ggml_init_cublas|422|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 2) +('ggml_init|316|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 2) +('llama_backend_init|317|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 2) +('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 2) +('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:11', 2) +('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:12', 2) +('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:13', 2) +('llm_load_tensors(llama_model_loader&, llama_model&, int, int, float const*, bool, void (*)(float, votrunc|638|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 2) +('llama_load_model_from_file|520|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 2) +('llama_init_from_gpt_params(gpt_params&)|521|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 2) 
+('0x7f70d5442978|723|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 1) +('cublasCreate_v2|499|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 1) +('ggml_init_cublas|422|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 1) +('ggml_init|316|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 1) +('llama_backend_init|317|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 1) +('0x7f70b46e9dc8|724|MOD:215/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:3', 1) +('0x7f70b16d9e24|725|MOD:215/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 1) +('0x7f70b16da79b|726|MOD:215/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 1) +('cublasLtCtxInit|510|MOD:215/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 1) +('cublasCreate_v2|499|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:7', 1) +('ggml_init_cublas|422|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 1) +('ggml_init|316|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 1) +('llama_backend_init|317|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 1) +('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 1) +('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:12', 1) +('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:13', 1) +('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:14', 1) +('0x7f70d50aa20b|730|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1) +('0x7f70d50aa22e|731|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1) +('cublasCreate_v2|499|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1) +('ggml_init_cublas|422|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 1) +('ggml_init|316|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 1) +('llama_backend_init|317|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 1) +('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 1) +('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:10', 1) +('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:11', 1) +('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:12', 1) +('llama_free_model|805|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 1) +#+end_example + +* mistral eval + +This is a table of performance metrics for code that performs several operations on a GPU using NVIDIA CUDA. The operations are: + +* `cudaDeviceSynchronize`: This operation synchronizes the execution of all other threads on the GPU. It ensures that all threads have completed before moving on to the next operation. +* `cudaLaunchKernel`: This operation launches a kernel function (a small CUDA program) on the GPU. In this case, two different kernels are launched, likely with different parameters or data inputs. +* `cudaMemcpyAsync`: This operation copies memory from the CPU to the GPU or vice versa asynchronously. It does not block the execution of other threads on the GPU, allowing multiple operations to be performed concurrently. 
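+
+The sketch below is illustrative only: it is not llama.cpp code and is not derived
+from the profile above, and the kernel `scale` and the buffer names are invented for
+the example. It only shows how the calls counted in the table (`cudaMallocHost`,
+`cudaStreamCreateWithFlags`, `cudaMemcpyAsync`, the kernel launch and
+`cudaDeviceSynchronize`) typically fit together on one stream:
+
+#+begin_src cuda
+// illustrative sketch, not llama.cpp code: the call pattern behind the counters above
+#include <cuda_runtime.h>
+#include <cstdio>
+
+__global__ void scale(float * x, float a, int n) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) x[i] *= a;
+}
+
+int main() {
+    const int n = 1 << 20;
+    float * host = nullptr;
+    float * dev  = nullptr;
+    cudaMallocHost((void **) &host, n * sizeof(float)); // pinned host buffer
+    cudaMalloc((void **) &dev, n * sizeof(float));      // device buffer
+    for (int i = 0; i < n; ++i) host[i] = 1.0f;
+
+    cudaStream_t stream;
+    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+
+    // host -> device copy, asynchronous with respect to the host
+    cudaMemcpyAsync(dev, host, n * sizeof(float), cudaMemcpyHostToDevice, stream);
+    // kernel launch on the same stream (recorded as cudaLaunchKernel)
+    scale<<<(n + 255) / 256, 256, 0, stream>>>(dev, 2.0f, n);
+    // device -> host copy of the result
+    cudaMemcpyAsync(host, dev, n * sizeof(float), cudaMemcpyDeviceToHost, stream);
+    // block the calling host thread until all queued GPU work has finished
+    cudaDeviceSynchronize();
+
+    printf("host[0] = %f\n", host[0]);
+    cudaStreamDestroy(stream);
+    cudaFree(dev);
+    cudaFreeHost(host);
+    return 0;
+}
+#+end_src
+
+In this log the calls are issued from llama.cpp rather than a toy loop like this: the
+call chains further down show `cudaMemcpyAsync` and `cudaLaunchKernel` being reached
+mainly through `ggml_cuda_op_mul_mat` and `ggml_cuda_op_mul_mat_cublas`.
+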
+* `cudaStreamCreateWithFlags`: This operation creates a new CUDA stream, which is used to manage the execution of multiple operations on the GPU in parallel. In this case, a single stream is created with some flags set. + + +#+begin_src sh :results verbatim :exports both +python ./reporthd5_callchains.py ./report7.h5 +#+end_src + +#+RESULTS: +#+begin_example +./report7.h5 +./report7.h5 +('0x7fbb4530663f|697|MOD:296/opt/nvidia/nsight-systems/2023.2.3/target-linux-x64/libcupti.so.12.2|DEP:0', 15147) +('0x7fbb45308958|693|MOD:296/opt/nvidia/nsight-systems/2023.2.3/target-linux-x64/libcupti.so.12.2|DEP:1', 15147) +('0x7fbb48480966|698|MOD:231/usr/lib/x86_64-linux-gnu/libcuda.so.545.23.06|DEP:2', 15147) +('0x7fbb4d5057a8|3059|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:3', 4385) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 4036) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 4036) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 4036) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:7', 4032) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 4032) +('cudaMemcpyAsync|724|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 3747) +('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 2731) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 2731) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 2731) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 2731) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 2725) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:10', 2725) +('cudaLaunchKernel|744|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 2723) +('0x7fbb6e25d785|3070|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 2273) +('0x7fbb6deab1d7|3071|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 2273) +('0x7fbb6deac192|3072|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 2273) +('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:13', 2273) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:14', 2273) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:15', 2273) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:16', 2273) +('void dequantize_block<1, 1, &(convert_f16(void const*, int, int, __half2&)), float>(void const*, flotrunc|2841|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 2273) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:17', 2272) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:18', 2272) 
+('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:14', 2211) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:15', 2211) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:16', 2211) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:17', 2211) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:18', 2210) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:19', 2210) +('0x7fbb6deaa8b2|3073|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 2112) +('0x7fbb4c77794d|3084|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 2048) +('0x7fbb4c7db69a|3085|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 2048) +('0x7fbb4afd0fc9|3086|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 2048) +('0x7fbb4a4f5b71|3087|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:7', 2048) +('0x7fbb4a62697b|3088|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:8', 2048) +('cublasLtSSSMatmul|2823|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:9', 2048) +('0x7fbb6de4cb15|3089|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 2048) +('0x7fbb6de4ef46|3090|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 2048) +('0x7fbb6df65abd|3091|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 2048) +('cublasSgemm_v2|2827|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:13', 2048) +('0x7fbb4ad4b256|3092|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 2048) +('0x7fbb4afd1133|3093|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 2048) +('0x7fbb4a4f5b71|3087|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 2048) +('0x7fbb4a62697b|3088|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:7', 2048) +('cublasLtSSSMatmul|2823|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:8', 2048) +('0x7fbb6de4cb15|3089|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:9', 2048) +('0x7fbb6de4ef46|3090|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 2048) +('0x7fbb6df65abd|3091|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 2048) +('cublasSgemm_v2|2827|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 2048) +('0x7fbb6de4cb48|3094|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:7', 2048) +('0x7fbb6de4ef46|3090|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:8', 2048) +('0x7fbb6df65abd|3091|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:9', 2048) +('cublasSgemm_v2|2827|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 2048) +('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, 
floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 2048) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:12', 2048) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:13', 2048) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:14', 2048) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:15', 2048) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:16', 2048) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 1542) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 1542) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 1542) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 1539) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 1539) +('cudaMemcpy2DAsync|2915|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 1536) +('ggml_cuda_cpy_tensor_2d(void*, ggml_tensor const*, long, long, long, long, CUstream_st*)|2916|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 1536) +('cudaDeviceSynchronize|2772|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 289) +('void dequantize_block<1, 1, &(convert_f32(void const*, int, int, __half2&)), __half>(void const*, __trunc|3047|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 225) +('0x7fbb4acae2f1|3062|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 225) +('0x7fbb4acb0dda|3063|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:7', 225) +('0x7fbb4a4f3471|3064|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:8', 225) +('0x7fbb4a62af2f|3065|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:9', 225) +('cublasLtHHHMatmul|2759|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:10', 225) +('0x7fbb6de43905|3066|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 225) +('0x7fbb6de45d36|3055|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 225) +('0x7fbb6de2dfbe|3056|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:13', 225) +('0x7fbb6e0c5ef4|3057|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:14', 225) +('0x7fbb6e0c6380|3058|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:15', 225) +('cublasGemmEx|3039|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:16', 225) +('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:17', 225) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:18', 225) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:19', 225) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:20', 225) 
+('0x7fbb6de43938|3074|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:7', 225) +('0x7fbb6de45d36|3055|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:8', 225) +('0x7fbb6de2dfbe|3056|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:9', 225) +('0x7fbb6e0c5ef4|3057|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 225) +('0x7fbb6e0c6380|3058|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 225) +('cublasGemmEx|3039|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 225) +('void dequantize_block<32, 2, &(dequantize_q4_0(void const*, int, int, __half2&)), __half>(void consttrunc|745|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 224) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:21', 224) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:22', 224) +('0x7fbb6de45d36|3055|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:9', 163) +('0x7fbb6de2dfbe|3056|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 163) +('0x7fbb6e0c5ef4|3057|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 163) +('0x7fbb6e0c6380|3058|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 163) +('cublasGemmEx|3039|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:13', 163) +('0x7fbb4d503e43|3078|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:3', 161) +('0x7fbb4acb13e3|3079|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 161) +('0x7fbb4a4f3471|3064|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 161) +('0x7fbb4a62af2f|3065|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 161) +('cublasLtHHHMatmul|2759|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:7', 161) +('0x7fbb6de43905|3066|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:8', 161) +5('0x7fbb4d4468ad|3081|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 161) +('0x7fbb4d4468cd|3082|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 161) +('0x7fbb6deaa85f|3083|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 161) +('0x7fbb4d44430d|3060|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 64) +('0x7fbb4d44432d|3061|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 64) +('0x7fbb4ad41fd2|3067|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 64) +('0x7fbb4acb0e84|3068|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 64) +('0x7fbb4a4f3471|3064|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 64) +('0x7fbb4a62af2f|3065|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:7', 64) +('cublasLtHHHMatmul|2759|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:8', 64) +('0x7fbb6de43905|3066|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:9', 64) 
+('0x7fbb6de45d36|3055|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 64) +('0x7fbb6de2dfbe|3056|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 64) +('0x7fbb6e0c5ef4|3057|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 64) +('0x7fbb6e0c6380|3058|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:13', 64) +('cublasGemmEx|3039|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:14', 64) +('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:15', 64) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:16', 64) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:17', 64) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:18', 64) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:19', 63) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:20', 63) +('cudaMalloc|703|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 14) +('ggml_cuda_pool_malloc(unsigned long, unsigned long*)|2855|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 14) +('cudaFreeHost|613|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 8) +('ggml_cuda_host_free|614|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 8) +('llama_new_context_with_model|628|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 6) +('llama_init_from_gpt_params(gpt_params&)|523|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 6) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 6) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 6) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 6) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 6) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 6) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 6) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 6) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:12', 6) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:13', 6) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:14', 6) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:15', 6) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:16', 6) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 5) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:12', 5) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:13', 5) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:14', 5) +('cudaMallocHost|3009|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 4) +('ggml_cuda_host_malloc|3010|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 4) 
+('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 4) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 4) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 4) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 4) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 4) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:7', 4) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 4) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 4) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 3) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 3) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:10', 3) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 3) +('0x7fbb6e2421b0|704|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 3) +('0x7fbb6deaa9bd|705|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 3) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 3) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 3) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 3) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 3) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:12', 3) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:13', 3) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:14', 3) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:15', 3) +('llama_free|3928|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 3) +('cublasCreate_v2|442|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 2) +('ggml_init_cublas|443|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 2) +('ggml_init|291|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 2) +('llama_backend_init|292|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 2) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 2) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:11', 2) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:12', 2) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:13', 2) +('llm_load_tensors(llama_model_loader&, llama_model&, int, int, float const*, bool, void (*)(float, votrunc|615|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 2) +('llama_load_model_from_file|521|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 2) +('llama_init_from_gpt_params(gpt_params&)|523|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 2) +('0x7fbb6e23e8db|3049|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 2) +('0x7fbb6deaae8b|3050|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 2) 
+('0x7fbb6deac55b|3051|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 2) +('0x7fbb6de43264|3053|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:7', 2) +('0x7fbb6de43c6c|3054|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:8', 2) +('0x7fbb6e242978|699|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 1) +('cublasCreate_v2|442|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 1) +('ggml_init_cublas|443|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 1) +('ggml_init|291|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 1) +('llama_backend_init|292|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 1) +('0x7fbb4d4e9dc8|700|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:3', 1) +('0x7fbb4a4d9e24|701|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 1) +('0x7fbb4a4da79b|702|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 1) +('cublasLtCtxInit|456|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 1) +('cublasCreate_v2|442|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:7', 1) +('ggml_init_cublas|443|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 1) +('ggml_init|291|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 1) +('llama_backend_init|292|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 1) +('0x7fbb6deaa20b|706|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1) +('0x7fbb6deaa22e|707|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1) +('cublasCreate_v2|442|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1) +('ggml_init_cublas|443|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 1) +('ggml_init|291|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 1) +('llama_backend_init|292|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 1) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 1) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:10', 1) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:11', 1) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:12', 1) +('0x7fbb6deaa5dc|3052|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 1) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:18', 1) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:19', 1) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:20', 1) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:21', 1) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:22', 1) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:23', 1) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:24', 1) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:25', 1) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:21', 1) +('ggml_graph_compute_helper(std::vector >&, 
ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:22', 1) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:23', 1) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:24', 1) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:25', 1) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:26', 1) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:27', 1) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:28', 1) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:19', 1) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:20', 1) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:21', 1) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:22', 1) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:23', 1) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:24', 1) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:25', 1) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:26', 1) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:17', 1) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:18', 1) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:19', 1) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:20', 1) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:21', 1) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:22', 1) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:23', 1) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:24', 1) +('0x7fbb6deaa582|3076|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 1) +('void dequantize_block_q6_K<__half>(void const*, __half*)|3698|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 1) +('llama_free_model|3899|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 1) +#+end_example + + +nm /mnt/data1/2023/11/09/llama.cpp/build/bin/main >main.nm + + +grep libcuda report7.gron -C10 > cudareport.txt +grep -C1000 libcuda report7.jq > cuda.txt + + + +(gpt_params &) @0x7fffffffc960: {seed = 1700596789, n_threads = 12, + n_threads_batch = -1, n_predict = -1, n_ctx = 512, n_batch = 512, n_keep = 0, + n_draft = 16, n_chunks = -1, n_parallel = 1, n_sequences = 1, p_accept = 0.5, + p_split = 0.100000001, n_gpu_layers = -1, n_gpu_layers_draft = -1, main_gpu = 0, + tensor_split = {0 }, n_beams = 0, rope_freq_base = 0, +--Type for more, q to quit, c to continue without paging-- + rope_freq_scale = 0, yarn_ext_factor = -1, yarn_attn_factor = 1, + yarn_beta_fast = 32, yarn_beta_slow = 1, yarn_orig_ctx = 0, + rope_scaling_type = -1 '\377', sparams = {n_prev = 64, n_probs = 0, top_k = 40, + top_p = 0.949999988, min_p = 0.0500000007, tfs_z = 1, typical_p = 1, + temp = 0.800000012, penalty_last_n = 64, penalty_repeat = 1.10000002, +--Type for more, q to quit, c to continue without paging-- + penalty_freq = 0, penalty_present 
= 0, mirostat = 0, mirostat_tau = 5, + mirostat_eta = 0.100000001, penalize_nl = true, grammar = "", + cfg_negative_prompt = "", cfg_scale = 1, + logit_bias = std::unordered_map with 0 elements}, + model = "/home/mdupont/.ollama/models/mistral", model_draft = "", +--Type for more, q to quit, c to continue without paging-- + model_alias = "unknown", prompt = "", prompt_file = "", path_prompt_cache = "", + input_prefix = "", input_suffix = "", + antiprompt = std::vector of length 0, capacity 0, logdir = "", + lora_adapter = std::vector of length 0, capacity 0, lora_base = "", ppl_stride = 0, + ppl_output_type = 0, hellaswag = false, hellaswag_tasks = 400, mul_mat_q = true, +--Type for more, q to quit, c to continue without paging-- + memory_f16 = true, random_prompt = false, use_color = false, interactive = false, + chatml = false, prompt_cache_all = false, prompt_cache_ro = false, + embedding = false, escape = false, interactive_first = false, + multiline_input = false, simple_io = false, cont_batching = false, + input_prefix_bos = false, ignore_eos = false, instruct = false, logits_all = false, +--Type for more, q to quit, c to continue without paging-- + use_mmap = true, use_mlock = false, numa = false, verbose_prompt = false, + infill = false, mmproj = "", image = ""} +(gdb) + + llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + + at /home/mdupont/experiments/llama.cpp/ggml.cpp:18561 +18561 ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + + +p *ctx +$14 = {header = {magic = "GGUF", version = 2, n_tensors = 291, n_kv = 20}, + kv = 0x555556ffc2f0, infos = 0x55555716d5f0, alignment = 0, offset = 0, size = 0, + data = 0x0} +(gdb) + +This key we can treat differently and we can imagine a template class attachd to the dynamic model or +even a customized class for it. +$5 = {key = {n = 21, data = 0x555555e1cb50 "tokenizer.ggml.tokens"}, + type = GGUF_TYPE_ARRAY, value = {uint8 = 0 '\000', int8 = 0 '\000', uint16 = 0, + int16 = 0, uint32 = 0, int32 = 0, float32 = 0, uint64 = 0, int64 = 0, float64 = 0, + bool_ = false, str = {n = 0, data = 0x0}, arr = {type = GGUF_TYPE_UINT8, n = 0, + data = 0x0}}} + + diff --git a/binding.py b/binding.py new file mode 100644 index 0000000000000..668afd566e22c --- /dev/null +++ b/binding.py @@ -0,0 +1,334 @@ +import os +import json +import re +import clang.cindex + +# configurable part + +CLANG_VERSION='13.0.1' +# homebrew installs for llvm (brew info llvm gives details): +# x64: /usr/local/opt/llvm/lib +# arm64: /opt/homebrew/opt/llvm/lib +llvmLibPath = "/usr/lib/llvm-15/lib/" + +cxxClientRoot = "/home/mdupont/experiments/llama.cpp/" + +fileList = [ + "ggml.cpp", + "llama.cpp" +] + +typeList = [ +] + +# end of configurable part + +clang.cindex.Config.set_library_path(llvmLibPath) + + +def list_headers_in_dir(path): + # enumerates a folder but keeps the full pathing for the files returned + # and removes certain files we don't want (like non-hxx, _json.hxx or _fmt.hxx) + + # list all the files in the folder + files = os.listdir(path) + # only include .hxx files + files = list(filter(lambda x: x.endswith('.hxx'), files)) + # add the folder path back on + files = list(map(lambda x: path + x, files)) + return files + + +# parse through the list of files specified and expand wildcards +fullFileList = [] +for filePath in fileList: + if "*" in filePath: + # wildcard path + basePath = filePath[:-1] + if "*" in basePath: + # if there is still a wildcard, we have an issue... 
+ raise NotImplementedError( + "wildcard only supported at end of file path") + files = list_headers_in_dir(os.path.join(cxxClientRoot, basePath)) + fullFileList = fullFileList + files + else: + # normal path + ff = os.path.join(cxxClientRoot, filePath) + fullFileList.append(ff) + print("DBUG",ff) +# exclude _json.hxx files +fullFileList = list( + filter(lambda x: not x.endswith('_json.hxx'), fullFileList)) +# exclude _fmt.hxx files +fullFileList = list( + filter(lambda x: not x.endswith('_fmt.hxx'), fullFileList)) + + +# generate a list of regexps from the type list (for handling wildcards) +typeListRe = list(map(lambda x: x.replace("*", "(.*)") + "(.*)", typeList)) + + +def is_included_type(name, with_durability=False): + + # TODO(brett19): This should be generalized somehow... + if "is_compound_operation" in name: + return False + + if "replica_context" in name: + return False + + if with_durability is True and '_with_legacy_durability' not in name: + return False + + for x in typeListRe: + if re.fullmatch(x, name): + return True + return False + + +opTypes = [] +opEnums = [] + + +def parse_type(type): + typeStr = type.get_canonical().spelling + return parse_type_str(typeStr) + +std_comparators = ["std::less<>", "std::greater<>", "std::less_equal<>", "std::greater_equal<>"] + +def parse_type_str(typeStr): + if typeStr == "std::mutex": + return {"name": "std::mutex"} + if typeStr == "std::string": + return {"name": "std::string"} + if typeStr == "std::chrono::duration": + return {"name": "std::chrono::seconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::milliseconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::microseconds"} + if typeStr == "std::chrono::duration>": + return {"name": "std::chrono::nanoseconds"} + if typeStr == "std::error_code": + return {"name": "std::error_code"} + if typeStr == "std::monostate": + return {"name": "std::monostate"} + if typeStr == "std::byte": + return {"name": "std::byte"} + if typeStr == "unsigned long": + return {"name": "std::size_t"} + if typeStr == "char": + return {"name": "std::int8_t"} + if typeStr == "unsigned char": + return {"name": "std::uint8_t"} + if typeStr == "short": + return {"name": "std::int16_t"} + if typeStr == "unsigned short": + return {"name": "std::uint16_t"} + if typeStr == "int": + return {"name": "std::int32_t"} + if typeStr == "unsigned int": + return {"name": "std::uint32_t"} + if typeStr == "long long": + return {"name": "std::int64_t"} + if typeStr == "unsigned long long": + return {"name": "std::uint64_t"} + if typeStr == "bool": + return {"name": "std::bool"} + if typeStr == "float": + return {"name": "std::float"} + if typeStr == "double": + return {"name": "std::double"} + if typeStr == "std::nullptr_t": + return {"name": "std::nullptr_t"} + if typeStr in std_comparators: + return {"name": typeStr} + + tplParts = typeStr.split("<", 1) + if len(tplParts) > 1: + tplClassName = tplParts[0] + tplParams = tplParts[1][:-1] + if tplClassName == "std::function": + return { + "name": "std::function" + } + if tplClassName == "std::optional": + return { + "name": "std::optional", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::vector": + return { + "name": "std::vector", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::set": + return { + "name": "std::set", + "of": parse_type_str(tplParams) + } + if tplClassName == "std::variant": + variantParts = tplParams.split(", ") + variantTypes = [] + for variantPart in variantParts: + 
variantTypes.append(parse_type_str(variantPart)) + return { + "name": "std::variant", + "of": variantTypes + } + if tplClassName == "std::array": + variantParts = tplParams.split(", ") + if len(variantParts) != 2: + print("FAILED TO PARSE ARRAY TYPES: " + typeStr) + return {"name": "unknown", "str": typeStr} + return { + "name": "std::array", + "of": parse_type_str(variantParts[0]), + "size": int(variantParts[1]) + } + if tplClassName == "std::map": + variantParts = tplParams.split(", ") + if len(variantParts) < 2 or len(variantParts) > 3: + print("FAILED TO PARSE MAP TYPES: " + typeStr) + return {"name": "unknown", "str": typeStr} + + if len(variantParts) == 2: + return { + "name": "std::map", + "of": parse_type_str(variantParts[0]), + "to": parse_type_str(variantParts[1]) + } + else: + return { + "name": "std::map", + "of": parse_type_str(variantParts[0]), + "to": parse_type_str(variantParts[1]), + "comparator": parse_type_str(variantParts[2]) + } + + if tplClassName == "std::shared_ptr": + return { + "name": "std::shared_ptr", + "of": parse_type_str(tplParams) + } + + #return {"name": "unknown", "str": typeStr} + + if 'unnamed struct' in typeStr: + print("WARNING: Found unnamed struct: " + typeStr) + + return {"name": typeStr} + +internal_structs = [] +UNNAMED_STRUCT_DELIM = '::(unnamed struct' + +def traverse(node, namespace, main_file): + # only scan the elements of the file we parsed + #print("FILE", node.location.file ) + + if node.kind == clang.cindex.CursorKind.STRUCT_DECL or node.kind == clang.cindex.CursorKind.CLASS_DECL: + fullStructName = "::".join([*namespace, node.displayname]) + print("REFL_TYPE(" + fullStructName + ")") + + structFields = [] + for child in node.get_children(): + if child.kind == clang.cindex.CursorKind.FIELD_DECL: + struct_type = parse_type(child.type) + type_str = child.type.get_canonical().spelling + print(" REFL_FIELD(" + child.displayname + ")") + if 'unnamed' in type_str: + name_tokens = type_str.split('::') + name_override = '::'.join(name_tokens[:-1] + [child.displayname]) + struct_type['name'] = name_override + internal_structs.append(name_override) + + structFields.append({ + "name": child.displayname, + "type": struct_type, + }) + # replica read changes introduced duplicate get requests + if any(map(lambda op: op['name'] == fullStructName, opTypes)): + return + + opTypes.append({ + "name": fullStructName, + "fields": structFields, + }) + print("REFL_END") + + if node.kind == clang.cindex.CursorKind.TYPE_ALIAS_DECL: + fullStructName = "::".join([*namespace, node.displayname]) + if is_included_type(fullStructName, with_durability=True): + type_ref = next((c for c in node.get_children() if c.kind == clang.cindex.CursorKind.TYPE_REF), None) + if type_ref: + base_request_name = type_ref.displayname.replace('struct', '').strip() + base_request = next((op for op in opTypes if op['name'] == base_request_name), None) + if base_request: + new_fields = [f for f in base_request['fields'] if f['name'] != 'durability_level'] + new_fields.extend([ + {"name":"persist_to", "type":{"name":"couchbase::persist_to"}}, + {"name":"replicate_to", "type":{"name":"couchbase::replicate_to"}} + ]) + + opTypes.append({ + "name": fullStructName, + "fields": new_fields + }) + if node.kind == clang.cindex.CursorKind.ENUM_DECL: + fullEnumName = "::".join([*namespace, node.displayname]) + if is_included_type(fullEnumName): + enumValues = [] + + for child in node.get_children(): + if child.kind == clang.cindex.CursorKind.ENUM_CONSTANT_DECL: + enumValues.append({ + "name": 
child.displayname, + "value": child.enum_value, + }) + opEnums.append({ + "name": fullEnumName, + "type": parse_type(node.enum_type), + "values": enumValues, + }) + + if node.kind == clang.cindex.CursorKind.NAMESPACE: + namespace = [*namespace, node.displayname] + if node.kind == clang.cindex.CursorKind.CLASS_DECL: + namespace = [*namespace, node.displayname] + if node.kind == clang.cindex.CursorKind.STRUCT_DECL: + namespace = [*namespace, node.displayname] + + for child in node.get_children(): + traverse(child, namespace, main_file) + +for headerPath in fullFileList: + print("processing " + headerPath) + index = clang.cindex.Index.create() + args = [ + '-std=c++17', + ] + + try: + translation_unit = index.parse(headerPath, args=args) + except Exception as e: + print(e) + import pdb + pdb.set_trace() + raise e + + # output clang compiler diagnostics information (for debugging) + + for diagnostic in translation_unit.diagnostics: + diagnosticMsg = diagnostic.format() + print(diagnostic) + + traverse(translation_unit.cursor, [], headerPath) + +jsonData = json.dumps({ + 'op_structs': opTypes, + 'op_enums': opEnums +}) + +f = open("bindings.json", "w") +f.write(jsonData) +f.close() diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index ff51cc8034c8b..503ed4212f313 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -144,7 +144,7 @@ namespace grammar_parser { while (*pos != '"') { auto char_pair = parse_char(pos); pos = char_pair.second; - out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); + out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_CHAR, char_pair.first)); } pos = parse_space(pos + 1, is_nested); } else if (*pos == '[') { // char range(s) @@ -162,11 +162,11 @@ namespace grammar_parser { ? 
LLAMA_GRETYPE_CHAR_ALT : start_type; - out_elements.push_back({type, char_pair.first}); + out_elements.push_back(llama_grammar_element(type, char_pair.first)); if (pos[0] == '-' && pos[1] != ']') { auto endchar_pair = parse_char(pos + 1); pos = endchar_pair.second; - out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); + out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first)); } } pos = parse_space(pos + 1, is_nested); @@ -175,7 +175,7 @@ namespace grammar_parser { uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos); pos = parse_space(name_end, is_nested); last_sym_start = out_elements.size(); - out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); + out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_RULE_REF, ref_rule_id)); } else if (*pos == '(') { // grouping // parse nested alternates into synthesized rule pos = parse_space(pos + 1, true); @@ -183,7 +183,7 @@ namespace grammar_parser { pos = parse_alternates(state, pos, rule_name, sub_rule_id, true); last_sym_start = out_elements.size(); // output reference to synthesized rule - out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); + out_elements.push_back(llama_grammar_element(LLAMA_GRETYPE_RULE_REF, sub_rule_id)); if (*pos != ')') { throw std::runtime_error(std::string("expecting ')' at ") + pos); } @@ -219,7 +219,8 @@ namespace grammar_parser { // in original rule, replace previous symbol with reference to generated rule out_elements.resize(last_sym_start); - out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); + llama_grammar_element a(LLAMA_GRETYPE_RULE_REF, sub_rule_id); + out_elements.push_back(a); pos = parse_space(pos + 1, is_nested); } else { diff --git a/common/sampling.cpp b/common/sampling.cpp index 1317024c2c11c..d83cfc57cd6a9 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -138,7 +138,7 @@ llama_token llama_sampling_sample( cur.clear(); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + cur.emplace_back(llama_token_data(token_id, logits[token_id], 0.0f)); } llama_token_data_array cur_p = { cur.data(), cur.size(), false }; diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 8155101d0ab93..b2679a9d998e4 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1527,11 +1527,14 @@ int main(int argc, char ** argv) { std::vector work_buffer; for (int ex=0; exfn_model_base.c_str(), params_gguf); // create new gguf diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index af46e44a6e216..3b5fefda8aa52 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -294,10 +294,12 @@ static void init_model(struct llama_model * input, struct my_llama_model * model // get parameters directly from gguf file { - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ NULL, - }; + struct gguf_init_params params( + //.no_alloc = + false, + //.ctx = + NULL + ); struct gguf_context * mctx = gguf_init_from_file(fn_model, params); load_model_hparams_gguf(mctx, &hparams, "llama"); @@ -598,7 +600,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const bool enable_flash_attn, const bool enable_checkpointing) { - ggml_set_scratch(ctx, { 0, 0, nullptr, }); + //FIXME + assert(0); + //ggml_set_scratch(ctx, { 0, 0, nullptr, }); const int n_past = 0; const int N = n_tokens; const auto & 
hparams = model->hparams; @@ -989,9 +993,11 @@ static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llam static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { struct ggml_context * f_ggml_ctx; - struct gguf_init_params params; - params.no_alloc = false; - params.ctx = &f_ggml_ctx; + struct gguf_init_params params( + //params.no_alloc = + false, + //params.ctx = + &f_ggml_ctx); struct gguf_context * fctx = gguf_init_from_file(filename, params); if (fctx == NULL) { return false; @@ -1706,11 +1712,14 @@ int main(int argc, char ** argv) { std::vector mem_compute_data; // context for input tensors without their data - struct ggml_init_params ctx_input_params = { - ggml_tensor_overhead() * 2, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; + struct ggml_init_params ctx_input_params( + //.mem_size= + ggml_tensor_overhead() * 2, // mem_size + //.mem_buffer= + NULL, // mem_buffer + //.no_alloc= + true // no_alloc + ); struct ggml_context * ctx_input = ggml_init(ctx_input_params); // the input tensors @@ -1735,11 +1744,14 @@ int main(int argc, char ** argv) { 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) ); - struct ggml_init_params ctx_compute_params = { - estimated_compute_size_wo_data, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; + struct ggml_init_params ctx_compute_params( + //.mem_size= + estimated_compute_size_wo_data, // mem_size + //.mem_buffer= + NULL, // mem_buffer + //.no_alloc= + true // no_alloc + ); struct ggml_context * ctx_compute = NULL; struct ggml_tensor * loss = NULL; @@ -1902,11 +1914,14 @@ int main(int argc, char ** argv) { printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); // context for work buffer - struct ggml_init_params ctx_work_params = { - max_work_size, // mem_size - NULL, // mem_buffer - false, // no_alloc - }; + struct ggml_init_params ctx_work_params( + //.mem_size= + max_work_size, // mem_size + //.mem_buffer = + NULL, // mem_buffer + //.no_alloc = + false // no_alloc + ); struct ggml_context * ctx_work = ggml_init(ctx_work_params); int64_t t0 = ggml_time_ms(); diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 9ab63a29310ad..cfc077bb91163 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -40,11 +40,14 @@ static bool gguf_ex_write(const std::string & fname) { gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector{ 3.145f, 2.718f, 1.414f, }.data(), 3); gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector{ "hello", "world", "!" 
}.data(), 3); - struct ggml_init_params params = { - /*.mem_size =*/ 128ull*1024ull*1024ull, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; + struct ggml_init_params params( + //.mem_size = + 128ull*1024ull*1024ull, + //.mem_buffer = + NULL, + //.no_alloc = + false + ); struct ggml_context * ctx_data = ggml_init(params); @@ -86,10 +89,12 @@ static bool gguf_ex_write(const std::string & fname) { // just read tensor info static bool gguf_ex_read_0(const std::string & fname) { - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ NULL, - }; + struct gguf_init_params params ( + //.no_alloc = + false, + //.ctx = + NULL + ); struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); @@ -146,10 +151,12 @@ static bool gguf_ex_read_0(const std::string & fname) { static bool gguf_ex_read_1(const std::string & fname) { struct ggml_context * ctx_data = NULL; - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx_data, - }; + struct gguf_init_params params ( + //.no_alloc = + false, + //.ctx = + &ctx_data + ); struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index fc0656c231a0c..684724ef9c76d 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -255,11 +255,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima const auto & buf_compute = ctx->buf_compute; - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ false, - }; + struct ggml_init_params params( + //.mem_size = + buf_compute.size, + //.mem_buffer = + buf_compute.data, + //.no_alloc = + false + ); params.no_alloc = true; @@ -455,10 +458,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { struct ggml_context * meta = NULL; - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &meta, - }; + struct gguf_init_params params( + //.no_alloc = + true, + //.ctx = + &meta); + struct gguf_context * ctx = gguf_init_from_file(fname, params); if (!ctx) { @@ -552,11 +557,14 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // load tensors { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; + struct ggml_init_params params( + //.mem_size = + ctx_size, + //.mem_buffer = + NULL, + //.no_alloc = + false + ); new_clip->ctx = ggml_init(params); if (!new_clip->ctx) { diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 0cae8c4b10a3a..9b3bbfd3c7049 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -75,7 +75,18 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ if (n_eval > n_batch) { n_eval = n_batch; } - llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; + llama_batch batch( + /* .n_tokens= */int32_t(n_eval), + /* .token= */nullptr, + /* .embd= */(image_embed->embed+i*n_embd), + /* .pos= */nullptr, + /* .n_seq_id= */nullptr, + /* .seq_id= */nullptr, + /* .logits= */nullptr, + /* .all_pos_0= */*n_past, + /* .all_pos_1= */1, + /* .all_seq_id= */0 + ); if (llama_decode(ctx_llama, batch)) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 31ec8cade19be..aaf4937e4762c 100644 --- a/examples/main/main.cpp 
+++ b/examples/main/main.cpp @@ -31,6 +31,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#include "print.hpp" + static llama_context ** g_ctx; static llama_model ** g_model; static gpt_params * g_params; @@ -99,11 +101,15 @@ static void sigint_handler(int signo) { } } #endif +using namespace refl; int main(int argc, char ** argv) { gpt_params params; g_params = ¶ms; + //using Td = type_descriptor; + + if (!gpt_params_parse(argc, argv, params)) { return 1; } @@ -117,7 +123,8 @@ int main(int argc, char ** argv) { // TODO: Dump params ? //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); - + print_fields(params); + // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); @@ -174,7 +181,7 @@ int main(int argc, char ** argv) { llama_context * ctx_guidance = NULL; g_model = &model; g_ctx = &ctx; - + // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); @@ -234,6 +241,8 @@ int main(int argc, char ** argv) { std::vector embd_inp; + print_fields(*model); + if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); if (params.chatml) { @@ -277,7 +286,8 @@ int main(int argc, char ** argv) { LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } - + print_fields(*ctx); + //print_fields(session_tokens); // debug message about similarity of saved session, if applicable size_t n_matching_session_tokens = 0; if (!session_tokens.empty()) { @@ -365,6 +375,10 @@ int main(int argc, char ** argv) { for (int i = 0; i < (int) guidance_inp.size(); i++) { LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } + + print_fields(*ctx_guidance); + + } if (params.n_keep > 0) { @@ -473,7 +487,8 @@ int main(int argc, char ** argv) { std::vector embd_guidance; struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - + print_fields(*ctx_sampling); + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (!embd.empty()) { @@ -482,6 +497,7 @@ int main(int argc, char ** argv) { int max_embd_size = n_ctx - 4; // Ensure the input doesn't exceed the context size by truncating embd if necessary. 
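The print_fields() calls introduced above come from the newly added print.hpp, which is not shown in this patch; the sketch below is only a guess at the kind of reflection helper it could provide, based on the refl-cpp library implied by `using namespace refl;`. The helper name, the demo struct and its REFL_AUTO registration are illustrative assumptions, not code from the patch.

#include <iostream>
#include "refl.hpp"   // refl-cpp single header (assumed to be vendored alongside print.hpp)

struct demo_params { int n_ctx; float temp; };   // stand-in for a real params struct
REFL_AUTO(type(demo_params), field(&demo_params::n_ctx), field(&demo_params::temp))

// dump every member registered for T: member.name is the field name, member(obj) its value
template <typename T>
void print_fields(const T & obj) {
    refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
        std::cout << member.name << " = " << member(obj) << "\n";
    });
}

// usage: print_fields(demo_params{512, 0.8f});   // prints n_ctx = 512, temp = 0.8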
+            //print_fields(embd);
             if ((int) embd.size() > max_embd_size) {
                 const int skipped_tokens = (int) embd.size() - max_embd_size;
                 embd.resize(max_embd_size);
@@ -508,6 +524,7 @@ int main(int argc, char ** argv) {
             LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard);
+            print_fields(*ctx);
             llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
             llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
@@ -624,7 +641,7 @@ int main(int argc, char ** argv) {
             }
             const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
-
+            print_fields(id);
             llama_sampling_accept(ctx_sampling, ctx, id, true);
             LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@@ -659,7 +676,7 @@ int main(int argc, char ** argv) {
         if (input_echo) {
             for (auto id : embd) {
                 const std::string token_str = llama_token_to_piece(ctx, id);
-                printf("%s", token_str.c_str());
+                printf("TOKEN:%s\n", token_str.c_str());
                 if (embd.size() > 1) {
                     input_tokens.push_back(id);
@@ -850,6 +867,9 @@ int main(int argc, char ** argv) {
     llama_print_timings(ctx);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+    // don't dump core
+    //int *ptr = 0; *ptr = 1;
+
     if (ctx_guidance) { llama_free(ctx_guidance); }
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 48d80111010df..4c2336f3b595d 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -67,9 +67,12 @@ int main(int argc, char ** argv) {
     std::vector candidates;
     candidates.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        candidates.emplace_back(llama_token_data(
+            token_id,
+            logits[token_id],
+            0.0f));
     }
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array candidates_p(candidates.data(), candidates.size(), false );
     auto next_token = llama_sample_token(ctx, &candidates_p);
     auto next_token_str = llama_token_to_piece(ctx, next_token);
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1f2c55f2dccdf..1bf37cf1a0e19 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -31,8 +31,16 @@ using json = nlohmann::json;
-struct server_params
+struct server_params : refl::attr::usage::type
 {
+
+    server_params():
+        hostname( "127.0.0.1"),
+        write_timeout( 600) {}  // remaining members keep their in-class default initializers
+
     std::string hostname = "127.0.0.1";
     std::string public_path = "examples/server/public";
     int32_t port = 8080;
@@ -522,6 +530,28 @@ struct llama_server_context
     std::vector queue_results;
     std::mutex mutex_tasks;
     std::mutex mutex_results;
+    llama_server_context():
+        model(nullptr),
+        ctx(nullptr),
+        clp_ctx(nullptr),
+        params(),  // value-initialize; a member cannot be initialized from itself
+        batch(),   // value-initialize the llama_batch
+        multimodal(false),
+        clean_kv_cache( true),
+        all_slots_are_idle( false),
+        add_bos_token( true),
+        //int32_t id_gen;
+        //int32_t n_ctx; // total context for all clients / slots
+        system_need_update(false){}
+    //std::string system_prompt;
+    //std::vector system_tokens;
+    //std::string name_user; // this should be the antiprompt
+    //std::string name_assistant;
+    //std::vector slots;
+    //std::vector queue_tasks;
+    //std::vector queue_results;
+    //std::mutex mutex_tasks;
+    //std::mutex mutex_results;
     ~llama_server_context()
     {
@@ -1303,7 +1333,7 @@ struct llama_server_context
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-            llama_batch batch_view = {
+            llama_batch batch_view(
                 n_tokens,
                 batch.token + i,
                 nullptr,
@@ -1311,8 +1341,8 @@ struct llama_server_context
                 batch.n_seq_id + i,
                 batch.seq_id + i,
                 batch.logits + i,
-                0, 0, 0, // unused
-            };
+                0, 0, 0 // unused
+            );
             if (llama_decode(ctx, batch_view)) {
                 LOG_TEE("%s : failed to eval\n", __func__);
@@ -1665,17 +1695,18 @@ struct llama_server_context
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-            llama_batch batch_view =
-            {
-                n_tokens,
-                batch.token + i,
-                nullptr,
-                batch.pos + i,
-                batch.n_seq_id + i,
-                batch.seq_id + i,
-                batch.logits + i,
-                0, 0, 0, // unused
-            };
+            llama_batch batch_view(
+                /* .n_tokens=   */n_tokens,
+                /* .token=      */batch.token + i,
+                /* .embd=       */nullptr,
+                /* .pos=        */batch.pos + i,
+                /* .n_seq_id=   */batch.n_seq_id + i,
+                /* .seq_id=     */batch.seq_id + i,
+                /* .logits=     */batch.logits + i,
+                /* .all_pos_0=  */0,
+                /* .all_pos_1=  */0,
+                /* .all_seq_id= */0 // unused
+            );
             const int ret = llama_decode(ctx, batch_view);
             if (ret != 0)
@@ -1722,7 +1753,10 @@ struct llama_server_context
                 slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
             }
-            llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
+            llama_token_data_array cur_p(
+                slot.ctx_sampling->cur.data(),
+                slot.ctx_sampling->cur.size(),
+                false );
             result.tok = id;
             const int32_t n_probs = slot.sparams.n_probs;
@@ -2591,4 +2625,4 @@ int main(int argc, char **argv)
     llama_backend_free();
     return 0;
-}
+}
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 374aef6f16189..92a5442604dd7 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -124,10 +124,15 @@ int main(int argc, char ** argv) {
         candidates.reserve(n_vocab);
         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+            candidates.emplace_back(llama_token_data( token_id,
+                                                      logits[token_id],
+                                                      0.0f ));
        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        llama_token_data_array candidates_p(
+            candidates.data(),
+            candidates.size(),
+            false );
         // sample the most likely token
         const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index f049a3923669b..04e7546f7d3e2 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -311,7 +311,8 @@ static struct ggml_tensor * llama_build_train_graphs(
         const bool enable_flash_attn,
         const bool enable_checkpointing) {
-    ggml_set_scratch(ctx, { 0, 0, nullptr, });
+    assert(0);
+    //ggml_set_scratch(ctx, { 0, 0, nullptr, });
     const int n_past = 0;
     const int N = n_tokens;
     const auto & hparams = model->hparams;
@@ -599,10 +600,12 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
     // set vocab by copying from vocab_model gguf file
     {
-        struct gguf_init_params params = {
-            /*.no_alloc = */ false,
-            /*.ctx = */ NULL,
-        };
+
struct gguf_init_params params( + //.no_alloc = + false, + //.ctx = + NULL + ); struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params); const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST)); @@ -744,9 +747,11 @@ static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_voc static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) { struct ggml_context * f_ggml_ctx; - struct gguf_init_params params; - params.no_alloc = false; - params.ctx = &f_ggml_ctx; + struct gguf_init_params params( + //params.no_alloc = + false, + //params.ctx = + &f_ggml_ctx); struct gguf_context * fctx = gguf_init_from_file(filename, params); if (fctx == NULL) { return false; @@ -1084,11 +1089,14 @@ int main(int argc, char ** argv) { ggml_allocr * alloc = NULL; // context for input tensors without their data - struct ggml_init_params ctx_input_params = { - ggml_tensor_overhead() * 2, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; + struct ggml_init_params ctx_input_params ( + //.mem_size = + ggml_tensor_overhead() * 2, // mem_size + // .mem_buffer = + NULL, // mem_buffer + // .no_alloc = + true // no_alloc + ); struct ggml_context * ctx_input = ggml_init(ctx_input_params); // the input tensors @@ -1113,11 +1121,14 @@ int main(int argc, char ** argv) { 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) ); - struct ggml_init_params ctx_compute_params = { - estimated_compute_size_wo_data, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; + struct ggml_init_params ctx_compute_params( + // .mem_size = + estimated_compute_size_wo_data, // mem_size + //.mem_buffer= + NULL, // mem_buffer + //.no_alloc = + true // no_alloc + ); struct ggml_context * ctx_compute = NULL; struct ggml_tensor * loss = NULL; @@ -1266,11 +1277,14 @@ int main(int argc, char ** argv) { printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); // context for work buffer - struct ggml_init_params ctx_work_params = { - max_work_size, // mem_size - NULL, // mem_buffer - false, // no_alloc - }; + struct ggml_init_params ctx_work_params( + //.mem_size= + max_work_size, // + //.mem_buffer= + NULL, // + //.no_alloc= + false // + ); struct ggml_context * ctx_work = ggml_init(ctx_work_params); int64_t t0 = ggml_time_ms(); diff --git a/ggml-alloc.c b/ggml-alloc.cpp similarity index 95% rename from ggml-alloc.c rename to ggml-alloc.cpp index cdfe4caf69613..6dc45efb571c4 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.cpp @@ -8,9 +8,9 @@ #include #include #include - +#include "ggml-internal.hpp" #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) -#define MAX_FREE_BLOCKS 256 + //#define GGML_ALLOCATOR_DEBUG @@ -24,28 +24,7 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen return offset + align; } -struct free_block { - void * addr; - size_t size; -}; - -struct ggml_tallocr { - struct ggml_backend_buffer * buffer; - bool buffer_owned; - void * base; - size_t alignment; - - int n_free_blocks; - struct free_block free_blocks[MAX_FREE_BLOCKS]; - - size_t max_size; - bool measure; - -#ifdef GGML_ALLOCATOR_DEBUG - struct ggml_tensor * allocated_tensors[1024]; -#endif -}; #ifdef GGML_ALLOCATOR_DEBUG static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { @@ -333,33 +312,20 @@ size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) { // graph allocator -struct hash_node { - int n_children; - int n_views; -}; - -struct ggml_gallocr { - ggml_tallocr_t talloc; - struct ggml_hash_set hash_set; - struct hash_node * hash_values; - size_t hash_values_size; - ggml_tallocr_t * hash_allocs; - int * parse_seq; - int parse_seq_len; -}; - ggml_gallocr_t ggml_gallocr_new(void) { ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr)); + ggml_hash_set hs = {.size=0, .keys=NULL}; *galloc = (struct ggml_gallocr) { - /*.talloc = */ NULL, - /*.hash_set = */ {0}, - /*.hash_values = */ NULL, - /*.hash_values_size = */ 0, - /*.hash_allocs = */ NULL, - /*.parse_seq = */ NULL, - /*.parse_seq_len = */ 0, + .talloc = NULL, + .hash_set =hs, + .hash_values = NULL, + .hash_values_size = 0, + .hash_allocs = NULL, + .parse_seq = NULL, + .parse_seq_len = 0, }; + //((*galloc).hash_set)[0] = 0; return galloc; } @@ -698,16 +664,12 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap // legacy API wrapper -struct ggml_allocr { - ggml_tallocr_t talloc; - ggml_gallocr_t galloc; -}; static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) { ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr)); *alloc = (struct ggml_allocr) { - /*.talloc = */ talloc, - /*.galloc = */ ggml_gallocr_new(), + .talloc = talloc, + .galloc = ggml_gallocr_new(), }; return alloc; } diff --git a/ggml-backend.c b/ggml-backend.cpp similarity index 99% rename from ggml-backend.c rename to ggml-backend.cpp index f6e5fceed0f4d..f258f69e32c44 100644 --- a/ggml-backend.c +++ b/ggml-backend.cpp @@ -25,10 +25,10 @@ ggml_backend_buffer_t ggml_backend_buffer_init( GGML_ASSERT(iface.get_base != NULL); (*buffer) = (struct ggml_backend_buffer) { - /* .interface = */ iface, - /* .backend = */ backend, - /* .context = */ context, - /* .size = */ size, + .iface = iface, + .backend = backend, + .context = context, + .size = size, }; return buffer; @@ -586,11 +586,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); sched->n_splits = 0; - struct ggml_init_params params = { - /*.mem_size = */ sizeof(sched->context_buffer), - /*.mem_buffer = */ sched->context_buffer, - /*.no_alloc = */ true - }; + struct ggml_init_params params( + //.mem_size = + sizeof(sched->context_buffer), + //.mem_buffer = + sched->context_buffer, + //.no_alloc = + true + ); if (sched->ctx != NULL) { ggml_free(sched->ctx); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 50e03de500747..b6d39cc29f4db 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7623,12 +7623,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 #endif // debug helpers - //printf("src0: %8d %8d %8d 
%8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); - //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); - //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); - //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); - //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); + // printf("JSON: { \"data\":{ \"src0\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"src1\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"dst\" : { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}}}\n", + // src0->name, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], + // ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name, + // src1->name, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name, + // dst->name, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], ggml_is_contiguous(dst), ggml_is_transposed(dst), ggml_type_name(dst->type), dst->name + // ); if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch @@ -8056,9 +8056,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ if (tensor->op == GGML_OP_MUL_MAT) { if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { -#ifndef NDEBUG + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); -#endif + return false; } } diff --git a/ggml-impl.h b/ggml-impl.h index 06c07339e9269..1bf20a4af3985 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -22,7 +22,7 @@ extern "C" { #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #define static_assert(cond, msg) _Static_assert(cond, msg) #else -#define static_assert(cond, msg) struct global_scope_noop_trick + //#define static_assert(cond, msg) struct global_scope_noop_trick #endif #endif diff --git a/ggml-internal.hpp b/ggml-internal.hpp new file mode 100644 index 0000000000000..0725451fcbd3e --- /dev/null +++ b/ggml-internal.hpp @@ -0,0 +1,258 @@ +struct ggml_context { + size_t mem_size; + void * mem_buffer; + bool mem_buffer_owned; + bool no_alloc; + bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers + + int n_objects; + + struct ggml_object * objects_begin; + struct ggml_object * objects_end; + + struct ggml_scratch scratch; + struct ggml_scratch scratch_save; + + ggml_context(): + mem_size(0), + mem_buffer(0), + mem_buffer_owned(0), + no_alloc(0), + no_alloc_save(0), + n_objects(0), + objects_begin(0), + objects_end(0), + 
scratch(), + scratch_save() + { + + } +}; + +struct ggml_context_container { + bool used; + + struct ggml_context context; + + ggml_context_container(): used(0),context(){ + + } +}; + +typedef double ggml_float; +typedef void * thread_ret_t; + +#define MAX_FREE_BLOCKS 256 + +struct free_block { + void * addr; + size_t size; +}; + +struct ggml_tallocr { + struct ggml_backend_buffer * buffer; + bool buffer_owned; + void * base; + size_t alignment; + + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; + + size_t max_size; + + bool measure; + +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; +#endif +}; + + +struct hash_node { + int n_children; + int n_views; +}; + +typedef struct ggml_tallocr * ggml_tallocr_t; +typedef struct ggml_gallocr * ggml_gallocr_t; + +struct ggml_gallocr { + ggml_tallocr_t talloc; + struct ggml_hash_set hash_set; + struct hash_node * hash_values; + size_t hash_values_size; + ggml_tallocr_t * hash_allocs; + int * parse_seq; + int parse_seq_len; +}; + +struct ggml_allocr { + ggml_tallocr_t talloc; + ggml_gallocr_t galloc; +}; + +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 + +struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_numa_nodes { + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system +}; + +struct ggml_state { + struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; + struct ggml_numa_nodes numa; + + ggml_state():contexts(), numa() + { + + } +}; + +struct gguf_str { + uint64_t n; // GGUFv2 + char * data; +}; + +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; +struct hash_map { + struct ggml_hash_set set; + struct ggml_tensor ** vals; +}; + +#if defined(_WIN32) +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; +#else +#include +using namespace std; +#endif + +struct ggml_compute_state_shared { + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; +}; +typedef pthread_t ggml_thread_t; +struct ggml_compute_state { + ggml_thread_t thrd; + int ith; + struct ggml_compute_state_shared * shared; +}; + +union gguf_value { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + uint64_t uint64; + int64_t int64; + double float64; + bool bool_; + + struct gguf_str str; + + struct gguf_array_T { + enum gguf_type type; + + uint64_t n; // GGUFv2 + void * data; + } arr; +}; + +struct ggml_lbfgs_iteration_data { + float alpha; + float ys; + float * s; + float * y; +}; + +struct gguf_kv { + struct gguf_str key; + + enum gguf_type type; + union gguf_value value; +}; + + + +struct gguf_header { + char magic[4]; + uint32_t version; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 +}; + +struct gguf_tensor_info { + struct gguf_str name; + + uint32_t n_dims; + uint64_t ne[GGML_MAX_DIMS]; + 
+ enum ggml_type type; + + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` + + // for writing API + const void * data; + size_t size; +}; + +struct gguf_context { + struct gguf_header header; + + struct gguf_kv * kv; + struct gguf_tensor_info * infos; + + size_t alignment; + size_t offset; // offset of `data` from beginning of file + size_t size; // size of `data` in bytes + + //uint8_t * padding; + void * data; +}; + +struct gguf_buf { + void * data; + size_t size; + size_t offset; +}; + + +#include "ggml-backend-impl.h" diff --git a/ggml-mpi.c b/ggml-mpi.cpp similarity index 100% rename from ggml-mpi.c rename to ggml-mpi.cpp diff --git a/ggml-quants.c b/ggml-quants.cpp similarity index 93% rename from ggml-quants.c rename to ggml-quants.cpp index 7285d5f7fbcc0..094fb8ccb6c9c 100644 --- a/ggml-quants.c +++ b/ggml-quants.cpp @@ -5,7 +5,7 @@ #include #include #include - +#include #ifdef __ARM_NEON // if YCM cannot find , make a symbolic link to it, for example: @@ -425,7 +425,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif // reference implementation for deterministic creation of model files -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { +void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -462,11 +462,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict } } -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { +void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q4_0_reference(x, y, k); } -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { +void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k) { const int qk = QK4_1; assert(k % qk == 0); @@ -503,11 +503,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict } } -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { +void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q4_1_reference(x, y, k); } -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { +void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -551,11 +551,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict } } -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { +void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q5_0_reference(x, y, k); } -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { +void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k) { const int qk = QK5_1; assert(k % qk == 0); @@ -599,12 +599,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict } } -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { +void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q5_1_reference(x, y, k); } // reference implementation for deterministic creation of model files -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int 
k) { +void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -629,12 +629,12 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict } } -void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0 * restrict y = vy; + block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -818,7 +818,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { } // reference implementation for deterministic creation of model files -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { +void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k) { assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -853,11 +853,11 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict } } -void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; - block_q8_1 * restrict y = vy; + block_q8_1 * __restrict__ y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1071,7 +1071,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { #endif } -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -1091,7 +1091,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int } } -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK4_1; assert(k % qk == 0); @@ -1112,7 +1112,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int } } -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -1138,7 +1138,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int } } -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK5_1; assert(k % qk == 0); @@ -1165,7 +1165,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int } } -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK8_0; assert(k % qk == 0); @@ -1195,7 +1195,7 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { +static float make_qx_quants(int n, int nmax, const float * __restrict__ x, int8_t * __restrict__ L, int rmse_type) { float max = 0; float amax = 0; 
for (int i = 0; i < n; ++i) { @@ -1259,7 +1259,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * __restrict__ x, int8_t * __restrict__ L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1318,7 +1318,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * __restrict__ x, uint8_t * __restrict__ L, float * __restrict__ the_min, int ntry, float alpha) { float min = x[0]; float max = x[0]; @@ -1361,8 +1361,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * __restrict__ x, const float * __restrict__ weights, + uint8_t * __restrict__ L, float * __restrict__ the_min, uint8_t * __restrict__ Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -1443,7 +1443,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f } #if QK_K == 256 -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -1455,7 +1455,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { +void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1532,7 +1532,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { +void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1578,15 +1578,15 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int } } -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ vy, int k) { quantize_row_q2_K_reference(x, vy, k); } -size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q2_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + block_q2_K * __restrict__ y = (block_q2_K *)dst + j/QK_K; quantize_row_q2_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q2_K)); @@ -1594,7 +1594,7 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, 
int n //========================= 3-bit (de)-quantization -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { +void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1708,7 +1708,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } #if QK_K == 256 -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1722,8 +1722,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * __restrict__ q = x[i].qs; + const uint8_t * __restrict__ hm = x[i].hmask; uint8_t m = 1; memcpy(aux, x[i].scales, 12); @@ -1758,7 +1758,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } } #else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); assert(QK_K == 64); const int nb = k / QK_K; @@ -1767,8 +1767,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * __restrict__ q = x[i].qs; + const uint8_t * __restrict__ hm = x[i].hmask; const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); @@ -1791,15 +1791,15 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } #endif -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ vy, int k) { quantize_row_q3_K_reference(x, vy, k); } -size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q3_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + block_q3_K * __restrict__ y = (block_q3_K *)dst + j/QK_K; quantize_row_q3_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q3_K)); @@ -1807,7 +1807,7 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { +void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1914,7 +1914,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { +void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1953,18 +1953,18 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int } } -void quantize_row_q4_K(const float * restrict x, void * 
restrict vy, int k) { +void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK_K == 0); - block_q4_K * restrict y = vy; + block_q4_K * __restrict__ y = vy; quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q4_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + block_q4_K * __restrict__ y = (block_q4_K *)dst + j/QK_K; quantize_row_q4_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q4_K)); @@ -1972,7 +1972,7 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { +void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2042,8 +2042,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * __restrict__ qh = y[i].qh; + uint8_t * __restrict__ ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -2090,8 +2090,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * __restrict__ qh = y[i].qh; + uint8_t * __restrict__ ql = y[i].qs; memset(qh, 0, QK_K/8); for (int j = 0; j < 32; ++j) { @@ -2114,7 +2114,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { +void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2143,7 +2143,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } #else float d = GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict s = x[i].scales; + const int8_t * __restrict__ s = x[i].scales; for (int l = 0; l < 8; ++l) { y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 
0 : 16)); @@ -2159,18 +2159,18 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } } -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK_K == 0); - block_q5_K * restrict y = vy; + block_q5_K * __restrict__ y = vy; quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q5_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + block_q5_K * __restrict__ y = (block_q5_K *)dst + j/QK_K; quantize_row_q5_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q5_K)); @@ -2178,7 +2178,7 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { +void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2228,8 +2228,8 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * __restrict__ ql = y[i].ql; + uint8_t * __restrict__ qh = y[i].qh; #if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -2260,7 +2260,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { +void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2268,9 +2268,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict ql = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict sc = x[i].scales; + const uint8_t * __restrict__ ql = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ sc = x[i].scales; #if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { @@ -2307,9 +2307,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int } } -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK_K == 0); - block_q6_K * restrict y = vy; + block_q6_K * __restrict__ y = vy; quantize_row_q6_K_reference(x, y, k); } @@ -2318,7 +2318,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + block_q6_K * __restrict__ y = (block_q6_K *)dst + j/QK_K; quantize_row_q6_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q6_K)); @@ -2326,7 +2326,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * //===================================== Q8_K ============================================== -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) 
{ +void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2363,7 +2363,7 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2374,7 +2374,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int } } -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { +void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q8_K_reference(x, y, k); } @@ -2423,14 +2423,15 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { + //fprintf(stderr, "%s: n:%d s:%f vx:%p vy:%p\n", __func__, n,*s, vx, vy); const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q4_0 * __restrict__ x = vx; + const block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2439,10 +2440,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q4_0 * __restrict__ x0 = &x[i + 0]; + const block_q4_0 * __restrict__ x1 = &x[i + 1]; + const block_q8_0 * __restrict__ y0 = &y[i + 0]; + const block_q8_0 * __restrict__ y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2733,14 +2734,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, #endif } -void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_1_q8_1(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q4_1 * __restrict__ x = vx; + const block_q8_1 * __restrict__ y = vy; // TODO: add WASM SIMD #if defined(__ARM_NEON) @@ -2752,10 +2753,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i + 0]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q4_1 * __restrict__ x0 = &x[i + 0]; + const block_q4_1 * __restrict__ x1 = &x[i + 1]; + const block_q8_1 * __restrict__ y0 = &y[i + 0]; + const block_q8_1 * __restrict__ y1 = &y[i + 1]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; @@ -2893,15 +2894,15 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_0_q8_0(const int n, 
float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_0_q8_0(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_0); - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q5_0 * __restrict__ x = vx; + const block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2916,10 +2917,10 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q5_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q5_0 * __restrict__ x0 = &x[i]; + const block_q5_0 * __restrict__ x1 = &x[i + 1]; + const block_q8_0 * __restrict__ y0 = &y[i]; + const block_q8_0 * __restrict__ y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3000,8 +3001,8 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q8_0 * restrict y0 = &y[i]; + const block_q5_0 * __restrict__ x0 = &x[i]; + const block_q8_0 * __restrict__ y0 = &y[i]; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3199,15 +3200,15 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_1_q8_1(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_1); - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q5_1 * __restrict__ x = vx; + const block_q8_1 * __restrict__ y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3225,10 +3226,10 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q5_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q5_1 * __restrict__ x0 = &x[i]; + const block_q5_1 * __restrict__ x1 = &x[i + 1]; + const block_q8_1 * __restrict__ y0 = &y[i]; + const block_q8_1 * __restrict__ y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3314,8 +3315,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q8_1 * restrict y0 = &y[i]; + const block_q5_1 * __restrict__ x0 = &x[i]; + const block_q8_1 * __restrict__ y0 = &y[i]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s; @@ -3518,14 +3519,14 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q8_0_q8_0(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_0; const int nb = n / 
qk; assert(n % qk == 0); - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q8_0 * __restrict__ x = vx; + const block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3534,10 +3535,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q8_0 * __restrict__ x0 = &x[i + 0]; + const block_q8_0 * __restrict__ x1 = &x[i + 1]; + const block_q8_0 * __restrict__ y0 = &y[i + 0]; + const block_q8_0 * __restrict__ y1 = &y[i + 1]; const int8x16_t x0_0 = vld1q_s8(x0->qs); const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); @@ -3642,10 +3643,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri } #if QK_K == 256 -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -3667,9 +3668,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; + const uint8_t * __restrict__ sc = x[i].scales; const uint8x16_t mins_and_scales = vld1q_u8(sc); const uint8x16_t scales = vandq_u8(mins_and_scales, m4); @@ -3746,8 +3747,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); @@ -3813,8 +3814,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; // load mins and scales from block_q2_K.scales[QK_K/16] const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -4035,10 +4036,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * __restrict__ x = vx; 
+ const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -4061,9 +4062,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; + const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4114,8 +4115,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * __restrict__ db = (const uint8_t *)&ud; + const uint8_t * __restrict__ mb = (const uint8_t *)&um; float summs = 0; @@ -4126,10 +4127,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4166,8 +4167,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * __restrict__ db = (const uint8_t *)&ud; + const uint8_t * __restrict__ mb = (const uint8_t *)&um; float summs = 0; @@ -4178,10 +4179,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4227,9 +4228,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * __restrict__ q2 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; + const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4311,14 +4312,14 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); const uint32_t 
kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -4346,9 +4347,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const uint8_t * __restrict__ qh = x[i].hmask; + const int8_t * __restrict__ q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -4454,8 +4455,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); @@ -4559,8 +4560,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; // Set up scales aux = (const uint32_t *)x[i].scales; @@ -4694,9 +4695,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const uint8_t * __restrict__ qh = x[i].hmask; + const int8_t * __restrict__ q8 = y[i].qs; memcpy(aux, x[i].scales, 12); utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); @@ -4806,11 +4807,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const uint8_t * __restrict__ hm = x[i].hmask; + const int8_t * __restrict__ q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * __restrict__ a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; @@ -4855,11 +4856,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -4947,8 +4948,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ 
-5018,8 +5019,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5098,8 +5099,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q3 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5173,10 +5174,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * __restrict__ q3 = x[i].qs; + const uint8_t * __restrict__ hm = x[i].hmask; + const int8_t * __restrict__ q8 = y[i].qs; + int8_t * __restrict__ a = aux8; for (int l = 0; l < 8; ++l) { a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4); a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 0 : 4); @@ -5213,11 +5214,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -5262,8 +5263,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -5334,8 +5335,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri utmp[2] = uaux; utmp[0] &= kmask1; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -5393,8 +5394,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -5494,8 +5495,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * 
restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; vl = 32; @@ -5548,10 +5549,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * __restrict__ a = aux8; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); a += 32; @@ -5594,11 +5595,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif } #else -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -5618,14 +5619,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sum_mins = 0.f; uint16_t aux16[2]; - const uint8_t * restrict scales = (const uint8_t *)aux16; + const uint8_t * __restrict__ scales = (const uint8_t *)aux16; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; - const uint16_t * restrict a = (const uint16_t *)x[i].scales; + const uint16_t * __restrict__ a = (const uint16_t *)x[i].scales; aux16[0] = a[0] & 0x0f0f; aux16[1] = (a[0] >> 4) & 0x0f0f; @@ -5698,8 +5699,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m256i q4l = _mm256_and_si256(q4bits, m4); @@ -5744,8 +5745,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0); @@ -5778,16 +5779,16 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #elif defined __riscv_v_intrinsic uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * __restrict__ scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * __restrict__ b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; 
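For readers following these kernel edits: the NEON, AVX and RISC-V paths in the surrounding hunks all implement the same arithmetic as a short scalar loop. The sketch below restates the simplest case, q8_0 × q8_0, with a simplified block layout (a float scale instead of the ggml_fp16_t scale ggml actually stores) purely to make the structure visible; the K-quant kernels being edited here follow the same pattern with additional per-sub-block scales and mins.

#include <assert.h>
#include <stdint.h>

#define QK8_0 32                 /* elements per block, as in ggml-quants.h */

/* simplified block: ggml stores the scale as fp16, float here for clarity */
typedef struct { float d; int8_t qs[QK8_0]; } block_q8_0_ref;

/* scalar reference: *s = sum over blocks of d_x * d_y * <qs_x, qs_y> */
static void vec_dot_q8_0_ref(int n, float * s,
                             const block_q8_0_ref * x, const block_q8_0_ref * y) {
    assert(n % QK8_0 == 0);
    const int nb = n / QK8_0;
    float sumf = 0.0f;
    for (int i = 0; i < nb; ++i) {
        int sumi = 0;
        for (int j = 0; j < QK8_0; ++j) {
            sumi += x[i].qs[j] * y[i].qs[j];   /* int8 products accumulated in int */
        }
        sumf += x[i].d * y[i].d * (float) sumi;
    }
    *s = sumf;
}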
@@ -5827,17 +5828,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri memset(sums, 0, 8*sizeof(float)); uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * __restrict__ scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - uint8_t * restrict a = aux8; + const uint8_t * __restrict__ q4 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; + uint8_t * __restrict__ a = aux8; for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF; for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * __restrict__ b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5861,11 +5862,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -5911,9 +5912,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -5976,8 +5977,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; #if QK_K == 256 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6065,8 +6066,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -6163,9 +6164,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri vl = 8; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const uint8_t * __restrict__ hm = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; @@ -6249,11 +6250,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].qs; + const uint8_t 
* __restrict__ hm = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * __restrict__ a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); @@ -6302,11 +6303,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -6328,9 +6329,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; const uint8x8_t qhbits = vld1_u8(qh); @@ -6387,8 +6388,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6433,8 +6434,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const int8_t * __restrict__ q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6490,9 +6491,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q5 = x[i].qs; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); @@ -6560,10 +6561,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * __restrict__ q4 = x[i].qs; + const uint8_t * __restrict__ hm = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; + int8_t * __restrict__ a = aux8; for (int l = 0; l < 32; ++l) { a[l+ 0] = q4[l] & 0xF; a[l+32] = q4[l] >> 4; @@ -6574,7 +6575,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri } const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; + const int8_t * __restrict__ sc = x[i].scales; for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; @@ -6591,11 +6592,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #if QK_K == 256 -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, 
const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * __restrict__ x = (const block_q6_K *)vx; + const block_q8_K * __restrict__ y = (const block_q8_K *)vy; const int nb = n / QK_K; @@ -6618,11 +6619,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q6 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * __restrict__ scale = x[i].scales; const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); @@ -6750,9 +6751,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6830,9 +6831,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6942,11 +6943,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q6 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * __restrict__ scale = x[i].scales; size_t vl; @@ -7030,11 +7031,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * __restrict__ a = aux8; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; @@ -7067,11 +7068,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { assert(n % QK_K == 0); - 
const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * __restrict__ x = vx; + const block_q8_K * __restrict__ y = vy; const int nb = n / QK_K; @@ -7094,11 +7095,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q6 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * __restrict__ scale = x[i].scales; int32_t isum = 0; @@ -7157,9 +7158,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ -7214,9 +7215,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ -7281,11 +7282,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q6 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * __restrict__ scale = x[i].scales; int32_t isum = 0; @@ -7350,11 +7351,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * __restrict__ q4 = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * __restrict__ a = aux8; for (int l = 0; l < 16; ++l) { a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; diff --git a/ggml-quants.h b/ggml-quants.h index 70c12c27465e8..2706e36ada7d3 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_ // Quantization -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); -void 
quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); - -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); - -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); - -void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k); +void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k); +void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k); +void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k); +void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k); +void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k); + +void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k); +void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k); +void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k); +void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k); +void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k); +void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k); + +void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k); + +void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ 
y, int k); +void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k); +void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k); // Dequantization -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); -//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); - -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); +void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k); +//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k); + +void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k); +void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k); // Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q6_K_q8_K(int n, float * 
restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); + +void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); +void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy); diff --git a/ggml.c b/ggml.cpp similarity index 96% rename from ggml.c rename to ggml.cpp index f92292b39c635..9d2ab8ebd1847 100644 --- a/ggml.c +++ b/ggml.cpp @@ -1,3 +1,9 @@ + +//https://github.com/Neargye/magic_enum.git +#include + + + #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows #define _USE_MATH_DEFINES // For M_PI on MSVC @@ -58,9 +64,6 @@ static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { (void) unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); @@ -86,9 +89,15 @@ static int sched_yield (void) { } #else #include +//#include +#ifdef __cplusplus +#include +using namespace std; +#else #include +#endif + -typedef void * thread_ret_t; #include #include @@ -100,6 +109,8 @@ typedef void * thread_ret_t; #include #endif +#include "ggml-internal.hpp" + #if defined(__APPLE__) #include #endif @@ -269,7 +280,7 @@ inline static void * ggml_aligned_malloc(size_t size) { #endif // floating point type used to accumulate sums -typedef double ggml_float; + #undef MIN #undef MAX @@ -409,196 +420,11 @@ int64_t ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); +static void ggml_vec_dot_f32(const int n, float * __restrict__ s, const float * __restrict__ x, const float * __restrict__ y); +static void ggml_vec_dot_f16(const int n, float * __restrict__ s, ggml_fp16_t * __restrict__ x, ggml_fp16_t * __restrict__ y); + +static ggml_type_traits_t type_traits[GGML_TYPE_COUNT]; -static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { - [GGML_TYPE_I8] = { - .type_name = "i8", - .blck_size = 1, - .type_size = sizeof(int8_t), - .is_quantized = false, - }, - [GGML_TYPE_I16] = { - .type_name = "i16", - .blck_size = 1, - .type_size = sizeof(int16_t), - .is_quantized = false, - }, - [GGML_TYPE_I32] = 
{ - .type_name = "i32", - .blck_size = 1, - .type_size = sizeof(int32_t), - .is_quantized = false, - }, - [GGML_TYPE_F32] = { - .type_name = "f32", - .blck_size = 1, - .type_size = sizeof(float), - .is_quantized = false, - .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, - .vec_dot_type = GGML_TYPE_F32, - }, - [GGML_TYPE_F16] = { - .type_name = "f16", - .blck_size = 1, - .type_size = sizeof(ggml_fp16_t), - .is_quantized = false, - .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, - .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, - .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, - .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, - .vec_dot_type = GGML_TYPE_F16, - }, - [GGML_TYPE_Q4_0] = { - .type_name = "q4_0", - .blck_size = QK4_0, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q4_0, - .from_float = quantize_row_q4_0, - .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, - .vec_dot = ggml_vec_dot_q4_0_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, - }, - [GGML_TYPE_Q4_1] = { - .type_name = "q4_1", - .blck_size = QK4_1, - .type_size = sizeof(block_q4_1), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q4_1, - .from_float = quantize_row_q4_1, - .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, - .vec_dot = ggml_vec_dot_q4_1_q8_1, - .vec_dot_type = GGML_TYPE_Q8_1, - }, - [4] = { // GGML_TYPE_Q4_2 - .type_name = "DEPRECATED", - .blck_size = 0, - .type_size = 0, - .is_quantized = false, - .to_float = NULL, - .from_float = NULL, - .from_float_reference = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_COUNT, - }, - [5] = { // GGML_TYPE_Q4_3 - .type_name = "DEPRECATED", - .blck_size = 0, - .type_size = 0, - .is_quantized = false, - .to_float = NULL, - .from_float = NULL, - .from_float_reference = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_COUNT, - }, - [GGML_TYPE_Q5_0] = { - .type_name = "q5_0", - .blck_size = QK5_0, - .type_size = sizeof(block_q5_0), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q5_0, - .from_float = quantize_row_q5_0, - .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, - .vec_dot = ggml_vec_dot_q5_0_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, - }, - [GGML_TYPE_Q5_1] = { - .type_name = "q5_1", - .blck_size = QK5_1, - .type_size = sizeof(block_q5_1), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q5_1, - .from_float = quantize_row_q5_1, - .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, - .vec_dot = ggml_vec_dot_q5_1_q8_1, - .vec_dot_type = GGML_TYPE_Q8_1, - }, - [GGML_TYPE_Q8_0] = { - .type_name = "q8_0", - .blck_size = QK8_0, - .type_size = sizeof(block_q8_0), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q8_0, - .from_float = quantize_row_q8_0, - .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, - .vec_dot = ggml_vec_dot_q8_0_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, - }, - [GGML_TYPE_Q8_1] = { - .type_name = "q8_1", - .blck_size = QK8_1, - .type_size = sizeof(block_q8_1), - .is_quantized = true, - .from_float = quantize_row_q8_1, - .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, - .vec_dot_type = GGML_TYPE_Q8_1, - }, - [GGML_TYPE_Q2_K] = { - .type_name = "q2_K", - .blck_size = QK_K, - .type_size = sizeof(block_q2_K), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q2_K, - .from_float = quantize_row_q2_K, - .from_float_reference = 
(ggml_from_float_t) quantize_row_q2_K_reference, - .vec_dot = ggml_vec_dot_q2_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q3_K] = { - .type_name = "q3_K", - .blck_size = QK_K, - .type_size = sizeof(block_q3_K), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q3_K, - .from_float = quantize_row_q3_K, - .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, - .vec_dot = ggml_vec_dot_q3_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q4_K] = { - .type_name = "q4_K", - .blck_size = QK_K, - .type_size = sizeof(block_q4_K), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q4_K, - .from_float = quantize_row_q4_K, - .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, - .vec_dot = ggml_vec_dot_q4_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q5_K] = { - .type_name = "q5_K", - .blck_size = QK_K, - .type_size = sizeof(block_q5_K), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q5_K, - .from_float = quantize_row_q5_K, - .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, - .vec_dot = ggml_vec_dot_q5_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q6_K] = { - .type_name = "q6_K", - .blck_size = QK_K, - .type_size = sizeof(block_q6_K), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q6_K, - .from_float = quantize_row_q6_K, - .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, - .vec_dot = ggml_vec_dot_q6_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - }, - [GGML_TYPE_Q8_K] = { - .type_name = "q8_K", - .blck_size = QK_K, - .type_size = sizeof(block_q8_K), - .is_quantized = true, - .from_float = quantize_row_q8_K, - } -}; // For internal test use ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { @@ -1160,7 +986,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +static void ggml_vec_dot_f32(const int n, float * __restrict__ s, const float * __restrict__ x, const float * __restrict__ y) { #ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1197,7 +1023,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest *s = sumf; } -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { +static void ggml_vec_dot_f16(const int n, float * __restrict__ s, ggml_fp16_t * __restrict__ x, ggml_fp16_t * __restrict__ y) { ggml_float sumf = 0.0; #if defined(GGML_SIMD) @@ -1235,10 +1061,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes -inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { +inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * __restrict__ s, void * __restrict__ xv, ggml_fp16_t * __restrict__ y) { ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + 
ggml_fp16_t * __restrict__ x[GGML_VEC_DOT_UNROLL]; for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); @@ -1288,7 +1114,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re } } -inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +inline static void ggml_vec_mad_f32(const int n, float * __restrict__ y, const float * __restrict__ x, const float v) { #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1320,10 +1146,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float } // xs and vs are byte strides of x and v -inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * __restrict__ y, const float * __restrict__ xv, const float * __restrict__ vv) { - const float * restrict x[GGML_VEC_MAD_UNROLL]; - const float * restrict v[GGML_VEC_MAD_UNROLL]; + const float * __restrict__ x[GGML_VEC_MAD_UNROLL]; + const float * __restrict__ v[GGML_VEC_MAD_UNROLL]; for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { x[i] = (const float *) ((const char *) xv + i*xs); @@ -1794,54 +1620,17 @@ static void ggml_setup_op_has_task_pass(void) { // ggml context // -struct ggml_context { - size_t mem_size; - void * mem_buffer; - bool mem_buffer_owned; - bool no_alloc; - bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers - - int n_objects; - - struct ggml_object * objects_begin; - struct ggml_object * objects_end; - - struct ggml_scratch scratch; - struct ggml_scratch scratch_save; -}; - -struct ggml_context_container { - bool used; - - struct ggml_context context; -}; // // NUMA support // -#define GGML_NUMA_MAX_NODES 8 -#define GGML_NUMA_MAX_CPUS 512 -struct ggml_numa_node { - uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node - uint32_t n_cpus; -}; - -struct ggml_numa_nodes { - struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; - uint32_t n_nodes; - uint32_t total_cpus; // hardware threads on system -}; // // ggml state // -struct ggml_state { - struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; - struct ggml_numa_nodes numa; -}; // global state static struct ggml_state g_state; @@ -2175,18 +1964,257 @@ static inline int ggml_up(int n, int m) { //////////////////////////////////////////////////////////////////////////////// +static size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT]={}; + +static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {}; + struct ggml_context * ggml_init(struct ggml_init_params params) { - // make this function thread safe - ggml_critical_section_start(); - static bool is_first_call = true; + GGUF_TYPE_SIZE[GGUF_TYPE_UINT8] = sizeof(uint8_t); + GGUF_TYPE_SIZE [GGUF_TYPE_INT8] = sizeof(int8_t); + GGUF_TYPE_SIZE[GGUF_TYPE_UINT16] = sizeof(uint16_t); + GGUF_TYPE_SIZE [GGUF_TYPE_INT16] = sizeof(int16_t); + GGUF_TYPE_SIZE [GGUF_TYPE_UINT32] = sizeof(uint32_t); + GGUF_TYPE_SIZE [GGUF_TYPE_INT32] = sizeof(int32_t); + GGUF_TYPE_SIZE [GGUF_TYPE_FLOAT32] = sizeof(float); + GGUF_TYPE_SIZE [GGUF_TYPE_BOOL] = sizeof(bool); + GGUF_TYPE_SIZE [GGUF_TYPE_STRING] = sizeof(struct gguf_str); + GGUF_TYPE_SIZE [GGUF_TYPE_UINT64] = sizeof(uint64_t); + GGUF_TYPE_SIZE [GGUF_TYPE_INT64] = sizeof(int64_t); + GGUF_TYPE_SIZE [GGUF_TYPE_FLOAT64] = sizeof(double); + GGUF_TYPE_SIZE [GGUF_TYPE_ARRAY] = 0; 
// undefined + + GGUF_TYPE_NAME[GGUF_TYPE_UINT8] = "u8"; + GGUF_TYPE_NAME[GGUF_TYPE_INT8] = "i8"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT16] = "u16"; + GGUF_TYPE_NAME[GGUF_TYPE_INT16] = "i16"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT32] = "u32"; + GGUF_TYPE_NAME[GGUF_TYPE_INT32] = "i32"; + GGUF_TYPE_NAME[GGUF_TYPE_FLOAT32] = "f32"; + GGUF_TYPE_NAME[GGUF_TYPE_BOOL] = "bool"; + GGUF_TYPE_NAME[GGUF_TYPE_STRING] = "str"; + GGUF_TYPE_NAME[GGUF_TYPE_ARRAY] = "arr"; + GGUF_TYPE_NAME[GGUF_TYPE_UINT64] = "u64"; + GGUF_TYPE_NAME[GGUF_TYPE_INT64] = "i64"; + GGUF_TYPE_NAME[GGUF_TYPE_FLOAT64] = "f64"; + + type_traits[GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, + //.from_float = + }; + type_traits[GGML_TYPE_I16] = { + .type_name = "i16", + .blck_size = 1, + .type_size = sizeof(int16_t), + .is_quantized = false, + }; + type_traits[GGML_TYPE_I32] = { + .type_name = "i32", + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, + }; + + type_traits[GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, + .vec_dot_type = GGML_TYPE_F32, + }; - if (is_first_call) { - // initialize time system (required on Windows) - ggml_time_init(); + type_traits[GGML_TYPE_F16] = { + .type_name = "f16", + .blck_size = 1, + .type_size = sizeof(ggml_fp16_t), + .is_quantized = false, + .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, + .vec_dot_type = GGML_TYPE_F16, + }; + type_traits[GGML_TYPE_Q4_0] = { + .type_name = "q4_0", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, + .vec_dot = ggml_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }; + + type_traits[GGML_TYPE_Q4_1] = { + .type_name = "q4_1", + .blck_size = QK4_1, + .type_size = sizeof(block_q4_1), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_1, + .from_float = quantize_row_q4_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, + .vec_dot = ggml_vec_dot_q4_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + }; + + type_traits[4] = { //GGML_TYPE_Q4_2 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_COUNT, + }; + + type_traits[5] = { // GGML_TYPE_Q4_3 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + .to_float = NULL, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_COUNT, + }; + + type_traits[GGML_TYPE_Q5_0] = { + .type_name = "q5_0", + .blck_size = QK5_0, + .type_size = sizeof(block_q5_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_0, + .from_float = quantize_row_q5_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, + .vec_dot = ggml_vec_dot_q5_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }; + + type_traits[GGML_TYPE_Q5_1] = { + .type_name = "q5_1", + .blck_size = QK5_1, + .type_size = sizeof(block_q5_1), + .is_quantized = true, + .to_float = (ggml_to_float_t) 
dequantize_row_q5_1, + .from_float = quantize_row_q5_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, + .vec_dot = ggml_vec_dot_q5_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + }; + + type_traits[GGML_TYPE_Q8_0] = { + .type_name = "q8_0", + .blck_size = QK8_0, + .type_size = sizeof(block_q8_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q8_0, + .from_float = quantize_row_q8_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, + .vec_dot = ggml_vec_dot_q8_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }; + + type_traits[GGML_TYPE_Q8_1] = { + .type_name = "q8_1", + .blck_size = QK8_1, + .type_size = sizeof(block_q8_1), + .is_quantized = true, + .from_float = quantize_row_q8_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, + .vec_dot_type = GGML_TYPE_Q8_1, + }; + + type_traits[GGML_TYPE_Q2_K] = { + .type_name = "q2_K", + .blck_size = QK_K, + .type_size = sizeof(block_q2_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_K, + .from_float = quantize_row_q2_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, + .vec_dot = ggml_vec_dot_q2_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }; - // initialize GELU, Quick GELU, SILU and EXP F32 tables - { + + type_traits[GGML_TYPE_Q3_K] = { + .type_name = "q3_K", + .blck_size = QK_K, + .type_size = sizeof(block_q3_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q3_K, + .from_float = quantize_row_q3_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, + .vec_dot = ggml_vec_dot_q3_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }; + + type_traits[GGML_TYPE_Q4_K] = { + .type_name = "q4_K", + .blck_size = QK_K, + .type_size = sizeof(block_q4_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_K, + .from_float = quantize_row_q4_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, + .vec_dot = ggml_vec_dot_q4_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }; + + type_traits[GGML_TYPE_Q5_K] = { + .type_name = "q5_K", + .blck_size = QK_K, + .type_size = sizeof(block_q5_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_K, + .from_float = quantize_row_q5_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, + .vec_dot = ggml_vec_dot_q5_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }; + + type_traits[GGML_TYPE_Q6_K] = { + .type_name = "q6_K", + .blck_size = QK_K, + .type_size = sizeof(block_q6_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float = quantize_row_q6_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, + .vec_dot = ggml_vec_dot_q6_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }; + + type_traits[GGML_TYPE_Q8_K] = { + .type_name = "q8_K", + .blck_size = QK_K, + .type_size = sizeof(block_q8_K), + .is_quantized = true, + .from_float = quantize_row_q8_K, + //.to_float = dequantize_row_q8_K, //TODOFITXME + }; + + + struct ggml_context * ctx = NULL; + static bool is_first_call = true; + // make this function thread safe + ggml_critical_section_start(); + + + if (is_first_call) { + // initialize time system (required on Windows) + ggml_time_init(); + + // initialize GELU, Quick GELU, SILU and EXP F32 tables + { const uint64_t t_start = ggml_time_us(); UNUSED(t_start); ggml_fp16_t ii; @@ -2209,13 +2237,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { { const uint64_t t_start = 
ggml_time_us(); UNUSED(t_start); - g_state = (struct ggml_state) { - /*.contexts =*/ { { 0 } }, - /*.numa =*/ { - .n_nodes = 0, - .total_cpus = 0, - }, - }; + g_state = ggml_state(); + + + + for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) { g_state.contexts[i].used = false; @@ -2238,7 +2264,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } // find non-used context in g_state - struct ggml_context * ctx = NULL; + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { if (!g_state.contexts[i].used) { @@ -2265,18 +2291,19 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN); - *ctx = (struct ggml_context) { - /*.mem_size =*/ mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), - /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, - /*.no_alloc =*/ params.no_alloc, - /*.no_alloc_save =*/ params.no_alloc, - /*.n_objects =*/ 0, - /*.objects_begin =*/ NULL, - /*.objects_end =*/ NULL, - /*.scratch =*/ { 0, 0, NULL, }, - /*.scratch_save =*/ { 0, 0, NULL, }, - }; + + (*ctx).mem_size = mem_size; + (*ctx).mem_buffer = params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), + (*ctx).mem_buffer_owned = params.mem_buffer ? false : true; + (*ctx).no_alloc = params.no_alloc; + (*ctx).no_alloc_save = params.no_alloc; + (*ctx).n_objects = 0; + (*ctx).objects_begin = NULL; + (*ctx).objects_end = NULL; + ggml_scratch a; + (*ctx).scratch = a; + (*ctx).scratch_save = a; + // }; GGML_ASSERT(ctx->mem_buffer != NULL); @@ -2402,7 +2429,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml // align to GGML_MEM_ALIGN size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN); - char * const mem_buffer = ctx->mem_buffer; + char * const mem_buffer = (char*)ctx->mem_buffer; struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { @@ -2411,13 +2438,11 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml assert(false); return NULL; } - - *obj_new = (struct ggml_object) { - .offs = cur_end + GGML_OBJECT_SIZE, - .size = size_needed, - .next = NULL, - .type = type, - }; + //*obj_new = //(struct ggml_object) { + (*obj_new).offs = cur_end + GGML_OBJECT_SIZE; + (*obj_new).size = size_needed; + (*obj_new).next = NULL; + (*obj_new).type = type; ggml_assert_aligned(mem_buffer + obj_new->offs); @@ -2475,7 +2500,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( return NULL; } - data = (char * const) ctx->scratch.data + ctx->scratch.offs; + data = (void*)(((char *)ctx->scratch.data) + ctx->scratch.offs); ctx->scratch.offs += data_size; } else { @@ -2490,28 +2515,29 @@ static struct ggml_tensor * ggml_new_tensor_impl( struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs); - *result = (struct ggml_tensor) { - /*.type =*/ type, - /*.backend =*/ GGML_BACKEND_CPU, - /*.buffer =*/ NULL, - /*.n_dims =*/ n_dims, - /*.ne =*/ { 1, 1, 1, 1 }, - /*.nb =*/ { 0, 0, 0, 0 }, - /*.op =*/ GGML_OP_NONE, - /*.op_params =*/ { 0 }, - /*.is_param =*/ false, - /*.grad =*/ NULL, - /*.src =*/ { NULL }, - /*.perf_runs =*/ 0, - /*.perf_cycles =*/ 0, - /*.perf_time_us =*/ 0, - /*.view_src =*/ view_src, - /*.view_offs =*/ view_offs, - /*.data =*/ obj_alloc_size > 0 ? 
(void *)(result + 1) : data, - /*.name =*/ { 0 }, - /*.extra =*/ NULL, - /*.padding =*/ { 0 }, - }; + // *result = (struct ggml_tensor) { + (*result).type = type; + (*result).backend = GGML_BACKEND_CPU; + (*result).buffer = NULL; + (*result).n_dims = n_dims; + for (int i =0; i < 4; i++){ + (*result).ne[i] = 1; + (*result).nb[i] = 0; + } + (*result).op = GGML_OP_NONE; + (*result).op_params[0] = 0 ; + (*result).is_param = false; + (*result).grad = NULL; + (*result).src[0] = NULL ; + (*result).perf_runs = 0; + (*result).perf_cycles = 0; + (*result).perf_time_us = 0; + (*result).view_src = view_src; + (*result).view_offs = view_offs; + (*result).data =obj_alloc_size > 0 ? (void *)(result + 1) : data; + (*result).name[0] = 0 ; + (*result).extra = NULL; + (*result).padding[0] = 0 ; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads //ggml_assert_aligned(result->data); @@ -2630,7 +2656,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { const int nc = tensor->ne[0]; const size_t n1 = tensor->nb[1]; - char * const data = tensor->data; + char * const data = (char*)tensor->data; switch (tensor->type) { case GGML_TYPE_I8: @@ -2682,7 +2708,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { const int nc = tensor->ne[0]; const size_t n1 = tensor->nb[1]; - char * const data = tensor->data; + char * const data = (char*)tensor->data; switch (tensor->type) { case GGML_TYPE_I8: @@ -2794,6 +2820,43 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { return 0.0f; } +void ggml_tensor_checksum(const struct ggml_tensor * tensor); +void ggml_tensor_checksum(const struct ggml_tensor * tensor) { +// const int64_t ne = ggml_nelements(tensor) ; +// float fmin=0; +// float ffirst=0; +// float fmax=0; +// float fsum=0; + +// for (int64_t j = 0; j < ne; ++j) { +// float f = ggml_get_f32_1d(tensor, j); +// if (j ==0) { +// ffirst = f; +// fmin = f; +// fmax = f; +// } +// fsum += f; +// if (f < fmin){ +// fmin = f; +// } +// if (f >fmax){ +// fmax = f; +// } +// } + +// auto type_name = magic_enum::enum_name(tensor->type); +// // color_name +// fprintf(stderr, "JSON: { \"cnt\":%ld, \"first\":%f,\"max\":%f,\"min\":%f,\"sum\":%f, \"name\":\"%s\", \"type\":\"%s\"}\n", +// ne, +// ffirst, +// fmax, +// fmin, +// fsum, +// tensor->name, +// std::string(type_name).c_str() +// ); +} + void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { if (!ggml_is_contiguous(tensor)) { int64_t id[4] = { 0, 0, 0, 0 }; @@ -2911,17 +2974,30 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } - case GGML_TYPE_F32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); - return ((float *)(tensor->data))[i]; - } - default: - { - GGML_ASSERT(false); - } - } + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q8_K: + case GGML_TYPE_F32: + { + //GGML_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } + + default: + { + GGML_ASSERT(false); + } + } return 0.0f; } @@ -3063,7 +3139,7 @@ struct ggml_tensor * ggml_view_tensor( struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { struct ggml_object * obj = ctx->objects_begin; - 
char * const mem_buffer = ctx->mem_buffer; + char * const mem_buffer = (char*)ctx->mem_buffer; while (obj != NULL) { if (obj->type == GGML_OBJECT_TENSOR) { @@ -3080,7 +3156,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE); obj = obj->next; - char * const mem_buffer = ctx->mem_buffer; + char * const mem_buffer = (char*)ctx->mem_buffer; while (obj != NULL) { if (obj->type == GGML_OBJECT_TENSOR) { @@ -3096,7 +3172,7 @@ struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { struct ggml_object * obj = ctx->objects_begin; - char * const mem_buffer = ctx->mem_buffer; + char * const mem_buffer = (char*)ctx->mem_buffer; while (obj != NULL) { if (obj->type == GGML_OBJECT_TENSOR) { @@ -3292,7 +3368,7 @@ static struct ggml_tensor * ggml_acc_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + int32_t params[] = { (int32_t)nb1, (int32_t)nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ACC; @@ -4145,7 +4221,7 @@ static struct ggml_tensor * ggml_set_impl( // make a view of the destination struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + int32_t params[] = { (int32_t)nb1,(int32_t) nb2, (int32_t)nb3, (int32_t)offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_SET; @@ -5402,7 +5478,7 @@ struct ggml_tensor * ggml_pool_2d( }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; + int32_t params[] = { op, k0, k1, s0, s1, (int32_t)p0, (int32_t)p1 }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_2D; @@ -5983,11 +6059,6 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32( } // ggml_map_custom1 -struct ggml_map_custom1_op_params { - ggml_custom1_op_t fun; - int n_tasks; - void * userdata; -}; static struct ggml_tensor * ggml_map_custom1_impl( struct ggml_context * ctx, @@ -6040,11 +6111,6 @@ struct ggml_tensor * ggml_map_custom1_inplace( // ggml_map_custom2 -struct ggml_map_custom2_op_params { - ggml_custom2_op_t fun; - int n_tasks; - void * userdata; -}; static struct ggml_tensor * ggml_map_custom2_impl( struct ggml_context * ctx, @@ -6101,11 +6167,6 @@ struct ggml_tensor * ggml_map_custom2_inplace( // ggml_map_custom3 -struct ggml_map_custom3_op_params { - ggml_custom3_op_t fun; - int n_tasks; - void * userdata; -}; static struct ggml_tensor * ggml_map_custom3_impl( struct ggml_context * ctx, @@ -6365,7 +6426,7 @@ static void ggml_compute_forward_dup_f16( GGML_ASSERT(false); // TODO: implement } } else { - //printf("%s: this is not optimal - fix me\n", __func__); + printf("%s: this is not optimal - fix me\n", __func__); if (dst->type == GGML_TYPE_F32) { size_t id = 0; @@ -6612,7 +6673,7 @@ static void ggml_compute_forward_dup_f32( GGML_ASSERT(false); // TODO: implement } } else { - //printf("%s: this is not optimal - fix me\n", __func__); + printf("%s: this is not optimal - fix me\n", __func__); if (dst->type == GGML_TYPE_F32) { size_t id = 0; @@ -8262,7 +8323,7 @@ static void ggml_compute_forward_repeat_back_f32( GGML_ASSERT(nb00 == 
sizeof(float)); if (ggml_is_contiguous(dst)) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); } else { for (int k3 = 0; k3 < ne3; k3++) { for (int k2 = 0; k2 < ne2; k2++) { @@ -9390,6 +9451,7 @@ static void ggml_compute_forward_mul_mat( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -9427,7 +9489,8 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - + //fprintf(stderr, "%s: params_type:%d src0:%p ->data %p src1:%p ->data %p\n", __func__, params->type, (const void*)src0, src0->data, (const void*)src1, src1->data); + #if defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(src0, src1, dst)) { if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { @@ -9492,7 +9555,7 @@ static void ggml_compute_forward_mul_mat( if (params->type == GGML_TASK_INIT) { if (src1->type != vec_dot_type) { - char * wdata = params->wdata; + char * wdata = (char*)params->wdata; const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); for (int64_t i13 = 0; i13 < ne13; ++i13) { @@ -9518,7 +9581,7 @@ static void ggml_compute_forward_mul_mat( const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = ne11*ne12*ne13; // src1 rows - //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + ///printf("nr0 = %ld, nr1 = %ld\n", nr0, nr1); // distribute the thread work across the inner or outer loop based on which one is larger @@ -9537,7 +9600,7 @@ static void ggml_compute_forward_mul_mat( const int64_t ir110 = dr1*ith1; const int64_t ir111 = MIN(ir110 + dr1, nr1); - //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + //printf("ir010 = %6ld, ir011 = %6ld, ir110 = %6ld, ir111 = %6ld\n", ir010, ir011, ir110, ir111); // threads with no work simply yield (not sure if it helps) if (ir010 >= ir011 || ir110 >= ir111) { @@ -9646,7 +9709,7 @@ static void ggml_compute_forward_out_prod_f32( return; } #endif - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); return; } @@ -9829,7 +9892,7 @@ static void ggml_compute_forward_out_prod_q_f32( // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (params->type == GGML_TASK_INIT) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + ggml_vec_set_f32(ne0*ne1*ne2*ne3, (float*)dst->data, 0); return; } @@ -11843,7 +11906,7 @@ static void ggml_compute_forward_pool_1d( struct ggml_tensor * dst) { const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; + enum ggml_op_pool op = (ggml_op_pool)opts[0]; const int k0 = opts[1]; const int s0 = opts[2]; const int p0 = opts[3]; @@ -11867,7 +11930,7 @@ static void ggml_compute_forward_pool_2d( } const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = opts[0]; + enum ggml_op_pool op = (ggml_op_pool)opts[0]; const int k0 = opts[1]; const int k1 = opts[2]; const int s0 = opts[3]; @@ -13696,6 +13759,105 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// +/* const char * ggml_op_name_table [] = { */ +/* "GGML_OP_NONE", */ +/* "GGML_OP_DUP", */ +/* "GGML_OP_ADD", */ +/* "GGML_OP_ADD1", */ +/* "GGML_OP_ACC", */ +/* "GGML_OP_SUB", */ +/* "GGML_OP_MUL", */ +/* "GGML_OP_DIV", */ +/* "GGML_OP_SQR", */ +/* "GGML_OP_SQRT", */ +/* "GGML_OP_LOG", */ +/* "GGML_OP_SUM", */ +/* 
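Several of the conversions above exist only because C++ is stricter than C about implicit conversions: ints no longer convert to enums (hence (ggml_op_pool)opts[0]) and size_t values no longer narrow silently into an int32_t array (hence the (int32_t) casts on the op params). A small self-contained illustration of both, using hypothetical names; static_cast is shown where the patch uses C-style casts:

#include <cstddef>
#include <cstdint>

enum pool_op { POOL_MAX = 0, POOL_AVG = 1 };   // hypothetical enum

// int -> enum now needs an explicit cast
static pool_op op_from_params(const int32_t * opts) {
    return static_cast<pool_op>(opts[0]);
}

// size_t -> int32_t narrowing also needs a cast
static void pack_params(int32_t out[2], size_t nb1, size_t offset) {
    out[0] = static_cast<int32_t>(nb1);
    out[1] = static_cast<int32_t>(offset);
}

int main() {
    const int32_t opts[1] = { 1 };
    int32_t packed[2];
    pack_params(packed, 64, 128);
    return op_from_params(opts) == POOL_AVG ? 0 : 1;
}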
"GGML_OP_SUM_ROWS", */ +/* "GGML_OP_MEAN", */ +/* "GGML_OP_ARGMAX", */ +/* "GGML_OP_REPEAT", */ +/* "GGML_OP_REPEAT_BACK", */ +/* "GGML_OP_CONCAT", */ +/* "GGML_OP_SILU_BACK", */ +/* "GGML_OP_NORM", */ +/* "GGML_OP_RMS_NORM", */ +/* "GGML_OP_RMS_NORM_BACK", */ +/* "GGML_OP_GROUP_NORM", */ +/* "GGML_OP_MUL_MAT", */ +/* "GGML_OP_OUT_PROD", */ +/* "GGML_OP_SCALE", */ +/* "GGML_OP_SET", */ +/* "GGML_OP_CPY", */ +/* "GGML_OP_CONT", */ +/* "GGML_OP_RESHAPE", */ +/* "GGML_OP_VIEW", */ +/* "GGML_OP_PERMUTE", */ +/* "GGML_OP_TRANSPOSE", */ +/* "GGML_OP_GET_ROWS", */ +/* "GGML_OP_GET_ROWS_BACK", */ +/* "GGML_OP_DIAG", */ +/* "GGML_OP_DIAG_MASK_INF", */ +/* "GGML_OP_DIAG_MASK_ZERO", */ +/* "GGML_OP_SOFT_MAX", */ +/* "GGML_OP_SOFT_MAX_BACK", */ +/* "GGML_OP_ROPE", */ +/* "GGML_OP_ROPE_BACK", */ +/* "GGML_OP_ALIBI", */ +/* "GGML_OP_CLAMP", */ +/* "GGML_OP_CONV_TRANSPOSE_1D", */ +/* "GGML_OP_IM2COL", */ +/* "GGML_OP_CONV_TRANSPOSE_2D", */ +/* "GGML_OP_POOL_1D", */ +/* "GGML_OP_POOL_2D", */ +/* "GGML_OP_UPSCALE", */ +/* "GGML_OP_FLASH_ATTN", */ +/* "GGML_OP_FLASH_FF", */ +/* "GGML_OP_FLASH_ATTN_BACK", */ +/* "GGML_OP_WIN_PART", */ +/* "GGML_OP_WIN_UNPART", */ +/* "GGML_OP_GET_REL_POS", */ +/* "GGML_OP_ADD_REL_POS", */ +/* "GGML_OP_UNARY", */ +/* "GGML_OP_MAP_UNARY", */ +/* "GGML_OP_MAP_BINARY", */ +/* "GGML_OP_MAP_CUSTOM1_F32", */ +/* "GGML_OP_MAP_CUSTOM2_F32", */ +/* "GGML_OP_MAP_CUSTOM3_F32", */ +/* "GGML_OP_MAP_CUSTOM1", */ +/* "GGML_OP_MAP_CUSTOM2", */ +/* "GGML_OP_MAP_CUSTOM3", */ +/* "GGML_OP_CROSS_ENTROPY_LOSS", */ +/* "GGML_OP_CROSS_ENTROPY_LOSS_BACK", */ +/* "GGML_OP_COUNT", */ +/* }; */ + + /* enum ggml_unary_op { */ + /* GGML_UNARY_OP_ABS, */ + /* GGML_UNARY_OP_SGN, */ + /* GGML_UNARY_OP_NEG, */ + /* GGML_UNARY_OP_STEP, */ + /* GGML_UNARY_OP_TANH, */ + /* GGML_UNARY_OP_ELU, */ + /* GGML_UNARY_OP_RELU, */ + /* GGML_UNARY_OP_GELU, */ + /* GGML_UNARY_OP_GELU_QUICK, */ + /* GGML_UNARY_OP_SILU, */ + /* GGML_UNARY_OP_LEAKY */ + /* }; */ + + /* enum ggml_object_type { */ + /* GGML_OBJECT_TENSOR, */ + /* GGML_OBJECT_GRAPH, */ + /* GGML_OBJECT_WORK_BUFFER */ + /* }; */ + + /* enum ggml_log_level { */ + /* GGML_LOG_LEVEL_ERROR = 2, */ + /* GGML_LOG_LEVEL_WARN = 3, */ + /* GGML_LOG_LEVEL_INFO = 4 */ + /* }; */ + + static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); @@ -13703,10 +13865,100 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm return; } + // float fmin1=0; + // // float ffirst1=0; + // float fmax1=0; + // float fsum1=0; + + // float fmin0=0; + // float ffirst0=0; + // float fmax0=0; + // float fsum0=0; + + // float fmin2=0; + // float ffirst2=0; + // float fmax2=0; + // float fsum2=0; + + // int64_t elem_src = ggml_nelements(tensor->src[0]); + // int64_t elem_src1 = 0; //ggml_nelements(tensor->src[1]); + + // if (tensor->src[0]) { + // const size_t size = ggml_nbytes(tensor->src[0])/sizeof(float); + // for (size_t i = 0; i src[0]->data))+i); + // } + // } + + // if (tensor->src[1]) { + // elem_src1 = ggml_nelements(tensor->src[1]); + // const size_t size = ggml_nbytes(tensor->src[1])/sizeof(float); + // for (size_t i = 0; i src[1]->data))+i); + // if (i ==0) { + // ffirst1 = f; + // fmin1 = f; + // fmax1 = f; + // } + // fsum1 += f; + // if (f < fmin1){ + // fmin1 = f; + // } + // if (f >fmax1){ + // fmax1 = f; + // } + // } + //} + #ifdef GGML_USE_CUBLAS bool skip_cpu = ggml_cuda_compute_forward(params, tensor); if (skip_cpu) { - return; + + if (tensor->src[1]) { + 
ggml_tensor_checksum(tensor->src[0]); + ggml_tensor_checksum(tensor->src[1]); + ggml_tensor_checksum(tensor); + + /* fprintf(stderr, "JSON:{\"bop\":\"%s\",\"src\":\"%s\",\"src2\":\"%s\",\"cnt1\":%ld,\"first1\":%f,\"max1\":%f,\"min1\":%f,\"sum1\":%f,\"cnt2\":%ld,\"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f,\"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f,\"dst\":\"%s\"}\n", */ + /* ggml_op_name_table[tensor->op], */ + /* tensor->src[0]->name, */ + /* tensor->src[1]->name, */ + /* elem_src, */ + /* ffirst0, */ + /* fmax0, */ + /* fmin0, */ + /* fsum0, */ + + /* elem_src1, */ + /* ffirst1, */ + /* fmax1, */ + /* fmin1, */ + /* fsum1, */ + + /* ffirst2, */ + /* fmax2, */ + /* fmin2, */ + /* fsum2, */ + + /* tensor->name); */ + } else { + ggml_tensor_checksum(tensor->src[0]); + ggml_tensor_checksum(tensor); + /* fprintf(stderr, "JSON: { \"uop\":%d, \"src\":\"%s\", \"cnt1\":%ld, \"first1\":%f,\"max1\":%f,\"min1\":%f,\"sum1\":%f, \"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f, \"dst\":\"%s\"}\n", */ + /* tensor->op, */ + /* tensor->src[0]->name, */ + /* elem_src, */ + /* ffirst0, */ + /* fmax0, */ + /* fmin0, */ + /* fsum0, */ + /* ffirst2, */ + /* fmax2, */ + /* fmin2, */ + /* fsum2, */ + /* tensor->name); */ + } + return; } GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); @@ -14016,6 +14268,82 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(false); } break; } + + // now report + // int64_t elem_dst = ggml_nelements(tensor); + + // const size_t size = ggml_nbytes(tensor)/sizeof(float); + + // for (size_t i = 0; i data))+i); + // if (i ==0) { + // ffirst2 = f; + // fmin2 = f; + // fmax2 = f; + // } + // fsum2 += f; + // if (f < fmin2){ + // fmin2 = f; + // } + // if (f >fmax2){ + // fmax2 = f; + // } + // } + + if (tensor->src[1]) { + ggml_tensor_checksum(tensor->src[0]); + ggml_tensor_checksum(tensor->src[1]); + ggml_tensor_checksum(tensor); + + /* fprintf(stderr, "JSON:{\"bop\":\"%s\",\"src\":\"%s\",\"src2\":\"%s\",\"cnt1\":%ld,\"first1\":%f,\"max1\":%f,\"min1\":%f,\"sum1\":%f,\"cnt2\":%ld,\"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f,\"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f,\"cnt2\":%ld,\"dst\":\"%s\"}\n", */ + /* ggml_op_name_table[tensor->op], */ + /* tensor->src[0]->name, */ + /* tensor->src[1]->name, */ + /* elem_src, */ + /* ffirst0, */ + /* fmax0, */ + /* fmin0, */ + /* fsum0, */ + + /* elem_src1, */ + /* ffirst1, */ + /* fmax1, */ + /* fmin1, */ + /* fsum1, */ + + /* ffirst2, */ + /* fmax2, */ + /* fmin2, */ + /* fsum2, */ + + /* elem_dst, */ + /* tensor->name); */ + + + } else { + ggml_tensor_checksum(tensor->src[0]); + ggml_tensor_checksum(tensor); + + /* fprintf(stderr, "JSON: { \"uop\":%d, \"src\":\"%s\", \"cnt1\":%ld, \"first1\":%f,\"max1\":%f,\"min1\":%f,\"sum1\":%f, \"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f,\"cnt2\":%ld,\"dst\":\"%s\"}\n", */ + /* tensor->op, */ + /* tensor->src[0]->name, */ + /* // src */ + /* elem_src, */ + /* ffirst0, */ + /* fmax0, */ + /* fmin0, */ + /* fsum0, */ + + /* // dest */ + /* ffirst2, */ + /* fmax2, */ + /* fmin2, */ + /* fsum2, */ + /* elem_dst, */ + /* tensor->name); */ + + } + } //////////////////////////////////////////////////////////////////////////////// @@ -14098,7 +14426,7 @@ static struct ggml_hash_set ggml_hash_set_new(size_t size) { size = ggml_hash_size(size); struct ggml_hash_set result; result.size = size; - result.keys = 
malloc(sizeof(struct ggml_tensor *) * size); + result.keys = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * size); memset(result.keys, 0, sizeof(struct ggml_tensor *) * size); return result; } @@ -14107,15 +14435,11 @@ static void ggml_hash_set_free(struct ggml_hash_set hash_set) { free(hash_set.keys); } -struct hash_map { - struct ggml_hash_set set; - struct ggml_tensor ** vals; -}; static struct hash_map * ggml_new_hash_map(size_t size) { - struct hash_map * result = malloc(sizeof(struct hash_map)); + struct hash_map * result = (hash_map *)malloc(sizeof(struct hash_map)); result->set = ggml_hash_set_new(size); - result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size); + result->vals = (ggml_tensor **)malloc(sizeof(struct ggml_tensor *) * result->set.size); memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size); return result; } @@ -15236,19 +15560,19 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *)); - *cgraph = (struct ggml_cgraph) { - /*.size =*/ size, - /*.n_nodes =*/ 0, - /*.n_leafs =*/ 0, - /*.nodes =*/ nodes_ptr, - /*.grads =*/ grads_ptr, - /*.leafs =*/ leafs_ptr, - /*.hash_table =*/ { hash_size, hash_keys_ptr }, - /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, - /*.perf_runs =*/ 0, - /*.perf_cycles =*/ 0, - /*.perf_time_us =*/ 0, - }; + (*cgraph).size = size; + (*cgraph).n_nodes = 0; + (*cgraph).n_leafs = 0; + (*cgraph).nodes = nodes_ptr; + (*cgraph).grads = grads_ptr; + (*cgraph).leafs = leafs_ptr; + (*cgraph).visited_hash_table.size = hash_size; + (*cgraph).visited_hash_table.keys = hash_keys_ptr; + (*cgraph).order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT; + (*cgraph).perf_runs = 0; + (*cgraph).perf_cycles = 0; + (*cgraph).perf_time_us = 0; + return cgraph; } @@ -15262,19 +15586,22 @@ struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgra struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size); struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); - *cgraph = (struct ggml_cgraph) { - /*.size =*/ 0, - /*.n_nodes =*/ i1 - i0, - /*.n_leafs =*/ 0, - /*.nodes =*/ cgraph0->nodes + i0, - /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL, - /*.leafs =*/ NULL, - /*.hash_table =*/ { 0, NULL }, - /*.order =*/ cgraph0->order, - /*.perf_runs =*/ 0, - /*.perf_cycles =*/ 0, - /*.perf_time_us =*/ 0, - }; + // *cgraph = (struct ggml_cgraph) { + (*cgraph).size = 0; + (*cgraph).n_nodes = i1 - i0; + (*cgraph).n_leafs = 0; + (*cgraph).nodes = cgraph0->nodes + i0; + (*cgraph).grads = cgraph0->grads ? 
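The hash-set and hash-map allocations above gain casts because malloc returns void *, which C converts implicitly but C++ does not. A minimal sketch of the pattern with a stand-in element type (names are illustrative):

#include <cstdlib>
#include <cstring>

struct node { int v; };   // stand-in for struct ggml_tensor

static node ** new_key_table(size_t size) {
    // the explicit cast is what the patch adds at each malloc call site
    node ** keys = static_cast<node **>(malloc(sizeof(node *) * size));
    if (keys != nullptr) {
        memset(keys, 0, sizeof(node *) * size);
    }
    return keys;
}

int main() {
    node ** keys = new_key_table(16);
    const int ok = (keys != nullptr && keys[0] == nullptr) ? 0 : 1;
    free(keys);
    return ok;
}

The same reasoning applies to the calloc and realloc call sites later in the file.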
cgraph0->grads + i0 : NULL; + (*cgraph).leafs = NULL; + //(*cgraph).hash_table = { 0, NULL }; + (*cgraph).visited_hash_table.size = 0; + (*cgraph).visited_hash_table.keys = NULL; + + (*cgraph).order = cgraph0->order; + (*cgraph).perf_runs = 0; + (*cgraph).perf_cycles = 0; + (*cgraph).perf_time_us = 0; + // }; return cgraph; } @@ -15363,7 +15690,7 @@ typedef int ggml_lock_t; #define GGML_LOCK_INITIALIZER 0 -typedef pthread_t ggml_thread_t; + #define ggml_thread_create pthread_create #define ggml_thread_join pthread_join @@ -15397,6 +15724,7 @@ typedef pthread_t ggml_thread_t; #endif + // Android's libc implementation "bionic" does not support setting affinity #if defined(__linux__) && !defined(__BIONIC__) static void set_numa_thread_affinity(int thread_n, int n_threads) { @@ -15452,28 +15780,7 @@ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(threa static void clear_numa_thread_affinity(void) {} #endif -struct ggml_compute_state_shared { - const struct ggml_cgraph * cgraph; - const struct ggml_cplan * cplan; - - int64_t perf_node_start_cycles; - int64_t perf_node_start_time_us; - const int n_threads; - - // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node - - bool (*abort_callback)(void * data); // abort ggml_graph_compute when true - void * abort_callback_data; -}; - -struct ggml_compute_state { - ggml_thread_t thrd; - int ith; - struct ggml_compute_state_shared * shared; -}; static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; @@ -15726,11 +16033,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_FINALIZE, - /*.ith =*/ 0, - /*.nth =*/ 0, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, + .type = GGML_TASK_FINALIZE, + .ith = 0, + .nth = 0, + .wsize = cplan->work_size, + .wdata = cplan->work_data, }; if (node_n != -1) { @@ -15809,11 +16116,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { const int n_tasks = ggml_get_n_tasks(node, n_threads); struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_COMPUTE, - /*.ith =*/ state->ith, - /*.nth =*/ n_tasks, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, + .type = GGML_TASK_COMPUTE, + .ith = state->ith, + .nth = n_tasks, + .wsize = cplan->work_size, + .wdata = cplan->work_data, }; if (state->ith < n_tasks) { @@ -16034,18 +16341,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + struct ggml_compute_state * workers = (struct ggml_compute_state *)alloca(sizeof(struct ggml_compute_state)*n_threads); // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - }; + // workers[j] = (struct ggml_compute_state) { + workers[j].thrd = 0; + workers[j].ith = j; + workers[j].shared = &state_shared; - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + const int rc = ggml_thread_create(&workers[j].thrd, NULL, 
ggml_graph_compute_thread, &workers[j]); GGML_ASSERT(rc == 0); UNUSED(rc); } @@ -16125,29 +16431,29 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", - ggml_type_name(tensor->type), - ggml_op_name (tensor->op), - tensor->n_dims, - ne[0], ne[1], ne[2], ne[3], - nb[0], nb[1], nb[2], nb[3], - tensor->data, - tensor->name); + // fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", + // ggml_type_name(tensor->type), + // ggml_op_name (tensor->op), + // tensor->n_dims, + // ne[0], ne[1], ne[2], ne[3], + // nb[0], nb[1], nb[2], nb[3], + // tensor->data, + // tensor->name); } static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", - arg, - ggml_type_name(tensor->type), - ggml_op_name (tensor->op), - tensor->n_dims, - ne[0], ne[1], ne[2], ne[3], - nb[0], nb[1], nb[2], nb[3], - tensor->data, - tensor->name); + //fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", + // arg, + // ggml_type_name(tensor->type), + // ggml_op_name (tensor->op), + // tensor->n_dims, + // ne[0], ne[1], ne[2], ne[3], + // nb[0], nb[1], nb[2], nb[3], + // tensor->data, + // tensor->name); } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { @@ -16362,12 +16668,13 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * // create the data context { const size_t overhead = 1*ggml_tensor_overhead(); + GGML_ASSERT(0); + // FIXME + struct ggml_init_params params( + fsize + overhead, + NULL, + false); - struct ggml_init_params params = { - .mem_size = fsize + overhead, - .mem_buffer = NULL, - .no_alloc = false, - }; *ctx_data = ggml_init(params); @@ -16419,11 +16726,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * { const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false); - struct ggml_init_params params = { - .mem_size = size_eval + overhead, - .mem_buffer = NULL, - .no_alloc = true, - }; + struct ggml_init_params params( + size_eval + overhead, +NULL, + true); *ctx_eval = ggml_init(params); @@ -16631,7 +16937,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { continue; } - GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0); + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name((ggml_op)i), (double) perf_total_per_op_us[i] / 1000.0); } GGML_PRINT("========================================\n"); @@ -16903,11 +17209,11 @@ static enum ggml_opt_result ggml_opt_adam( const int n_accum = MAX(1, params.n_gradient_accumulation); const float accum_norm = 1.0f / (float) n_accum; - float * g = opt->adam.g->data; // gradients - float * m = opt->adam.m->data; // first moment - float * v = opt->adam.v->data; // second moment + float * g = (float*)opt->adam.g->data; // gradients + float * m = (float*)opt->adam.m->data; // first moment + float * v = (float*)opt->adam.v->data; // second moment - float * pf = params.past > 0 ? 
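ggml_graph_import above now builds its ggml_init_params through a three-argument constructor (flagged with GGML_ASSERT(0) and a FIXME) instead of a designated initializer; the constructor itself is added in ggml.h further down. A compact stand-in showing the shape of that API change, with illustrative names:

#include <cstddef>

// Stand-in for ggml_init_params after the patch: a positional constructor
// replaces C99 designated initializers (.mem_size = ..., ...).
struct init_params {
    init_params(size_t mem_size, void * mem_buffer, bool no_alloc)
        : mem_size(mem_size), mem_buffer(mem_buffer), no_alloc(no_alloc) {}
    init_params() : mem_size(0), mem_buffer(nullptr), no_alloc(false) {}

    size_t mem_size;
    void * mem_buffer;
    bool   no_alloc;
};

int main() {
    // positional construction, as used at the graph-import call site
    init_params params(4096 + 256, nullptr, false);
    return params.no_alloc ? 1 : 0;
}

Keeping a default constructor alongside the positional one preserves the declare-then-fill style that ggml_opt_init still uses.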
opt->adam.pf->data : NULL; // past function values + float * pf = params.past > 0 ? (float *)opt->adam.pf->data : NULL; // past function values struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); @@ -17085,12 +17391,6 @@ static enum ggml_opt_result ggml_opt_adam( // https://github.com/chokkan/liblbfgs // -struct ggml_lbfgs_iteration_data { - float alpha; - float ys; - float * s; - float * y; -}; static enum ggml_opt_result linesearch_backtracking( const struct ggml_opt_params * params, @@ -17175,7 +17475,7 @@ static enum ggml_opt_result linesearch_backtracking( } else { // Armijo condition is satisfied if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { - return count; + return (ggml_opt_result)count; } ggml_vec_dot_f32(nx, &dg, g, d); @@ -17186,14 +17486,14 @@ static enum ggml_opt_result linesearch_backtracking( } else { if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { // regular Wolfe conditions - return count; + return (ggml_opt_result)count; } if(dg > -params->lbfgs.wolfe*dginit) { width = dec; } else { // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) - return count; + return (ggml_opt_result)count; } } } @@ -17258,13 +17558,13 @@ static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - float * x = opt->lbfgs.x->data; // current parameters - float * xp = opt->lbfgs.xp->data; // previous parameters - float * g = opt->lbfgs.g->data; // current gradient - float * gp = opt->lbfgs.gp->data; // previous gradient - float * d = opt->lbfgs.d->data; // search direction + float * x = (float*)opt->lbfgs.x->data; // current parameters + float * xp = (float*)opt->lbfgs.xp->data; // previous parameters + float * g = (float*)opt->lbfgs.g->data; // current gradient + float * gp = (float*)opt->lbfgs.gp->data; // previous gradient + float * d = (float*)opt->lbfgs.d->data; // search direction - float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values + float * pf = params.past > 0 ? (float*)opt->lbfgs.pf->data : NULL; // past function values const int n_accum = MAX(1, params.n_gradient_accumulation); const float accum_norm = 1.0f / (float) n_accum; @@ -17277,10 +17577,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_opt_get_params(np, ps, x); // the L-BFGS memory - float * lm_alpha = opt->lbfgs.lmal->data; - float * lm_ys = opt->lbfgs.lmys->data; - float * lm_s = opt->lbfgs.lms->data; - float * lm_y = opt->lbfgs.lmy->data; + float * lm_alpha = (float*)opt->lbfgs.lmal->data; + float * lm_ys = (float*)opt->lbfgs.lmys->data; + float * lm_s = (float*)opt->lbfgs.lms->data; + float * lm_y = (float*)opt->lbfgs.lmy->data; bool cancel = false; @@ -17377,7 +17677,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, x, xp); ggml_vec_cpy_f32(nx, g, gp); - return ls; + return (ggml_opt_result)ls; } opt->loss_after = fx; @@ -17488,66 +17788,65 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { switch (type) { case GGML_OPT_ADAM: { - result = (struct ggml_opt_params) { - .type = GGML_OPT_ADAM, - .graph_size = GGML_DEFAULT_GRAPH_SIZE, - .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ? 
- .past = 0, - .delta = 1e-5f, - - .max_no_improvement = 100, - - .print_forward_graph = true, - .print_backward_graph = true, - - .n_gradient_accumulation = 1, - - .adam = { - .n_iter = 10000, - .sched = 1.000f, - .decay = 0.0f, - .decay_min_ndim = 2, - .alpha = 0.001f, - .beta1 = 0.9f, - .beta2 = 0.999f, - .eps = 1e-8f, - .eps_f = 1e-5f, - .eps_g = 1e-3f, - .gclip = 0.0f, - }, - }; - } break; - case GGML_OPT_LBFGS: - { - result = (struct ggml_opt_params) { - .type = GGML_OPT_LBFGS, - .graph_size = GGML_DEFAULT_GRAPH_SIZE, - .n_threads = 1, - .past = 0, - .delta = 1e-5f, - - .max_no_improvement = 0, - .print_forward_graph = true, - .print_backward_graph = true, + // result = (struct ggml_opt_params) { + result.type = GGML_OPT_ADAM; + result.graph_size = GGML_DEFAULT_GRAPH_SIZE; + result.n_threads = 1; // FIXME: GGML_DEFAULT_N_THREADS ? + result.past = 0; + result.delta = 1e-5f; - .n_gradient_accumulation = 1, + result.max_no_improvement = 100; - .lbfgs = { - .m = 6, - .n_iter = 100, - .max_linesearch = 20, + result.print_forward_graph = true; + result.print_backward_graph = true; - .eps = 1e-5f, - .ftol = 1e-4f, - .wolfe = 0.9f, - .min_step = 1e-20f, - .max_step = 1e+20f, + result.n_gradient_accumulation = 1; - .linesearch = GGML_LINESEARCH_DEFAULT, - }, - }; + // result.adam = { + result.adam.n_iter = 10000; + result.adam.sched = 1.000f; + result.adam.decay = 0.0f; + result.adam.decay_min_ndim = 2; + result.adam.alpha = 0.001f; + result.adam.beta1 = 0.9f; + result.adam.beta2 = 0.999f; + result.adam.eps = 1e-8f; + result.adam.eps_f = 1e-5f; + result.adam.eps_g = 1e-3f; + result.adam.gclip = 0.0f; + // }, + // }; } break; + case GGML_OPT_LBFGS: + break; + //{ + + // TODO FIXME + // result = (struct ggml_opt_params) { + result.type = GGML_OPT_LBFGS; + result.graph_size = GGML_DEFAULT_GRAPH_SIZE; + result.n_threads = 1; + result.past = 0; + result.delta = 1e-5f ; + result.max_no_improvement = 0; + result.print_forward_graph = true; + result.print_backward_graph = true; + result.n_gradient_accumulation = 1; + + result.lbfgs.m = 6; + result.lbfgs.n_iter = 100; + result.lbfgs.max_linesearch = 20; + result.lbfgs.eps = 1e-5f; + result.lbfgs.ftol = 1e-4f; + result.lbfgs.wolfe = 0.9f; + result.lbfgs.min_step = 1e-20f; + result.lbfgs.max_step = 1e+20f; + result.lbfgs.linesearch = GGML_LINESEARCH_DEFAULT; + + // } + //}; + //} break; } return result; @@ -17564,7 +17863,7 @@ GGML_API void ggml_opt_init( opt->nx = nx; opt->just_initialized = true; if (opt->ctx == NULL) { - struct ggml_init_params ctx_opt_params; + struct ggml_init_params ctx_opt_params; if (opt->params.type == GGML_OPT_ADAM) { ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; if (opt->params.past > 0) { @@ -17632,11 +17931,11 @@ enum ggml_opt_result ggml_opt( struct ggml_tensor * f) { bool free_ctx = false; if (ctx == NULL) { - struct ggml_init_params params_ctx = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - .no_alloc = false, - }; + struct ggml_init_params params_ctx;// = { + params_ctx.mem_size = 16*1024*1024; + params_ctx.mem_buffer = NULL; + params_ctx.no_alloc = false; + // }; ctx = ggml_init(params_ctx); if (ctx == NULL) { @@ -17718,7 +18017,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_0; for (int b = 0; b < n; b += k) { - block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; + block_q4_0 * __restrict__ y = (block_q4_0 *) dst + b/QK4_0; quantize_row_q4_0_reference(src + b, y, k); @@ -17741,7 
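ggml_opt_default_params above is converted the same way, but in the GGML_OPT_LBFGS case a bare break; sits immediately after the case label, so the member assignments that follow it are unreachable and the L-BFGS defaults are never applied. A sketch of the conversion with both branches reachable, on a simplified parameter struct (field names are illustrative):

#include <cstdio>

enum opt_type { OPT_ADAM, OPT_LBFGS };   // simplified stand-ins

struct opt_params {
    opt_type type;
    int      n_iter;
    float    eps;
};

static opt_params default_params(opt_type type) {
    opt_params result{};
    switch (type) {
        case OPT_ADAM: {
            result.type   = OPT_ADAM;
            result.n_iter = 10000;
            result.eps    = 1e-8f;
        } break;
        case OPT_LBFGS: {
            result.type   = OPT_LBFGS;
            result.n_iter = 100;
            result.eps    = 1e-5f;
        } break;   // break after the assignments, not before them
    }
    return result;
}

int main() {
    const opt_params p = default_params(OPT_LBFGS);
    printf("n_iter = %d\n", p.n_iter);   // 100, not 0
    return 0;
}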
+18040,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_1; for (int b = 0; b < n; b += k) { - block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; + block_q4_1 * __restrict__ y = (block_q4_1 *) dst + b/QK4_1; quantize_row_q4_1_reference(src + b, y, k); @@ -17764,7 +18063,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_0; for (int b = 0; b < n; b += k) { - block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; + block_q5_0 * __restrict__ y = (block_q5_0 *)dst + b/QK5_0; quantize_row_q5_0_reference(src + b, y, k); @@ -17794,7 +18093,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_1; for (int b = 0; b < n; b += k) { - block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; + block_q5_1 * __restrict__ y = (block_q5_1 *)dst + b/QK5_1; quantize_row_q5_1_reference(src + b, y, k); @@ -17824,7 +18123,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK8_0; for (int b = 0; b < n; b += k) { - block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; + block_q8_0 * __restrict__ y = (block_q8_0 *)dst + b/QK8_0; quantize_row_q8_0_reference(src + b, y, k); @@ -17923,110 +18222,41 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// -struct gguf_str { - uint64_t n; // GGUFv2 - char * data; -}; -static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = sizeof(uint8_t), - [GGUF_TYPE_INT8] = sizeof(int8_t), - [GGUF_TYPE_UINT16] = sizeof(uint16_t), - [GGUF_TYPE_INT16] = sizeof(int16_t), - [GGUF_TYPE_UINT32] = sizeof(uint32_t), - [GGUF_TYPE_INT32] = sizeof(int32_t), - [GGUF_TYPE_FLOAT32] = sizeof(float), - [GGUF_TYPE_BOOL] = sizeof(bool), - [GGUF_TYPE_STRING] = sizeof(struct gguf_str), - [GGUF_TYPE_UINT64] = sizeof(uint64_t), - [GGUF_TYPE_INT64] = sizeof(int64_t), - [GGUF_TYPE_FLOAT64] = sizeof(double), - [GGUF_TYPE_ARRAY] = 0, // undefined -}; -static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); - -static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = "u8", - [GGUF_TYPE_INT8] = "i8", - [GGUF_TYPE_UINT16] = "u16", - [GGUF_TYPE_INT16] = "i16", - [GGUF_TYPE_UINT32] = "u32", - [GGUF_TYPE_INT32] = "i32", - [GGUF_TYPE_FLOAT32] = "f32", - [GGUF_TYPE_BOOL] = "bool", - [GGUF_TYPE_STRING] = "str", - [GGUF_TYPE_ARRAY] = "arr", - [GGUF_TYPE_UINT64] = "u64", - [GGUF_TYPE_INT64] = "i64", - [GGUF_TYPE_FLOAT64] = "f64", -}; +//static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { + // [GGUF_TYPE_UINT8] = sizeof(uint8_t), + // [GGUF_TYPE_INT8] = sizeof(int8_t), + // [GGUF_TYPE_UINT16] = sizeof(uint16_t), + // [GGUF_TYPE_INT16] = sizeof(int16_t), + // [GGUF_TYPE_UINT32] = sizeof(uint32_t), + // [GGUF_TYPE_INT32] = sizeof(int32_t), + // [GGUF_TYPE_FLOAT32] = sizeof(float), + // [GGUF_TYPE_BOOL] = sizeof(bool), + // [GGUF_TYPE_STRING] = sizeof(struct gguf_str), + // [GGUF_TYPE_UINT64] = sizeof(uint64_t), + // [GGUF_TYPE_INT64] = sizeof(int64_t), + // [GGUF_TYPE_FLOAT64] = sizeof(double), + // [GGUF_TYPE_ARRAY] = 0, // undefined +//}; static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); -union gguf_value { - uint8_t uint8; - int8_t int8; - uint16_t uint16; - int16_t int16; - uint32_t uint32; - int32_t int32; - float float32; - uint64_t uint64; - int64_t int64; - double float64; - bool bool_; - - struct gguf_str str; - - 
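The quantize helpers switch from C99 restrict to the GNU __restrict__ keyword, and the GGUF_TYPE_SIZE table is commented out because C designated array initializers ([GGUF_TYPE_UINT8] = ...) are not valid C++. One possible C++ replacement for such a table is a constexpr switch, sketched here with a reduced, hypothetical set of scalar types:

#include <cstddef>
#include <cstdint>

enum gguf_scalar { GGUF_U8, GGUF_I8, GGUF_U32, GGUF_F32, GGUF_COUNT };

// constexpr switch instead of a designated-initializer array
constexpr size_t gguf_scalar_size(gguf_scalar t) {
    switch (t) {
        case GGUF_U8:  return sizeof(uint8_t);
        case GGUF_I8:  return sizeof(int8_t);
        case GGUF_U32: return sizeof(uint32_t);
        case GGUF_F32: return sizeof(float);
        default:       return 0;
    }
}

static_assert(gguf_scalar_size(GGUF_F32) == sizeof(float), "size table mismatch");

int main() { return static_cast<int>(gguf_scalar_size(GGUF_U8)) - 1; }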
struct { - enum gguf_type type; - - uint64_t n; // GGUFv2 - void * data; - } arr; -}; - -struct gguf_kv { - struct gguf_str key; - - enum gguf_type type; - union gguf_value value; -}; - -struct gguf_header { - char magic[4]; - uint32_t version; - uint64_t n_tensors; // GGUFv2 - uint64_t n_kv; // GGUFv2 -}; - -struct gguf_tensor_info { - struct gguf_str name; - - uint32_t n_dims; - uint64_t ne[GGML_MAX_DIMS]; - - enum ggml_type type; - - uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` - - // for writing API - const void * data; - size_t size; -}; -struct gguf_context { - struct gguf_header header; - - struct gguf_kv * kv; - struct gguf_tensor_info * infos; - - size_t alignment; - size_t offset; // offset of `data` from beginning of file - size_t size; // size of `data` in bytes + // [GGUF_TYPE_UINT8] = "u8", + // [GGUF_TYPE_INT8] = "i8", + // [GGUF_TYPE_UINT16] = "u16", + // [GGUF_TYPE_INT16] = "i16", + // [GGUF_TYPE_UINT32] = "u32", + // [GGUF_TYPE_INT32] = "i32", + // [GGUF_TYPE_FLOAT32] = "f32", + // [GGUF_TYPE_BOOL] = "bool", + // [GGUF_TYPE_STRING] = "str", + // [GGUF_TYPE_ARRAY] = "arr", + // [GGUF_TYPE_UINT64] = "u64", + // [GGUF_TYPE_INT64] = "i64", + // [GGUF_TYPE_FLOAT64] = "f64", +//}; +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); - //uint8_t * padding; - void * data; -}; static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { const size_t n = fread(dst, 1, size, file); @@ -18040,14 +18270,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { bool ok = true; - ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1); ok = ok && gguf_fread_el(file, p->data, p->n, offset); return ok; } struct gguf_context * gguf_init_empty(void) { - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; @@ -18092,7 +18322,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p bool ok = true; - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); // read the header { @@ -18124,12 +18354,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the kv pairs { - ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); + ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; - //fprintf(stderr, "%s: reading kv %d\n", __func__, i); + fprintf(stderr, "%s: reading kv %ld\n", __func__, i); ok = ok && gguf_fread_str(file, &kv->key, &offset); ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); @@ -18199,7 +18429,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the tensor infos { - ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -18283,11 +18513,11 @@ struct 
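gguf_fread_str above keeps the GGUF string layout (a 64-bit length followed by the bytes) and only adds the (char*) cast on calloc. A sketch of that read pattern in isolation; gguf_str_sketch and the file name are stand-ins, and the real reader also tracks a running offset:

#include <cstdio>
#include <cstdlib>
#include <cstdint>

struct gguf_str_sketch {
    uint64_t n;
    char *   data;
};

static bool read_str(FILE * f, gguf_str_sketch * s) {
    if (fread(&s->n, sizeof(s->n), 1, f) != 1) return false;
    // calloc(n + 1, 1) leaves the buffer NUL-terminated; C++ needs the cast
    s->data = static_cast<char *>(calloc(s->n + 1, 1));
    if (s->data == nullptr) return false;
    return fread(s->data, 1, s->n, f) == s->n;
}

int main() {
    FILE * f = fopen("header.bin", "rb");   // hypothetical input file
    if (!f) return 0;
    gguf_str_sketch s{};
    const bool ok = read_str(f, &s);
    free(s.data);
    fclose(f);
    return ok ? 0 : 1;
}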
gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p (ctx->header.n_tensors )*ggml_tensor_overhead() : (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size; - struct ggml_init_params pdata = { - .mem_size = mem_size, - .mem_buffer = NULL, - .no_alloc = params.no_alloc, - }; + // FIXME + struct ggml_init_params pdata( + mem_size, + NULL, + params.no_alloc); *params.ctx = ggml_init(pdata); @@ -18319,10 +18549,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // create the tensors for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { const int64_t ne[GGML_MAX_DIMS] = { - ctx->infos[i].ne[0], - ctx->infos[i].ne[1], - ctx->infos[i].ne[2], - ctx->infos[i].ne[3], + (int64_t)ctx->infos[i].ne[0], + (int64_t)ctx->infos[i].ne[1], + (int64_t)ctx->infos[i].ne[2], + (int64_t)ctx->infos[i].ne[3], }; struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); @@ -18603,7 +18833,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int n_kv = gguf_get_n_kv(ctx); - ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); ctx->kv[n_kv].key.n = strlen(key); ctx->kv[n_kv].key.data = strdup(key); ctx->header.n_kv++; @@ -18739,7 +18969,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { case GGUF_TYPE_ARRAY: { if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { - const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *)); for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; } @@ -18760,7 +18990,7 @@ void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { const int idx = ctx->header.n_tensors; - ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); ctx->infos[idx].name.n = strlen(tensor->name); ctx->infos[idx].name.data = strdup(tensor->name); @@ -18819,11 +19049,6 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo // fwrite(val, sizeof(char), size, file); //} -struct gguf_buf { - void * data; - size_t size; - size_t offset; -}; static struct gguf_buf gguf_buf_init(size_t size) { struct gguf_buf buf = { diff --git a/ggml.h b/ggml.h index f2fce0f22d357..66a6b65bc5a82 100644 --- a/ggml.h +++ b/ggml.h @@ -1,5 +1,6 @@ #pragma once +#include // // GGML Tensor Library // @@ -285,7 +286,7 @@ GGML_UNUSED(prefix##3); #ifdef __cplusplus -extern "C" { +//extern "C" { #endif #if defined(__ARM_NEON) && defined(__CUDACC__) @@ -465,7 +466,7 @@ extern "C" { }; // ggml object - struct ggml_object { + struct ggml_object : refl::attr::usage::type { size_t offs; size_t size; @@ -479,7 +480,7 @@ extern "C" { static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); // n-dimensional tensor - struct ggml_tensor { + struct ggml_tensor : refl::attr::usage::type{ enum ggml_type type; enum ggml_backend_type backend; @@ -524,7 +525,7 @@ extern "C" { // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 - struct ggml_cplan { + struct ggml_cplan : refl::attr::usage::type{ size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` uint8_t * work_data; // 
work buffer, to be allocated by caller before calling to `ggml_graph_compute()` @@ -541,13 +542,13 @@ extern "C" { GGML_CGRAPH_EVAL_ORDER_COUNT }; - struct ggml_hash_set { + struct ggml_hash_set : refl::attr::usage::type{ size_t size; struct ggml_tensor ** keys; }; // computation graph - struct ggml_cgraph { + struct ggml_cgraph : refl::attr::usage::type{ int size; int n_nodes; int n_leafs; @@ -567,13 +568,31 @@ extern "C" { }; // scratch buffer - struct ggml_scratch { + struct ggml_scratch : refl::attr::usage::type{ size_t offs; size_t size; void * data; + + ggml_scratch() + : offs(0), + size(0), + data(0) + {} }; - struct ggml_init_params { + struct ggml_init_params : refl::attr::usage::type{ + + ggml_init_params(size_t mem_size, + void * mem_buffer, + bool no_alloc): + mem_size( mem_size), + mem_buffer(mem_buffer), + no_alloc(no_alloc){} + ggml_init_params(): + mem_size(0), + mem_buffer(0), + no_alloc(0){} + // memory pool size_t mem_size; // bytes void * mem_buffer; // if NULL, memory will be allocated internally @@ -591,7 +610,7 @@ extern "C" { GGML_TASK_FINALIZE, }; - struct ggml_compute_params { + struct ggml_compute_params : refl::attr::usage::type{ enum ggml_task_type type; // ith = thread index, nth = number of threads @@ -1829,7 +1848,7 @@ extern "C" { // // see ggml.c (ggml_opt_default_params) for default values // - struct ggml_opt_params { + struct ggml_opt_params : refl::attr::usage::type{ enum ggml_opt_type type; size_t graph_size; @@ -1859,7 +1878,7 @@ extern "C" { int n_gradient_accumulation; // ADAM parameters - struct { + struct ggml_adam: refl::attr::usage::type{ int n_iter; float sched; // schedule multiplier (fixed, decay or warmup) @@ -1875,7 +1894,7 @@ extern "C" { } adam; // LBFGS parameters - struct { + struct ggml_lbfgs: refl::attr::usage::type{ int m; // number of corrections to approximate the inv. 
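The header changes above give ggml_scratch and ggml_init_params user-declared constructors (and derive the structs from a reflection attribute type whose #include target was lost in extraction). Once a struct has a constructor it is no longer an aggregate, which is exactly why the .cpp side had to drop its compound-literal initializers. A minimal illustration with a stand-in struct:

#include <cstddef>

struct scratch {
    scratch() : offs(0), size(0), data(nullptr) {}   // zeroing default ctor

    size_t offs;
    size_t size;
    void * data;
};

int main() {
    scratch s;                 // all fields zeroed by the default constructor
    // scratch t = { .offs = 0, .size = 0, .data = nullptr };  // no longer valid: non-aggregate
    return s.data == nullptr ? 0 : 1;
}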
Hessian int n_iter; int max_linesearch; @@ -1890,7 +1909,7 @@ extern "C" { } lbfgs; }; - struct ggml_opt_context { + struct ggml_opt_context : refl::attr::usage::type{ struct ggml_context * ctx; struct ggml_opt_params params; @@ -1902,7 +1921,7 @@ extern "C" { float loss_before; float loss_after; - struct { + struct ggml_grad : refl::attr::usage::type{ struct ggml_tensor * g; // current gradient struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment @@ -1912,7 +1931,7 @@ extern "C" { int n_no_improvement; } adam; - struct { + struct ggml_params : refl::attr::usage::type{ struct ggml_tensor * x; // current parameters struct ggml_tensor * xp; // previous parameters struct ggml_tensor * g; // current gradient @@ -2005,7 +2024,9 @@ extern "C" { struct gguf_context; - struct gguf_init_params { + struct gguf_init_params : refl::attr::usage::type{ + gguf_init_params(bool no_alloc, struct ggml_context ** ctx): no_alloc(no_alloc),ctx(ctx){} + bool no_alloc; // if not NULL, create a ggml_context and allocate the tensor data in it @@ -2142,7 +2163,7 @@ extern "C" { typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); - typedef struct { + typedef struct ggml_something : refl::attr::usage::type{ const char * type_name; int blck_size; size_t type_size; @@ -2157,5 +2178,5 @@ extern "C" { GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); #ifdef __cplusplus -} +//} #endif diff --git a/llama-internal.hpp b/llama-internal.hpp new file mode 100644 index 0000000000000..33cf39e5d4f58 --- /dev/null +++ b/llama-internal.hpp @@ -0,0 +1,896 @@ +#include +#include +enum llm_arch { + LLM_ARCH_LLAMA, + LLM_ARCH_FALCON, + LLM_ARCH_BAICHUAN, + LLM_ARCH_GPT2, + LLM_ARCH_GPTJ, + LLM_ARCH_GPTNEOX, + LLM_ARCH_MPT, + LLM_ARCH_STARCODER, + LLM_ARCH_PERSIMMON, + LLM_ARCH_REFACT, + LLM_ARCH_BLOOM, + LLM_ARCH_STABLELM, + LLM_ARCH_UNKNOWN, +}; + +enum llm_kv { + LLM_KV_GENERAL_ARCHITECTURE, + LLM_KV_GENERAL_QUANTIZATION_VERSION, + LLM_KV_GENERAL_ALIGNMENT, + LLM_KV_GENERAL_NAME, + LLM_KV_GENERAL_AUTHOR, + LLM_KV_GENERAL_URL, + LLM_KV_GENERAL_DESCRIPTION, + LLM_KV_GENERAL_LICENSE, + LLM_KV_GENERAL_SOURCE_URL, + LLM_KV_GENERAL_SOURCE_HF_REPO, + + LLM_KV_CONTEXT_LENGTH, + LLM_KV_EMBEDDING_LENGTH, + LLM_KV_BLOCK_COUNT, + LLM_KV_FEED_FORWARD_LENGTH, + LLM_KV_USE_PARALLEL_RESIDUAL, + LLM_KV_TENSOR_DATA_LAYOUT, + + LLM_KV_ATTENTION_HEAD_COUNT, + LLM_KV_ATTENTION_HEAD_COUNT_KV, + LLM_KV_ATTENTION_MAX_ALIBI_BIAS, + LLM_KV_ATTENTION_CLAMP_KQV, + LLM_KV_ATTENTION_LAYERNORM_EPS, + LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + + LLM_KV_ROPE_DIMENSION_COUNT, + LLM_KV_ROPE_FREQ_BASE, + LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, + + LLM_KV_TOKENIZER_MODEL, + LLM_KV_TOKENIZER_LIST, + LLM_KV_TOKENIZER_TOKEN_TYPE, + LLM_KV_TOKENIZER_SCORES, + LLM_KV_TOKENIZER_MERGES, + LLM_KV_TOKENIZER_BOS_ID, + LLM_KV_TOKENIZER_EOS_ID, + LLM_KV_TOKENIZER_UNK_ID, + LLM_KV_TOKENIZER_SEP_ID, + LLM_KV_TOKENIZER_PAD_ID, + LLM_KV_TOKENIZER_ADD_BOS, + LLM_KV_TOKENIZER_ADD_EOS, + LLM_KV_TOKENIZER_HF_JSON, + LLM_KV_TOKENIZER_RWKV, +}; + +// available llama models +enum e_model { + MODEL_UNKNOWN, + MODEL_1B, + MODEL_3B, + MODEL_7B, + MODEL_8B, + MODEL_13B, + MODEL_15B, + MODEL_30B, + MODEL_34B, + MODEL_40B, + MODEL_65B, + MODEL_70B, +}; + +enum 
llama_fver { + GGUF_FILE_VERSION_V1 = 1, + GGUF_FILE_VERSION_V2 = 2, + GGUF_FILE_VERSION_V3 = 3, +}; + +struct LLM_KV { + LLM_KV(llm_arch arch) : arch(arch) {} + + llm_arch arch; + + std::string operator()(llm_kv kv) const; // moved to llama.cpp file + +}; + +enum llm_tensor { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_POS_EMBD, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, +}; + + +struct llama_cparams { + uint32_t n_ctx; // context size used during inference + uint32_t n_batch; + uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads_batch; // number of threads to use for batch processing + + float rope_freq_base; + float rope_freq_scale; + + uint32_t n_yarn_orig_ctx; + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. + float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; + + bool mul_mat_q; +}; + +struct llama_layer { + // normalization + struct ggml_tensor * attn_norm; + struct ggml_tensor * attn_norm_b; + struct ggml_tensor * attn_norm_2; + struct ggml_tensor * attn_norm_2_b; + struct ggml_tensor * attn_q_norm; + struct ggml_tensor * attn_q_norm_b; + struct ggml_tensor * attn_k_norm; + struct ggml_tensor * attn_k_norm_b; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + struct ggml_tensor * wqkv; + + // attention bias + struct ggml_tensor * bo; + struct ggml_tensor * bqkv; + + // normalization + struct ggml_tensor * ffn_norm; + struct ggml_tensor * ffn_norm_b; + + // ff + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 + + // ff bias + struct ggml_tensor * ffn_down_b; // b2 + struct ggml_tensor * ffn_up_b; // b3 +}; + +struct llama_kv_cell { + llama_pos pos = -1; + llama_pos delta = 0; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } +}; + +struct llama_buffer { + void * data = NULL; + size_t size = 0; + + // fallback to malloc / free + // useful in cases where CUDA can try to allocate PINNED memory + bool fallback = false; + + void resize(size_t n) ; + + + ~llama_buffer(); + +}; + +// ring-buffer of cached KV data +struct llama_kv_cache { + bool has_shift = false; + + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. llama_decode_internal also uses it, so it + // cannot be freely changed after a slot has been allocated. 
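llama_kv_cell above stores the set of sequence ids that reference a cache cell; the template arguments on std::set (and on the other containers in this header) were stripped during extraction, so the element type is assumed here to be an integer llama_seq_id. A self-contained sketch of the membership test:

#include <cstdint>
#include <set>

using seq_id_t = int32_t;   // assumed element type (llama_seq_id)

struct kv_cell {
    int32_t pos   = -1;
    int32_t delta = 0;
    std::set<seq_id_t> seq_id;

    bool has_seq_id(seq_id_t id) const {
        return seq_id.find(id) != seq_id.end();
    }
};

int main() {
    kv_cell cell;
    cell.seq_id.insert(0);
    return cell.has_seq_id(0) ? 0 : 1;
}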
+ uint32_t head = 0; + uint32_t size = 0; + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + + struct ggml_tensor * k = NULL; + struct ggml_tensor * v = NULL; + + struct ggml_context * ctx = NULL; + + llama_buffer buf; + + ~llama_kv_cache() { + if (ctx) { + ggml_free(ctx); + } + +#ifdef GGML_USE_CUBLAS + if (ggml_cublas_loaded()) { + ggml_cuda_free_data(k); + ggml_cuda_free_data(v); + } +#endif + } +}; + +struct llama_vocab { + using id = int32_t; + using token = std::string; + using ttype = llama_token_type; + + struct token_data { + token text; + float score; + ttype type; + }; + + enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; + + std::unordered_map token_to_id; + std::vector id_to_token; + + std::unordered_map special_tokens_cache; + + std::map, int> bpe_ranks; + + // default LLaMA special tokens + id special_bos_id = 1; + id special_eos_id = 2; + id special_unk_id = 0; + id special_sep_id = -1; + id special_pad_id = -1; + + int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. + int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. + + id linefeed_id = 13; + id special_prefix_id = 32007; + id special_middle_id = 32009; + id special_suffix_id = 32008; + id special_eot_id = 32010; + + int find_bpe_rank(std::string token_left, std::string token_right) const { + GGML_ASSERT(token_left.find(" ") == std::string::npos); + GGML_ASSERT(token_left.find("\n") == std::string::npos); + GGML_ASSERT(token_right.find(" ") == std::string::npos); + GGML_ASSERT(token_right.find("\n") == std::string::npos); + + auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); + if (it == bpe_ranks.end()) { + return -1; + } + + return it->second; + } +}; + +struct llama_mmap { + void * addr; + size_t size; + + llama_mmap(const llama_mmap &) = delete; + + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false); + ~llama_mmap(); + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; +#else + static constexpr bool SUPPORTED = false; +#endif +}; + + +struct llama_hparams { + bool vocab_only; + uint32_t n_vocab; + uint32_t n_ctx_train; // context size the model was trained on + uint32_t n_embd; + uint32_t n_head; + uint32_t n_head_kv; + uint32_t n_layer; + uint32_t n_rot; + uint32_t n_ff; + + float f_norm_eps; + float f_norm_rms_eps; + + float rope_freq_base_train; + float rope_freq_scale_train; + uint32_t n_yarn_orig_ctx; + int8_t rope_scaling_type_train : 3; + bool rope_finetuned : 1; + + float f_clamp_kqv; + float f_max_alibi_bias; + + bool operator!=(const llama_hparams & other) const; + uint32_t n_gqa() const { + return n_head/n_head_kv; + } + + uint32_t n_embd_head() const { + return n_embd/n_head; + } + + uint32_t n_embd_gqa() const { + return n_embd/n_gqa(); + } +}; + +struct llama_mlock { + void * addr = NULL; + size_t size = 0; + bool failed_already = false; + llama_mlock() ; + + llama_mlock(const llama_mlock &) = delete; + ~llama_mlock(); + void init(void * ptr); + void grow_to(size_t target_size); +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + static size_t lock_granularity(); +#ifdef __APPLE__ +#define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. 
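llama_vocab::find_bpe_rank above looks up a merge rank keyed by a (left, right) token pair and returns -1 when the pair is unknown; the map's template arguments were likewise lost in extraction and are assumed here. A free-function sketch of the same lookup:

#include <map>
#include <string>
#include <utility>

static int find_bpe_rank(
        const std::map<std::pair<std::string, std::string>, int> & bpe_ranks,
        const std::string & left, const std::string & right) {
    const auto it = bpe_ranks.find(std::make_pair(left, right));
    return it == bpe_ranks.end() ? -1 : it->second;   // -1 means "no such merge"
}

int main() {
    std::map<std::pair<std::string, std::string>, int> ranks;
    ranks[{"he", "llo"}] = 0;
    return find_bpe_rank(ranks, "he", "llo");   // 0
}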
Also try increasing RLIMIT_MLOCK (ulimit -l).\n" +#else +#define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" +#endif + bool raw_lock(const void * addr, size_t size) const ; +#undef MLOCK_SUGGESTION + static void raw_unlock(void * addr, size_t size); +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + static size_t lock_granularity(); + bool raw_lock(void * ptr, size_t len) const ; + static void raw_unlock(void * ptr, size_t len); +#else + static constexpr bool SUPPORTED = false; + static size_t lock_granularity(); + bool raw_lock(const void * addr, size_t len) const; + static void raw_unlock(const void * addr, size_t len); +#endif +}; + + +struct llama_model { + e_model type = MODEL_UNKNOWN; + llm_arch arch = LLM_ARCH_UNKNOWN; + llama_ftype ftype = LLAMA_FTYPE_ALL_F32; + + std::string name = "n/a"; + + llama_hparams hparams = {}; + llama_vocab vocab; + + struct ggml_tensor * tok_embd; + struct ggml_tensor * pos_embd; + struct ggml_tensor * tok_norm; + struct ggml_tensor * tok_norm_b; + + struct ggml_tensor * output_norm; + struct ggml_tensor * output_norm_b; + struct ggml_tensor * output; + + std::vector layers; + + int n_gpu_layers; + + // gguf metadata + std::unordered_map gguf_kv; + + // context + struct ggml_context * ctx = NULL; + + // the model memory buffer + llama_buffer buf; + + // model memory mapped file + std::unique_ptr mapping; + + // objects representing data potentially being locked in memory + llama_mlock mlock_buf; + llama_mlock mlock_mmap; + + // for quantize-stats only + std::vector> tensors_by_name; + + int64_t t_load_us = 0; + int64_t t_start_us = 0; + + ~llama_model() { + if (ctx) { + ggml_free(ctx); + } + +#ifdef GGML_USE_CUBLAS + if (ggml_cublas_loaded()) { + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + ggml_cuda_free_data(tensors_by_name[i].second); + } + ggml_cuda_free_scratch(); + } +#endif + +#if defined(GGML_USE_CLBLAST) + for (size_t i = 0; i < tensors_by_name.size(); ++i) { + ggml_cl_free_data(tensors_by_name[i].second); + } +#endif + } +}; + +struct llama_context { + llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} + ~llama_context(); + + llama_cparams cparams; + + const llama_model & model; + + // key + value cache for the self attention + struct llama_kv_cache kv_self; + + std::mt19937 rng; + + bool has_evaluated_once = false; + + int64_t t_start_us; + int64_t t_load_us; + int64_t t_sample_us = 0; + int64_t t_p_eval_us = 0; + int64_t t_eval_us = 0; + + int32_t n_sample = 0; // number of tokens sampled + int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) + int32_t n_eval = 0; // number of eval calls + + // decode output (2-dimensional array: [n_tokens][n_vocab]) + std::vector logits; + bool logits_all = false; + + // input embedding (1-dimensional array: [n_embd]) + std::vector embedding; + + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; + + // memory buffers used to evaluate the model + llama_buffer buf_compute; + + llama_buffer buf_alloc; + ggml_allocr * alloc = NULL; + +#ifdef GGML_USE_METAL + ggml_metal_context * ctx_metal = NULL; +#endif + +#ifdef GGML_USE_MPI + ggml_mpi_context * ctx_mpi = NULL; +#endif +}; + + +struct LLM_TN { + LLM_TN(llm_arch arch) ; + + llm_arch arch; + + std::string operator()(llm_tensor tensor) const; + + std::string operator()(llm_tensor tensor, const std::string & suffix) const ; + + std::string operator()(llm_tensor 
tensor, int bid) const ; + + std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ; + +}; + + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) ; + size_t tell() const; + void seek(size_t offset, int whence) const; + void read_raw(void * ptr, size_t len) const; + uint32_t read_u32() const; + void write_raw(const void * ptr, size_t len) const ; + void write_u32(std::uint32_t val) const; + ~llama_file(); + +}; + + +struct llama_state { + llama_state(); + // We save the log callback globally + ggml_log_callback log_callback; + void * log_callback_user_data = nullptr; +}; + + + +struct llama_model_loader { + int n_kv = 0; + int n_tensors = 0; + int n_created = 0; + + int64_t n_elements = 0; + size_t n_bytes = 0; + + bool use_mmap = false; + + llama_file file; + llama_ftype ftype; + llama_fver fver; + + std::unique_ptr mapping; + + struct gguf_context * ctx_gguf = NULL; + struct ggml_context * ctx_meta = NULL; + + llama_model_loader(const std::string & fname, bool use_mmap) ; + + ~llama_model_loader(); + + std::string get_arch_name() const; + + enum llm_arch get_arch() const ; + const char * get_tensor_name(int i) const; + + struct ggml_tensor * get_tensor_meta(int i) const; + + void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const; + + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) ; + + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend) ; + + void done_getting_tensors() const; + + size_t file_offset(const char * name) const; + + + void load_data_for(struct ggml_tensor * cur) const ; + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) ; +}; + +struct llama_data_context { + virtual void write(const void * src, size_t size) = 0; + virtual size_t get_size_written() = 0; + virtual ~llama_data_context() = default; +}; + +struct llama_data_buffer_context : llama_data_context { + uint8_t * ptr; + size_t size_written = 0; + llama_data_buffer_context(uint8_t * p) ; + void write(const void * src, size_t size) override ; + size_t get_size_written() override ; +}; + +struct llama_data_file_context : llama_data_context { + llama_file * file; + size_t size_written = 0; + llama_data_file_context(llama_file * f); + size_t get_size_written() override ; + void write(const void * src, size_t size); +}; + + +struct llama_beam { + std::vector tokens; + float p; // Cumulative beam probability (renormalized relative to all beams) + bool eob; // Initialize end-of-beam to false. Callback sets this to true. + // Sort beams by probability. In case of ties, prefer beams at eob. + bool operator<(const llama_beam & rhs) const ; + void shift_tokens(const size_t n) ; + llama_beam_view view() const; +}; + +// A struct for calculating logit-related info. 
+struct llama_logit_info { + const float * const logits; + const int n_vocab; + const float max_l; + const float normalizer; + struct sum_exp { + float max_l; + float operator()(float sum, float l) const { return sum + std::exp(l - max_l); } + }; + llama_logit_info(llama_context * ctx); + llama_token_data get_token_data(const llama_token token_id) const ; + std::vector top_k(size_t k) ; + float probability_from_logit(float logit) const ; +}; + + +struct llama_beam_search_data { + llama_context * ctx; + size_t n_beams; + int n_past; + int n_predict; + std::vector beams; + std::vector next_beams; + size_t common_prefix_length; + std::vector beam_views; + llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict); + void collapse_beams(const size_t beam_idx) ; + void fill_next_beams_by_top_probabilities(llama_beam & beam) ; + size_t find_common_prefix_length() ; + llama_beams_state get_beams_state(const bool last_call) ; + void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data); + static void renormalize_beam_probabilities(std::vector & beams) ; + size_t top_beam_index(); + void update_beams_from_beam_views(); +}; + +using llm_build_cb = std::function; + +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; + +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; + +struct llm_build_context { + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_batch & batch; + const llama_kv_cache & kv_self; + + const int64_t n_embd; + const int64_t n_layer; + const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t n_head; + const int64_t n_head_kv; + const int64_t n_embd_head; + const int64_t n_embd_gqa; + + const float freq_base; + const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; + const float norm_eps; + const float norm_rms_eps; + + const int32_t n_tokens; + const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t kv_head; // index of where we store new KV data in the cache + const int32_t n_orig_ctx; + + const bool do_rope_shift; + + const llm_build_cb & cb; + + llama_buffer & buf_compute; + + struct ggml_context * ctx0 = nullptr; + + // TODO: consider making the entire interface noexcept + llm_build_context( + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case); + + void init() ; + void free() ; + struct ggml_cgraph * build_llama() ; + struct ggml_cgraph * build_baichuan() ; + struct ggml_cgraph * build_falcon() ; + struct ggml_cgraph * build_starcoder() ; + struct ggml_cgraph * build_persimmon() ; + struct ggml_cgraph * build_refact() ; + struct ggml_cgraph * build_bloom() ; + struct ggml_cgraph * build_mpt() ; + struct ggml_cgraph * build_stablelm(); +}; + + +enum llm_offload_func_e { + OFFLOAD_FUNC_NOP, + OFFLOAD_FUNC, + OFFLOAD_FUNC_KQ, + OFFLOAD_FUNC_V, + OFFLOAD_FUNC_NR, + OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_OUT, +}; + +struct llm_offload_trie { + struct node { + ~node() ; + node * children[256] = { nullptr }; + llm_offload_func_e func = OFFLOAD_FUNC_NOP; + }; + node * root = nullptr; + llm_offload_trie(); + llm_offload_trie(const std::unordered_map & map) ; + ~llm_offload_trie(); + void 
add(const char * name, llm_offload_func_e func); + llm_offload_func_e find(const char * name) const; + +}; + +struct llm_symbol { + using index = int; + index prev; + index next; + const char * text; + size_t n; +}; + + +struct llm_bigram_spm { + struct comparator { + bool operator()(llm_bigram_spm & l, llm_bigram_spm & r); + }; + using queue_storage = std::vector; + using queue = std::priority_queue; + llm_symbol::index left; + llm_symbol::index right; + float score; + size_t size; +}; + +struct llm_tokenizer_spm { + llm_tokenizer_spm(const llama_vocab & vocab); + void tokenize(const std::string & text, std::vector & output); + + +private: + void resegment(llm_symbol & symbol, std::vector & output) ; + void try_add_bigram(int left, int right) ; + const llama_vocab & vocab; + + std::vector symbols; + llm_bigram_spm::queue work_queue; + + std::map> rev_merge; +}; + +// BPE tokenizer +// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] +// tried to simplify unicode stuff, so most likely does not work 100% correctly! + +// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused + +struct llm_bigram_bpe { + struct comparator { + bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const ; + }; + + using queue_storage = std::vector; + using queue = std::priority_queue; + llm_symbol::index left; + llm_symbol::index right; + std::string text; + int rank; + size_t size; +}; + +struct llm_tokenizer_bpe { + llm_tokenizer_bpe(const llama_vocab & vocab); + + void tokenize(const std::string & text, std::vector & output); + +private: + void add_new_bigram(int left, int right) ; + + std::vector bpe_gpt2_preprocess(const std::string & text) ; + + const llama_vocab & vocab; + + std::vector symbols; + std::vector symbols_final; + + llm_bigram_bpe::queue work_queue; +}; + +typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{ + FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, + FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT +} FRAGMENT_BUFFER_VARIANT_TYPE; + +struct fragment_buffer_variant{ + fragment_buffer_variant(llama_vocab::id _token); + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length); + const FRAGMENT_BUFFER_VARIANT_TYPE type; + const llama_vocab::id token; + const std::string _dummy; + const std::string & raw_text; + const uint64_t offset; + const uint64_t length; +}; + +struct llama_partial_utf8 { + uint32_t value; // bit value so far (unshifted) + int n_remain; // num bytes remaining; -1 indicates invalid sequence +}; + +struct llama_grammar { + const std::vector> rules; + std::vector> stacks; + + // buffer for partially generated UTF-8 sequence from accepted tokens + llama_partial_utf8 partial_utf8; +}; + +struct llama_grammar_candidate { + size_t index; + const uint32_t * code_points; + llama_partial_utf8 partial_utf8; +}; + +struct quantize_state_internal { + const llama_model & model; + const llama_model_quantize_params * params; + + int n_attention_wv = 0; + int n_feed_forward_w2 = 0; + int i_attention_wv = 0; + int i_feed_forward_w2 = 0; + + int n_k_quantized = 0; + int n_fallback = 0; + + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) + : model(model) + , params(params) + {} +}; diff --git a/llama.cpp b/llama.cpp index c2ad048699472..5682234e7b490 100644 --- a/llama.cpp +++ b/llama.cpp @@ -29,20 +29,20 @@ #ifdef __has_include #if __has_include() - #include - #if defined(_POSIX_MAPPED_FILES) - #include - #endif - #if defined(_POSIX_MEMLOCK_RANGE) - #include - 
#endif + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #endif + #if defined(_POSIX_MEMLOCK_RANGE) + #include + #endif #endif #endif #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX - #define NOMINMAX + #define NOMINMAX #endif #include #include @@ -77,6 +77,8 @@ #include #include +#include "llama-internal.hpp" + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif @@ -118,13 +120,13 @@ static size_t utf8_len(char src) { static void replace_all(std::string & s, const std::string & search, const std::string & replace) { std::string result; for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; } s = std::move(result); } @@ -132,17 +134,17 @@ static void replace_all(std::string & s, const std::string & search, const std:: static bool is_float_close(float a, float b, float abs_tol) { // Check for non-negative tolerance if (abs_tol < 0.0) { - throw std::invalid_argument("Tolerance must be non-negative"); + throw std::invalid_argument("Tolerance must be non-negative"); } // Exact equality check if (a == b) { - return true; + return true; } // Check for infinities if (std::isinf(a) || std::isinf(b)) { - return false; + return false; } // Regular comparison using the provided absolute tolerance @@ -156,7 +158,7 @@ static bool is_float_close(float a, float b, float abs_tol) { static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { - file.write(&zero, 1); + file.write(&zero, 1); } } @@ -176,25 +178,11 @@ static std::string format(const char * fmt, ...) 
{ return std::string(buf.data(), size); } + // // gguf constants (sync with gguf.py) // -enum llm_arch { - LLM_ARCH_LLAMA, - LLM_ARCH_FALCON, - LLM_ARCH_BAICHUAN, - LLM_ARCH_GPT2, - LLM_ARCH_GPTJ, - LLM_ARCH_GPTNEOX, - LLM_ARCH_MPT, - LLM_ARCH_STARCODER, - LLM_ARCH_PERSIMMON, - LLM_ARCH_REFACT, - LLM_ARCH_BLOOM, - LLM_ARCH_STABLELM, - LLM_ARCH_UNKNOWN, -}; static std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, @@ -211,55 +199,6 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_STABLELM, "stablelm" }, }; -enum llm_kv { - LLM_KV_GENERAL_ARCHITECTURE, - LLM_KV_GENERAL_QUANTIZATION_VERSION, - LLM_KV_GENERAL_ALIGNMENT, - LLM_KV_GENERAL_NAME, - LLM_KV_GENERAL_AUTHOR, - LLM_KV_GENERAL_URL, - LLM_KV_GENERAL_DESCRIPTION, - LLM_KV_GENERAL_LICENSE, - LLM_KV_GENERAL_SOURCE_URL, - LLM_KV_GENERAL_SOURCE_HF_REPO, - - LLM_KV_CONTEXT_LENGTH, - LLM_KV_EMBEDDING_LENGTH, - LLM_KV_BLOCK_COUNT, - LLM_KV_FEED_FORWARD_LENGTH, - LLM_KV_USE_PARALLEL_RESIDUAL, - LLM_KV_TENSOR_DATA_LAYOUT, - - LLM_KV_ATTENTION_HEAD_COUNT, - LLM_KV_ATTENTION_HEAD_COUNT_KV, - LLM_KV_ATTENTION_MAX_ALIBI_BIAS, - LLM_KV_ATTENTION_CLAMP_KQV, - LLM_KV_ATTENTION_LAYERNORM_EPS, - LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, - - LLM_KV_ROPE_DIMENSION_COUNT, - LLM_KV_ROPE_FREQ_BASE, - LLM_KV_ROPE_SCALE_LINEAR, - LLM_KV_ROPE_SCALING_TYPE, - LLM_KV_ROPE_SCALING_FACTOR, - LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, - LLM_KV_ROPE_SCALING_FINETUNED, - - LLM_KV_TOKENIZER_MODEL, - LLM_KV_TOKENIZER_LIST, - LLM_KV_TOKENIZER_TOKEN_TYPE, - LLM_KV_TOKENIZER_SCORES, - LLM_KV_TOKENIZER_MERGES, - LLM_KV_TOKENIZER_BOS_ID, - LLM_KV_TOKENIZER_EOS_ID, - LLM_KV_TOKENIZER_UNK_ID, - LLM_KV_TOKENIZER_SEP_ID, - LLM_KV_TOKENIZER_PAD_ID, - LLM_KV_TOKENIZER_ADD_BOS, - LLM_KV_TOKENIZER_ADD_EOS, - LLM_KV_TOKENIZER_HF_JSON, - LLM_KV_TOKENIZER_RWKV, -}; static std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, @@ -311,228 +250,196 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, }; -struct LLM_KV { - LLM_KV(llm_arch arch) : arch(arch) {} - - llm_arch arch; - - std::string operator()(llm_kv kv) const { - return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str()); - } -}; - -enum llm_tensor { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_TOKEN_EMBD_NORM, - LLM_TENSOR_POS_EMBD, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_ROPE_FREQS, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_QKV, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_NORM_2, - LLM_TENSOR_ATTN_ROT_EMBD, - LLM_TENSOR_FFN_GATE, - LLM_TENSOR_FFN_DOWN, - LLM_TENSOR_FFN_UP, - LLM_TENSOR_FFN_NORM, - LLM_TENSOR_ATTN_Q_NORM, - LLM_TENSOR_ATTN_K_NORM, -}; static std::map> LLM_TENSOR_NAMES = { { - LLM_ARCH_LLAMA, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, + LLM_ARCH_LLAMA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { 
LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, }, { - LLM_ARCH_BAICHUAN, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, + LLM_ARCH_BAICHUAN, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, }, { - LLM_ARCH_FALCON, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, + LLM_ARCH_FALCON, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, }, { - LLM_ARCH_GPT2, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - }, + LLM_ARCH_GPT2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + }, }, { - LLM_ARCH_GPTJ, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - }, + LLM_ARCH_GPTJ, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + }, }, { - LLM_ARCH_GPTNEOX, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, + LLM_ARCH_GPTNEOX, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" 
}, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, }, { - LLM_ARCH_PERSIMMON, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd"}, - { LLM_TENSOR_OUTPUT_NORM, "output_norm"}, - { LLM_TENSOR_OUTPUT, "output"}, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"}, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"}, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"}, - { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"}, - { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"}, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"}, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"}, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"}, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"}, - }, + LLM_ARCH_PERSIMMON, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd"}, + { LLM_TENSOR_OUTPUT_NORM, "output_norm"}, + { LLM_TENSOR_OUTPUT, "output"}, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"}, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"}, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"}, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"}, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"}, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"}, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"}, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"}, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"}, + }, }, { - LLM_ARCH_MPT, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, + LLM_ARCH_MPT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, }, { - LLM_ARCH_STARCODER, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_POS_EMBD, "position_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - }, + LLM_ARCH_STARCODER, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, }, { - LLM_ARCH_REFACT, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { 
LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, + LLM_ARCH_REFACT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, }, { - LLM_ARCH_BLOOM, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - }, + LLM_ARCH_BLOOM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, }, { - LLM_ARCH_STABLELM, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - }, + LLM_ARCH_STABLELM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, }, { - LLM_ARCH_UNKNOWN, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - }, + LLM_ARCH_UNKNOWN, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + }, }, }; static llm_arch llm_arch_from_string(const std::string & name) { for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT - if (kv.second == name) { - return kv.first; - } + if (kv.second == name) { + return kv.first; + } } return LLM_ARCH_UNKNOWN; @@ -547,27 +454,6 @@ static llm_arch llm_arch_from_string(const std::string & name) { // std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> 
"token_embd.bias" // std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight" // -struct LLM_TN { - LLM_TN(llm_arch arch) : arch(arch) {} - - llm_arch arch; - - std::string operator()(llm_tensor tensor) const { - return LLM_TENSOR_NAMES[arch].at(tensor); - } - - std::string operator()(llm_tensor tensor, const std::string & suffix) const { - return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix; - } - - std::string operator()(llm_tensor tensor, int bid) const { - return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); - } - - std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { - return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix; - } -}; // // gguf helpers @@ -578,13 +464,13 @@ do { \ const std::string skey(key); \ const int kid = gguf_find_key(ctx, skey.c_str()); \ if (kid >= 0) { \ - enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \ - } \ - (dst) = func(ctx, kid); \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \ + } \ + (dst) = func(ctx, kid); \ } else if (req) { \ - throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ + throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ } \ } while (0) @@ -596,9 +482,9 @@ static std::map LLAMA_ROPE_SCALING_TYPES = { static int8_t llama_rope_scaling_type_from_string(const std::string & name) { for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { - if (kv.second == name) { - return kv.first; - } + if (kv.second == name) { + return kv.first; + } } return LLAMA_ROPE_SCALING_UNSPECIFIED; @@ -606,18 +492,18 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) { static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { switch (type) { - case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); - case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); - case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); - case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); - case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); - case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); - case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); - case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); - case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); - case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); - case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? 
"true" : "false"; - default: return format("unknown type %d", type); + case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); + case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); + case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return format("unknown type %d", type); } } @@ -625,36 +511,36 @@ static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) { const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); switch (type) { - case GGUF_TYPE_STRING: - return gguf_get_val_str(ctx_gguf, i); - case GGUF_TYPE_ARRAY: - { - const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); - int arr_n = gguf_get_arr_n(ctx_gguf, i); - const void * data = gguf_get_arr_data(ctx_gguf, i); - std::stringstream ss; - ss << "["; - for (int j = 0; j < arr_n; j++) { - if (arr_type == GGUF_TYPE_STRING) { - std::string val = gguf_get_arr_str(ctx_gguf, i, j); - // escape quotes - replace_all(val, "\\", "\\\\"); - replace_all(val, "\"", "\\\""); - ss << '"' << val << '"'; - } else if (arr_type == GGUF_TYPE_ARRAY) { - ss << "???"; - } else { - ss << gguf_data_to_str(arr_type, data, j); - } - if (j < arr_n - 1) { - ss << ", "; - } - } - ss << "]"; - return ss.str(); - } - default: - return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); + case GGUF_TYPE_STRING: + return gguf_get_val_str(ctx_gguf, i); + case GGUF_TYPE_ARRAY: + { + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); + int arr_n = gguf_get_arr_n(ctx_gguf, i); + const void * data = gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == GGUF_TYPE_STRING) { + std::string val = gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + replace_all(val, "\\", "\\\\"); + replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); } } @@ -666,8 +552,8 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); + buf.resize(plan.work_size); + plan.work_data = buf.data(); } ggml_graph_compute(graph, &plan); @@ -680,9 +566,9 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * inline void * llama_host_malloc(size_t n) { #ifdef GGML_USE_CUBLAS if (ggml_cublas_loaded()) { - return ggml_cuda_host_malloc(n); + return ggml_cuda_host_malloc(n); } else { - return malloc(n); + return malloc(n); } #elif GGML_USE_METAL return ggml_metal_host_malloc(n); @@ -696,9 +582,9 @@ inline void * llama_host_malloc(size_t n) { inline void llama_host_free(void * 
ptr) { #ifdef GGML_USE_CUBLAS if (ggml_cublas_loaded()) { - return ggml_cuda_host_free(ptr); + return ggml_cuda_host_free(ptr); } else { - return free(ptr); + return free(ptr); } #elif GGML_USE_METAL return ggml_metal_host_free(ptr); @@ -713,9 +599,9 @@ inline void llama_host_free(void * ptr) { static std::string llama_format_win_err(DWORD err) { LPSTR buf; size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); + NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); if (!size) { - return "FormatMessageA failed"; + return "FormatMessageA failed"; } std::string ret(buf, size); LocalFree(buf); @@ -723,374 +609,353 @@ static std::string llama_format_win_err(DWORD err) { } #endif -struct llama_buffer { - void * data = NULL; - size_t size = 0; - - // fallback to malloc / free - // useful in cases where CUDA can try to allocate PINNED memory - bool fallback = false; +//struct llama_buffer { - void resize(size_t n) { - llama_host_free(data); +void llama_buffer::resize(size_t n) { + llama_host_free(data); - data = llama_host_malloc(n); - if (!data) { - fallback = true; - data = malloc(n); - } else { - fallback = false; - } + data = llama_host_malloc(n); + if (!data) { + fallback = true; + data = malloc(n); + } else { + fallback = false; + } - GGML_ASSERT(data); - size = n; + GGML_ASSERT(data); + size = n; } - ~llama_buffer() { - if (data) { - if (fallback) { // NOLINT - free(data); - } else { - llama_host_free(data); - } - } +llama_buffer::~llama_buffer() { + if (data) { + if (fallback) { // NOLINT + free(data); + } else { + llama_host_free(data); + } + } - data = NULL; + data = NULL; } -}; -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); + +llama_file::llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); } - size_t tell() const { +size_t llama_file::tell() const { #ifdef _WIN32 - __int64 ret = _ftelli64(fp); + __int64 ret = _ftelli64(fp); #else - long ret = std::ftell(fp); + long ret = std::ftell(fp); #endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; } - void seek(size_t offset, int whence) const { +void llama_file::seek(size_t offset, int whence) const { + #ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); + int ret = _fseeki64(fp, (__int64) offset, whence); #else - int ret = std::fseek(fp, (long) offset, whence); + int ret = std::fseek(fp, (long) offset, whence); #endif - GGML_ASSERT(ret == 0); // same + GGML_ASSERT(ret == 0); // same } - void read_raw(void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, len, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } +void llama_file::read_raw(void 
* ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, len, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } } - uint32_t read_u32() const { - uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; +uint32_t llama_file::read_u32() const { + uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; } - void write_raw(const void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, len, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } +void llama_file::write_raw(const void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, len, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } } - void write_u32(std::uint32_t val) const { - write_raw(&val, sizeof(val)); +void llama_file::write_u32(std::uint32_t val) const { + write_raw(&val, sizeof(val)); } - ~llama_file() { - if (fp) { - std::fclose(fp); - } +llama_file::~llama_file() { + if (fp) { + std::fclose(fp); + } } -}; -struct llama_mmap { - void * addr; - size_t size; - llama_mmap(const llama_mmap &) = delete; +// + #ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } +llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch , bool numa ) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } #ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } + if (prefetch) { flags |= MAP_POPULATE; } #endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error(format("mmap failed: %s", strerror(errno))); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) { - fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) { - fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~llama_mmap() { - munmap(addr, size); + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error(format("mmap failed: %s", strerror(errno))); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) { + fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) { + fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", 
+ strerror(errno)); + } + } + } + +llama_mmap::~llama_mmap() { + munmap(addr, size); } #elif defined(_WIN32) - static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { - (void) numa; +llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch = 1, bool numa = false) { + (void) numa; - size = file->size; + size = file->size; - HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); - HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); - DWORD error = GetLastError(); + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + DWORD error = GetLastError(); - if (hMapping == NULL) { - throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); - } + if (hMapping == NULL) { + throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); + } - addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); - error = GetLastError(); - CloseHandle(hMapping); + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + error = GetLastError(); + CloseHandle(hMapping); - if (addr == NULL) { - throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); - } + if (addr == NULL) { + throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); + } - if (prefetch) { - // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it - BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); - HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); + if (prefetch == 1) { + // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it + BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); + HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); - // may fail on pre-Windows 8 systems - pPrefetchVirtualMemory = reinterpret_cast (GetProcAddress(hKernel32, "PrefetchVirtualMemory")); + // may fail on pre-Windows 8 systems + pPrefetchVirtualMemory = reinterpret_cast (GetProcAddress(hKernel32, "PrefetchVirtualMemory")); - if (pPrefetchVirtualMemory) { - // advise the kernel to preload the mapped memory - WIN32_MEMORY_RANGE_ENTRY range; - range.VirtualAddress = addr; - range.NumberOfBytes = (SIZE_T)size; - if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { - fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - } - } - } + if (pPrefetchVirtualMemory) { + // advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } + } } - ~llama_mmap() { - if (!UnmapViewOfFile(addr)) { - fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - } +llama_mmap::~llama_mmap() { + if (!UnmapViewOfFile(addr)) { + fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } } #else static constexpr bool SUPPORTED = false; llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { - (void) file; - (void) prefetch; 
- (void) numa; + (void) file; + (void) prefetch; + (void) numa; - throw std::runtime_error(std::string("mmap not supported")); + throw std::runtime_error(std::string("mmap not supported")); } #endif -}; + // Represents some region of memory being locked using mlock or VirtualLock; // will automatically unlock on destruction. -struct llama_mlock { - void * addr = NULL; - size_t size = 0; +// llama_mlock - bool failed_already = false; - llama_mlock() {} - llama_mlock(const llama_mlock &) = delete; +llama_mlock::llama_mlock() {} - ~llama_mlock() { - if (size) { - raw_unlock(addr, size); - } +llama_mlock::~llama_mlock() { + if (size) { + raw_unlock(addr, size); + } } - void init(void * ptr) { - GGML_ASSERT(addr == NULL && size == 0); // NOLINT - addr = ptr; +void llama_mlock::init(void * ptr) { + GGML_ASSERT(addr == NULL && size == 0); // NOLINT + addr = ptr; } - void grow_to(size_t target_size) { - GGML_ASSERT(addr); - if (failed_already) { - return; - } - size_t granularity = lock_granularity(); - target_size = (target_size + granularity - 1) & ~(granularity - 1); - if (target_size > size) { - if (raw_lock((uint8_t *) addr + size, target_size - size)) { - size = target_size; - } else { - failed_already = true; - } - } +void llama_mlock::grow_to(size_t target_size) { + GGML_ASSERT(addr); + if (failed_already) { + return; + } + size_t granularity = lock_granularity(); + target_size = (target_size + granularity - 1) & ~(granularity - 1); + if (target_size > size) { + if (raw_lock((uint8_t *) addr + size, target_size - size)) { + size = target_size; + } else { + failed_already = true; + } + } } #ifdef _POSIX_MEMLOCK_RANGE - static constexpr bool SUPPORTED = true; - static size_t lock_granularity() { - return (size_t) sysconf(_SC_PAGESIZE); +size_t llama_mlock::lock_granularity() { + return (size_t) sysconf(_SC_PAGESIZE); } #ifdef __APPLE__ - #define MLOCK_SUGGESTION \ - "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ - "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" + #define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" #else - #define MLOCK_SUGGESTION \ - "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" + #define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" #endif - bool raw_lock(const void * addr, size_t size) const { - if (!mlock(addr, size)) { - return true; - } + bool llama_mlock::raw_lock(const void * addr, size_t size) const { + if (!mlock(addr, size)) { + return true; + } - char* errmsg = std::strerror(errno); - bool suggest = (errno == ENOMEM); + char* errmsg = std::strerror(errno); + bool suggest = (errno == ENOMEM); - // Check if the resource limit is fine after all - struct rlimit lock_limit; - if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { - suggest = false; - } - if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { - suggest = false; - } + // Check if the resource limit is fine after all + struct rlimit lock_limit; + if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { + suggest = false; + } + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { + suggest = false; + } - fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", - size, this->size, errmsg, suggest ? 
MLOCK_SUGGESTION : ""); - return false; + fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", + size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); + return false; } #undef MLOCK_SUGGESTION - static void raw_unlock(void * addr, size_t size) { - if (munlock(addr, size)) { - fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); - } + void llama_mlock::raw_unlock(void * addr, size_t size) { + if (munlock(addr, size)) { + fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); + } } #elif defined(_WIN32) - static constexpr bool SUPPORTED = true; - - static size_t lock_granularity() { - SYSTEM_INFO si; - GetSystemInfo(&si); - return (size_t) si.dwPageSize; - } - - bool raw_lock(void * ptr, size_t len) const { - for (int tries = 1; ; tries++) { - if (VirtualLock(ptr, len)) { - return true; - } - if (tries == 2) { - fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", - len, size, llama_format_win_err(GetLastError()).c_str()); - return false; - } - - // It failed but this was only the first try; increase the working - // set size and try again. - SIZE_T min_ws_size, max_ws_size; - if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { - fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - return false; - } - // Per MSDN: "The maximum number of pages that a process can lock - // is equal to the number of pages in its minimum working set minus - // a small overhead." - // Hopefully a megabyte is enough overhead: - size_t increment = len + 1048576; - // The minimum must be <= the maximum, so we need to increase both: - min_ws_size += increment; - max_ws_size += increment; - if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { - fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - return false; - } - } - } - - static void raw_unlock(void * ptr, size_t len) { - if (!VirtualUnlock(ptr, len)) { - fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", - llama_format_win_err(GetLastError()).c_str()); - } + + + size_t llama_mlock::lock_granularity() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return (size_t) si.dwPageSize; + } + + bool llama_mlock::raw_lock(void * ptr, size_t len) const { + for (int tries = 1; ; tries++) { + if (VirtualLock(ptr, len)) { + return true; + } + if (tries == 2) { + fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + len, size, llama_format_win_err(GetLastError()).c_str()); + return false; + } + + // It failed but this was only the first try; increase the working + // set size and try again. + SIZE_T min_ws_size, max_ws_size; + if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { + fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + // Per MSDN: "The maximum number of pages that a process can lock + // is equal to the number of pages in its minimum working set minus + // a small overhead." 
+ // Hopefully a megabyte is enough overhead: + size_t increment = len + 1048576; + // The minimum must be <= the maximum, so we need to increase both: + min_ws_size += increment; + max_ws_size += increment; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { + fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + } + } + + static void llama_mlock::raw_unlock(void * ptr, size_t len) { + if (!VirtualUnlock(ptr, len)) { + fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } } #else - static constexpr bool SUPPORTED = false; - - static size_t lock_granularity() { - return (size_t) 65536; + + static size_t llama_mlock::lock_granularity() { + return (size_t) 65536; } - bool raw_lock(const void * addr, size_t len) const { - fprintf(stderr, "warning: mlock not supported on this system\n"); - return false; + bool llama_mlock::raw_lock(const void * addr, size_t len) const { + fprintf(stderr, "warning: mlock not supported on this system\n"); + return false; } - static void raw_unlock(const void * addr, size_t len) {} + static void llama_mlock::raw_unlock(const void * addr, size_t len) {} #endif -}; + typedef void (*offload_func_t)(struct ggml_tensor * tensor); @@ -1102,12 +967,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); if (n_tokens < 0) { - result.resize(-n_tokens); - int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); - GGML_ASSERT(check == -n_tokens); + result.resize(-n_tokens); + int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); + GGML_ASSERT(check == -n_tokens); } else { - result.resize(n_tokens); + result.resize(n_tokens); } return std::string(result.data(), result.size()); @@ -1117,383 +982,26 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ // globals // -struct llama_state { - // We save the log callback globally - ggml_log_callback log_callback = llama_log_callback_default; - void * log_callback_user_data = nullptr; -}; - static llama_state g_state; -// available llama models -enum e_model { - MODEL_UNKNOWN, - MODEL_1B, - MODEL_3B, - MODEL_7B, - MODEL_8B, - MODEL_13B, - MODEL_15B, - MODEL_30B, - MODEL_34B, - MODEL_40B, - MODEL_65B, - MODEL_70B, -}; static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; -struct llama_hparams { - bool vocab_only; - uint32_t n_vocab; - uint32_t n_ctx_train; // context size the model was trained on - uint32_t n_embd; - uint32_t n_head; - uint32_t n_head_kv; - uint32_t n_layer; - uint32_t n_rot; - uint32_t n_ff; - - float f_norm_eps; - float f_norm_rms_eps; - - float rope_freq_base_train; - float rope_freq_scale_train; - uint32_t n_yarn_orig_ctx; - int8_t rope_scaling_type_train : 3; - bool rope_finetuned : 1; - - float f_clamp_kqv; - float f_max_alibi_bias; - - bool operator!=(const llama_hparams & other) const { - if (this->vocab_only != other.vocab_only) return true; - if (this->n_vocab != other.n_vocab) return true; - if (this->n_ctx_train != other.n_ctx_train) return true; - if (this->n_embd != other.n_embd) return true; - if (this->n_head != other.n_head) return true; - if (this->n_head_kv != other.n_head_kv) return true; - if (this->n_layer != 
other.n_layer) return true; - if (this->n_rot != other.n_rot) return true; - if (this->n_ff != other.n_ff) return true; - if (this->rope_finetuned != other.rope_finetuned) return true; - if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; - - const float EPSILON = 1e-9; - - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; - if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; - - return false; - } - - uint32_t n_gqa() const { - return n_head/n_head_kv; - } - - uint32_t n_embd_head() const { - return n_embd/n_head; - } - - uint32_t n_embd_gqa() const { - return n_embd/n_gqa(); - } -}; - -struct llama_cparams { - uint32_t n_ctx; // context size used during inference - uint32_t n_batch; - uint32_t n_threads; // number of threads to use for generation - uint32_t n_threads_batch; // number of threads to use for batch processing - - float rope_freq_base; - float rope_freq_scale; - - uint32_t n_yarn_orig_ctx; - // These hyperparameters are not exposed in GGUF, because all - // existing YaRN models use the same values for them. - float yarn_ext_factor; - float yarn_attn_factor; - float yarn_beta_fast; - float yarn_beta_slow; - - bool mul_mat_q; -}; - -struct llama_layer { - // normalization - struct ggml_tensor * attn_norm; - struct ggml_tensor * attn_norm_b; - struct ggml_tensor * attn_norm_2; - struct ggml_tensor * attn_norm_2_b; - struct ggml_tensor * attn_q_norm; - struct ggml_tensor * attn_q_norm_b; - struct ggml_tensor * attn_k_norm; - struct ggml_tensor * attn_k_norm_b; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - struct ggml_tensor * wqkv; - - // attention bias - struct ggml_tensor * bo; - struct ggml_tensor * bqkv; - - // normalization - struct ggml_tensor * ffn_norm; - struct ggml_tensor * ffn_norm_b; - - // ff - struct ggml_tensor * ffn_gate; // w1 - struct ggml_tensor * ffn_down; // w2 - struct ggml_tensor * ffn_up; // w3 - - // ff bias - struct ggml_tensor * ffn_down_b; // b2 - struct ggml_tensor * ffn_up_b; // b3 -}; - -struct llama_kv_cell { - llama_pos pos = -1; - llama_pos delta = 0; - - std::set seq_id; - - bool has_seq_id(const llama_seq_id & id) const { - return seq_id.find(id) != seq_id.end(); - } -}; - -// ring-buffer of cached KV data -struct llama_kv_cache { - bool has_shift = false; - - // Note: The value of head isn't only used to optimize searching - // for a free KV slot. llama_decode_internal also uses it, so it - // cannot be freely changed after a slot has been allocated. 
- uint32_t head = 0;
- uint32_t size = 0;
-
- // computed before each graph build
- uint32_t n = 0;
-
- std::vector<llama_kv_cell> cells;
-
- struct ggml_tensor * k = NULL;
- struct ggml_tensor * v = NULL;
-
- struct ggml_context * ctx = NULL;
-
- llama_buffer buf;
-
- ~llama_kv_cache() {
- if (ctx) {
- ggml_free(ctx);
- }
-
-#ifdef GGML_USE_CUBLAS
- if (ggml_cublas_loaded()) {
- ggml_cuda_free_data(k);
- ggml_cuda_free_data(v);
- }
-#endif
- }
-};
-
-struct llama_vocab {
- using id = int32_t;
- using token = std::string;
- using ttype = llama_token_type;
-
- struct token_data {
- token text;
- float score;
- ttype type;
- };
-
- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
-
- std::unordered_map<token, id> token_to_id;
- std::vector<token_data> id_to_token;
-
- std::unordered_map<token, id> special_tokens_cache;
-
- std::map<std::pair<std::string, std::string>, int> bpe_ranks;
-
- // default LLaMA special tokens
- id special_bos_id = 1;
- id special_eos_id = 2;
- id special_unk_id = 0;
- id special_sep_id = -1;
- id special_pad_id = -1;
-
- int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
- int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
-
- id linefeed_id = 13;
- id special_prefix_id = 32007;
- id special_middle_id = 32009;
- id special_suffix_id = 32008;
- id special_eot_id = 32010;
-
- int find_bpe_rank(std::string token_left, std::string token_right) const {
- GGML_ASSERT(token_left.find(" ") == std::string::npos);
- GGML_ASSERT(token_left.find("\n") == std::string::npos);
- GGML_ASSERT(token_right.find(" ") == std::string::npos);
- GGML_ASSERT(token_right.find("\n") == std::string::npos);
-
- auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
- if (it == bpe_ranks.end()) {
- return -1;
- }
-
- return it->second;
- }
-};
-
-struct llama_model {
- e_model type = MODEL_UNKNOWN;
- llm_arch arch = LLM_ARCH_UNKNOWN;
- llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
- std::string name = "n/a";
-
- llama_hparams hparams = {};
- llama_vocab vocab;
-
- struct ggml_tensor * tok_embd;
- struct ggml_tensor * pos_embd;
- struct ggml_tensor * tok_norm;
- struct ggml_tensor * tok_norm_b;
-
- struct ggml_tensor * output_norm;
- struct ggml_tensor * output_norm_b;
- struct ggml_tensor * output;
-
- std::vector<llama_layer> layers;
-
- int n_gpu_layers;
-
- // gguf metadata
- std::unordered_map<std::string, std::string> gguf_kv;
-
- // context
- struct ggml_context * ctx = NULL;
-
- // the model memory buffer
- llama_buffer buf;
-
- // model memory mapped file
- std::unique_ptr<llama_mmap> mapping;
-
- // objects representing data potentially being locked in memory
- llama_mlock mlock_buf;
- llama_mlock mlock_mmap;
-
- // for quantize-stats only
- std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
-
- int64_t t_load_us = 0;
- int64_t t_start_us = 0;
-
- ~llama_model() {
- if (ctx) {
- ggml_free(ctx);
- }
-
-#ifdef GGML_USE_CUBLAS
- if (ggml_cublas_loaded()) {
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
- ggml_cuda_free_data(tensors_by_name[i].second);
- }
- ggml_cuda_free_scratch();
- }
-#endif
-
-#if defined(GGML_USE_CLBLAST)
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
- ggml_cl_free_data(tensors_by_name[i].second);
- }
-#endif
- }
-};
-
-struct llama_context {
- llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
- ~llama_context() {
-#ifdef GGML_USE_METAL
- if (ctx_metal) {
- ggml_metal_free(ctx_metal);
- }
-#endif
- if (alloc) {
- ggml_allocr_free(alloc);
- }
- }
-
- llama_cparams cparams;
-
- const llama_model & model;
-
- // key + value cache for the self attention
- struct llama_kv_cache kv_self;
-
- std::mt19937 rng;
-
- bool has_evaluated_once = false;
-
- int64_t t_start_us;
- int64_t t_load_us;
- int64_t t_sample_us = 0;
- int64_t t_p_eval_us = 0;
- int64_t t_eval_us = 0;
-
- int32_t n_sample = 0; // number of tokens sampled
- int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
- int32_t n_eval = 0; // number of eval calls
-
- // decode output (2-dimensional array: [n_tokens][n_vocab])
- std::vector<float> logits;
- bool logits_all = false;
-
- // input embedding (1-dimensional array: [n_embd])
- std::vector<float> embedding;
- // reusable buffer for `struct ggml_graph_plan.work_data`
- std::vector<uint8_t> work_buffer;
-
- // memory buffers used to evaluate the model
- llama_buffer buf_compute;
-
- llama_buffer buf_alloc;
- ggml_allocr * alloc = NULL;
-
-#ifdef GGML_USE_METAL
- ggml_metal_context * ctx_metal = NULL;
-#endif
-
-#ifdef GGML_USE_MPI
- ggml_mpi_context * ctx_mpi = NULL;
-#endif
-};
//
// kv cache helpers
//
static bool llama_kv_cache_init(
- const struct llama_hparams & hparams,
- struct llama_kv_cache & cache,
- ggml_type wtype,
- uint32_t n_ctx,
- int n_gpu_layers) {
+ const struct llama_hparams & hparams,
+ struct llama_kv_cache & cache,
+ ggml_type wtype,
+ uint32_t n_ctx,
+ int n_gpu_layers) {
+ fprintf(stderr, "GPULAYERS '%d'\n", n_gpu_layers);
const uint32_t n_embd = hparams.n_embd_gqa();
const uint32_t n_layer = hparams.n_layer;
@@ -1519,8 +1027,8 @@ static bool llama_kv_cache_init(
cache.ctx = ggml_init(params);
if (!cache.ctx) {
- LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
- return false;
+ LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
+ return false;
}
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
@@ -1531,23 +1039,26 @@ static bool llama_kv_cache_init(
(void) n_gpu_layers;
#ifdef GGML_USE_CUBLAS
+ fprintf(stderr, "USE CUBLAS\n");
if (ggml_cublas_loaded()) {
- size_t vram_kv_cache = 0;
-
- if (n_gpu_layers > (int)n_layer + 1) {
- ggml_cuda_assign_buffers_no_scratch(cache.v);
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
- vram_kv_cache += ggml_nbytes(cache.v);
- }
- if (n_gpu_layers > (int)n_layer + 2) {
- ggml_cuda_assign_buffers_no_scratch(cache.k);
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
- vram_kv_cache += ggml_nbytes(cache.k);
- }
- if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
- }
- }
+ size_t vram_kv_cache = 0;
+
+ if (n_gpu_layers > (int)n_layer + 1) {
+ ggml_cuda_assign_buffers_no_scratch(cache.v);
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+ vram_kv_cache += ggml_nbytes(cache.v);
+ }
+ if (n_gpu_layers > (int)n_layer + 2) {
+ ggml_cuda_assign_buffers_no_scratch(cache.k);
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+ vram_kv_cache += ggml_nbytes(cache.k);
+ }
+ if (vram_kv_cache > 0) {
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ }
+ }
+ #else
+ fprintf(stderr, "NO USE CUBLAS\n");
#endif
return true;
@@ -1558,51 +1069,51 @@ static bool llama_kv_cache_init(
// Note: On success, it's important that cache.head points
- // to the first cell of the slot.
static bool llama_kv_cache_find_slot(
- struct llama_kv_cache & cache,
- const struct llama_batch & batch) {
+ struct llama_kv_cache & cache,
+ const struct llama_batch & batch) {
const uint32_t n_ctx = cache.size;
const uint32_t n_tokens = batch.n_tokens;
if (n_tokens > n_ctx) {
- LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
- return false;
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+ return false;
}
uint32_t n_tested = 0;
while (true) {
- if (cache.head + n_tokens > n_ctx) {
- n_tested += n_ctx - cache.head;
- cache.head = 0;
- continue;
- }
-
- bool found = true;
- for (uint32_t i = 0; i < n_tokens; i++) {
- if (cache.cells[cache.head + i].pos >= 0) {
- found = false;
- cache.head += i + 1;
- n_tested += i + 1;
- break;
- }
- }
-
- if (found) {
- break;
- }
-
- if (n_tested >= n_ctx) {
- //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
- return false;
- }
+ if (cache.head + n_tokens > n_ctx) {
+ n_tested += n_ctx - cache.head;
+ cache.head = 0;
+ continue;
+ }
+
+ bool found = true;
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ if (cache.cells[cache.head + i].pos >= 0) {
+ found = false;
+ cache.head += i + 1;
+ n_tested += i + 1;
+ break;
+ }
+ }
+
+ if (found) {
+ break;
+ }
+
+ if (n_tested >= n_ctx) {
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+ return false;
+ }
}
for (uint32_t i = 0; i < n_tokens; i++) {
- cache.cells[cache.head + i].pos = batch.pos[i];
+ cache.cells[cache.head + i].pos = batch.pos[i];
- for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
- cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
- }
+ for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
+ cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
+ }
}
return true;
@@ -1611,9 +1122,9 @@ static bool llama_kv_cache_find_slot(
// find how many cells are currently in use
static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
for (uint32_t i = cache.size - 1; i > 0; --i) {
- if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
- return i + 1;
- }
+ if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
+ return i + 1;
+ }
}
return 0;
@@ -1621,36 +1132,36 @@ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
- cache.cells[i].pos = -1;
- cache.cells[i].seq_id.clear();
+ cache.cells[i].pos = -1;
+ cache.cells[i].seq_id.clear();
}
cache.head = 0;
}
static void llama_kv_cache_seq_rm(
- struct llama_kv_cache & cache,
- llama_seq_id seq_id,
- llama_pos p0,
- llama_pos p1) {
+ struct llama_kv_cache & cache,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1) {
uint32_t new_head = cache.size;
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
for (uint32_t i = 0; i < cache.size; ++i) {
- if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
- if (seq_id < 0) {
- cache.cells[i].seq_id.clear();
- } else if (cache.cells[i].has_seq_id(seq_id)) {
- cache.cells[i].seq_id.erase(seq_id);
- } else {
- continue;
- }
- if (cache.cells[i].seq_id.empty()) {
- cache.cells[i].pos = -1;
- if (new_head == cache.size) new_head = i;
- }
- }
+ if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+ if (seq_id < 0) {
+ cache.cells[i].seq_id.clear();
+ } else if (cache.cells[i].has_seq_id(seq_id)) {
+ cache.cells[i].seq_id.erase(seq_id);
+ } else {
+ continue;
+ }
+ if (cache.cells[i].seq_id.empty()) {
+ cache.cells[i].pos = -1;
+ if (new_head == cache.size) new_head = i;
+ }
+ }
}
// If we freed up a slot, set head to it so searching can start there.
@@ -1658,20 +1169,20 @@ static void llama_kv_cache_seq_rm(
}
static void llama_kv_cache_seq_cp(
- struct llama_kv_cache & cache,
- llama_seq_id seq_id_src,
- llama_seq_id seq_id_dst,
- llama_pos p0,
- llama_pos p1) {
+ struct llama_kv_cache & cache,
+ llama_seq_id seq_id_src,
+ llama_seq_id seq_id_dst,
+ llama_pos p0,
+ llama_pos p1) {
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
cache.head = 0;
for (uint32_t i = 0; i < cache.size; ++i) {
- if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
- cache.cells[i].seq_id.insert(seq_id_dst);
- }
+ if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+ cache.cells[i].seq_id.insert(seq_id_dst);
+ }
}
}
@@ -1679,14 +1190,14 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
uint32_t new_head = cache.size;
for (uint32_t i = 0; i < cache.size; ++i) {
- if (!cache.cells[i].has_seq_id(seq_id)) {
- cache.cells[i].pos = -1;
- cache.cells[i].seq_id.clear();
- if (new_head == cache.size) new_head = i;
- } else {
- cache.cells[i].seq_id.clear();
- cache.cells[i].seq_id.insert(seq_id);
- }
+ if (!cache.cells[i].has_seq_id(seq_id)) {
+ cache.cells[i].pos = -1;
+ cache.cells[i].seq_id.clear();
+ if (new_head == cache.size) new_head = i;
+ } else {
+ cache.cells[i].seq_id.clear();
+ cache.cells[i].seq_id.insert(seq_id);
+ }
}
// If we freed up a slot, set head to it so searching can start there.
@@ -1694,28 +1205,28 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
}
static void llama_kv_cache_seq_shift(
- struct llama_kv_cache & cache,
- llama_seq_id seq_id,
- llama_pos p0,
- llama_pos p1,
- llama_pos delta) {
+ struct llama_kv_cache & cache,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ llama_pos delta) {
uint32_t new_head = cache.size;
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
for (uint32_t i = 0; i < cache.size; ++i) {
- if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
- cache.has_shift = true;
- cache.cells[i].pos += delta;
- cache.cells[i].delta += delta;
-
- if (cache.cells[i].pos < 0) {
- cache.cells[i].pos = -1;
- cache.cells[i].seq_id.clear();
- if (new_head == cache.size) new_head = i;
- }
- }
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+ cache.has_shift = true;
+ cache.cells[i].pos += delta;
+ cache.cells[i].delta += delta;
+
+ if (cache.cells[i].pos < 0) {
+ cache.cells[i].pos = -1;
+ cache.cells[i].seq_id.clear();
+ if (new_head == cache.size) new_head = i;
+ }
+ }
}
// If we freed up a slot, set head to it so searching can start there.
@@ -1727,17 +1238,12 @@ static void llama_kv_cache_seq_shift( // model loading and saving // -enum llama_fver { - GGUF_FILE_VERSION_V1 = 1, - GGUF_FILE_VERSION_V2 = 2, - GGUF_FILE_VERSION_V3 = 3, -}; static const char * llama_file_version_name(llama_fver version) { switch (version) { - case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; - case GGUF_FILE_VERSION_V2: return "GGUF V2"; - case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; + case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; + case GGUF_FILE_VERSION_V2: return "GGUF V2"; + case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; } return "unknown"; @@ -1747,7 +1253,7 @@ static std::string llama_format_tensor_shape(const std::vector & ne) { char buf[256]; snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0)); for (size_t i = 1; i < ne.size(); i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i)); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i)); } return buf; } @@ -1756,341 +1262,324 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) { char buf[256]; snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]); for (int i = 1; i < GGML_MAX_DIMS; i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]); } return buf; } -struct llama_model_loader { - int n_kv = 0; - int n_tensors = 0; - int n_created = 0; - int64_t n_elements = 0; - size_t n_bytes = 0; +llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") { + struct gguf_init_params params( + /*.no_alloc =*/ true, + /*.ctx = */ &ctx_meta + ); + + ctx_gguf = gguf_init_from_file(fname.c_str(), params); + if (!ctx_gguf) { + throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); + } + + n_kv = gguf_get_n_kv(ctx_gguf); + n_tensors = gguf_get_n_tensors(ctx_gguf); + + fver = (enum llama_fver ) gguf_get_version(ctx_gguf); + + for (int i = 0; i < n_tensors; i++) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name); + n_elements += ggml_nelements(t); + n_bytes += ggml_nbytes(t); + } + + LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", + __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); + + // determine file type based on the number of tensors for each quantization and print meta data + // TODO: make optional + { + std::map n_type; + + uint32_t n_type_max = 0; + enum ggml_type type_max = GGML_TYPE_F32; + + for (int i = 0; i < n_tensors; i++) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name); - bool use_mmap = false; + n_type[meta->type]++; - llama_file file; - llama_ftype ftype; - llama_fver fver; + if (n_type_max < n_type[meta->type]) { + n_type_max = n_type[meta->type]; + type_max = meta->type; + } - std::unique_ptr mapping; + LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str()); + } - struct gguf_context * ctx_gguf = NULL; - struct ggml_context * ctx_meta = NULL; + switch (type_max) { + case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; + case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; + case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; + 
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; + case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; + case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; + case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; + case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; + case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; + case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; + case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; + case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; + default: + { + LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); + ftype = LLAMA_FTYPE_ALL_F32; + } break; + } - llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") { - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, - }; + // this is a way to mark that we have "guessed" the file type + ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); - ctx_gguf = gguf_init_from_file(fname.c_str(), params); - if (!ctx_gguf) { - throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); - } + { + const int kid = gguf_find_key(ctx_gguf, "general.file_type"); + if (kid >= 0) { + ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid); + } + } - n_kv = gguf_get_n_kv(ctx_gguf); - n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i = 0; i < n_kv; i++) { + const char * name = gguf_get_key(ctx_gguf, i); + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + const std::string type_name = + type == GGUF_TYPE_ARRAY + ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i)) + : gguf_type_name(type); - fver = (enum llama_fver ) gguf_get_version(ctx_gguf); + std::string value = gguf_kv_to_str(ctx_gguf, i); + const size_t MAX_VALUE_LEN = 40; + if (value.size() > MAX_VALUE_LEN) { + value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); + } + replace_all(value, "\n", "\\n"); - for (int i = 0; i < n_tensors; i++) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name); - n_elements += ggml_nelements(t); - n_bytes += ggml_nbytes(t); - } + LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + } - LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", - __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); + // print type counts + for (auto & kv : n_type) { + if (kv.second == 0) { + continue; + } + + LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + } + } + + if (!llama_mmap::SUPPORTED) { + LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); + use_mmap = false; + } - // determine file type based on the number of tensors for each quantization and print meta data - // TODO: make optional - { - std::map n_type; - - uint32_t n_type_max = 0; - enum ggml_type type_max = GGML_TYPE_F32; - - for (int i = 0; i < n_tensors; i++) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name); - - n_type[meta->type]++; - - if (n_type_max < n_type[meta->type]) { - n_type_max = n_type[meta->type]; - type_max = meta->type; - } - - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), 
llama_format_tensor_shape(meta).c_str()); - } - - switch (type_max) { - case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; - case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; - case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; - case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; - case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; - case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; - case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; - case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; - case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; - case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; - case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; - case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; - default: - { - LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); - ftype = LLAMA_FTYPE_ALL_F32; - } break; - } - - // this is a way to mark that we have "guessed" the file type - ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); - - { - const int kid = gguf_find_key(ctx_gguf, "general.file_type"); - if (kid >= 0) { - ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid); - } - } - - for (int i = 0; i < n_kv; i++) { - const char * name = gguf_get_key(ctx_gguf, i); - const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); - const std::string type_name = - type == GGUF_TYPE_ARRAY - ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i)) - : gguf_type_name(type); - - std::string value = gguf_kv_to_str(ctx_gguf, i); - const size_t MAX_VALUE_LEN = 40; - if (value.size() > MAX_VALUE_LEN) { - value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); - } - replace_all(value, "\n", "\\n"); - - LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); - } - - // print type counts - for (auto & kv : n_type) { - if (kv.second == 0) { - continue; - } - - LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); - } - } - - if (!llama_mmap::SUPPORTED) { - LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); - use_mmap = false; - } - - this->use_mmap = use_mmap; + this->use_mmap = use_mmap; } - ~llama_model_loader() { - if (ctx_gguf) { - gguf_free(ctx_gguf); - } - if (ctx_meta) { - ggml_free(ctx_meta); - } + llama_model_loader::~llama_model_loader() { + if (ctx_gguf) { + gguf_free(ctx_gguf); + } + if (ctx_meta) { + ggml_free(ctx_meta); + } } - std::string get_arch_name() const { - const auto kv = LLM_KV(LLM_ARCH_UNKNOWN); + std::string llama_model_loader::get_arch_name() const { + const auto kv = LLM_KV(LLM_ARCH_UNKNOWN); - std::string arch_name; - GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE)); + std::string arch_name; + GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE)); - return arch_name; + return arch_name; } - enum llm_arch get_arch() const { - const std::string arch_name = get_arch_name(); + enum llm_arch llama_model_loader::get_arch() const { + const std::string arch_name = get_arch_name(); - return llm_arch_from_string(arch_name); + return llm_arch_from_string(arch_name); } - const char * get_tensor_name(int i) const { - return gguf_get_tensor_name(ctx_gguf, i); + const char * llama_model_loader::get_tensor_name(int i) const { + return 
gguf_get_tensor_name(ctx_gguf, i); } - struct ggml_tensor * get_tensor_meta(int i) const { - return ggml_get_tensor(ctx_meta, get_tensor_name(i)); + struct ggml_tensor * llama_model_loader::get_tensor_meta(int i) const { + return ggml_get_tensor(ctx_meta, get_tensor_name(i)); } - void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const { - ctx_size_p = 0; - mmapped_size_p = 0; + void llama_model_loader::calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const { + ctx_size_p = 0; + mmapped_size_p = 0; - for (int i = 0; i < n_tensors; i++) { - struct ggml_tensor * meta = get_tensor_meta(i); - ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; - (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta); - } + for (int i = 0; i < n_tensors; i++) { + struct ggml_tensor * meta = get_tensor_meta(i); + ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta); + } } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ctx, true); - } + struct ggml_tensor * llama_model_loader::create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { + if (backend != GGML_BACKEND_CPU) { + ggml_set_no_alloc(ctx, true); + } - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - tensor->backend = backend; // TODO: ggml_set_backend - ggml_set_name(tensor, ggml_get_name(meta)); + struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); + tensor->backend = backend; // TODO: ggml_set_backend + ggml_set_name(tensor, ggml_get_name(meta)); - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ctx, use_mmap); - } + if (backend != GGML_BACKEND_CPU) { + ggml_set_no_alloc(ctx, use_mmap); + } - n_created++; + n_created++; - return tensor; + return tensor; } - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); + struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend) { + struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); - if (cur == NULL) { - throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); - } + if (cur == NULL) { + throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); + } - if (backend == GGML_BACKEND_GPU_SPLIT) { - if (ne.size() == 1) { - throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); - } - } + if (backend == GGML_BACKEND_GPU_SPLIT) { + if (ne.size() == 1) { + throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); + } + } - { - bool is_ok = true; - for (size_t i = 0; i < ne.size(); ++i) { - if (ne[i] != cur->ne[i]) { - is_ok = false; - break; - } - } - if (!is_ok) { - throw std::runtime_error( - format("%s: tensor '%s' has wrong shape; expected %s, got %s", - __func__, name.c_str(), - llama_format_tensor_shape(ne).c_str(), - llama_format_tensor_shape(cur).c_str())); - } - } + { + bool is_ok = true; + for (size_t i = 0; i < ne.size(); ++i) { + if (ne[i] != cur->ne[i]) { + is_ok = false; + break; + } + } + if (!is_ok) { + throw std::runtime_error( + format("%s: tensor 
'%s' has wrong shape; expected %s, got %s", + __func__, name.c_str(), + llama_format_tensor_shape(ne).c_str(), + llama_format_tensor_shape(cur).c_str())); + } + } - return create_tensor_for(ctx, cur, backend); + return create_tensor_for(ctx, cur, backend); } - void done_getting_tensors() const { - if (n_created != n_tensors) { - throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); - } + void llama_model_loader::done_getting_tensors() const { + if (n_created != n_tensors) { + throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); + } } - size_t file_offset(const char * name) const { - const int idx = gguf_find_tensor(ctx_gguf, name); + size_t llama_model_loader::file_offset(const char * name) const { + const int idx = gguf_find_tensor(ctx_gguf, name); - if (idx < 0) { - throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name)); - } + if (idx < 0) { + throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name)); + } - return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); + return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); } - void load_data_for(struct ggml_tensor * cur) const { - const size_t offs = file_offset(ggml_get_name(cur)); + void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { + const size_t offs = file_offset(ggml_get_name(cur)); - if (use_mmap) { - cur->data = (uint8_t *) mapping->addr + offs; - } else { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); - } + if (use_mmap) { + cur->data = (uint8_t *) mapping->addr + offs; + } else { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); + } } - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { - size_t size_data = 0; - size_t size_lock = 0; - size_t size_pref = 0; // prefetch - - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); - if (cur->backend == GGML_BACKEND_CPU) { - size_pref += ggml_nbytes(cur); - } - } - - if (use_mmap) { - mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa())); - if (lmlock) { - lmlock->init(mapping->addr); - } - } - - size_t done_size = 0; - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - GGML_ASSERT(cur); // unused tensors should have been caught by load_data already - - if (progress_callback) { - progress_callback((float) done_size / size_data, progress_callback_user_data); - } - - // allocate temp buffer if not using mmap - if (!use_mmap && cur->data == NULL) { - GGML_ASSERT(cur->backend != GGML_BACKEND_CPU); - #ifdef GGML_USE_CPU_HBM - cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur)); - #else - cur->data = (uint8_t*)malloc(ggml_nbytes(cur)); - #endif - } - - load_data_for(cur); - - switch (cur->backend) { - case GGML_BACKEND_CPU: - if (use_mmap && lmlock) { - size_lock += ggml_nbytes(cur); - lmlock->grow_to(size_lock); - } - break; + void llama_model_loader::load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + size_t size_data = 0; + size_t size_lock = 0; + size_t size_pref = 0; 
// prefetch + + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); + size_data += ggml_nbytes(cur); + if (cur->backend == GGML_BACKEND_CPU) { + size_pref += ggml_nbytes(cur); + } + } + + if (use_mmap) { + mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa())); + if (lmlock) { + lmlock->init(mapping->addr); + } + } + + size_t done_size = 0; + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); + GGML_ASSERT(cur); // unused tensors should have been caught by load_data already + + if (progress_callback) { + progress_callback((float) done_size / size_data, progress_callback_user_data); + } + + // allocate temp buffer if not using mmap + if (!use_mmap && cur->data == NULL) { + GGML_ASSERT(cur->backend != GGML_BACKEND_CPU); + #ifdef GGML_USE_CPU_HBM + cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur)); + #else + cur->data = (uint8_t*)malloc(ggml_nbytes(cur)); + #endif + } + + load_data_for(cur); + + switch (cur->backend) { + case GGML_BACKEND_CPU: + if (use_mmap && lmlock) { + size_lock += ggml_nbytes(cur); + lmlock->grow_to(size_lock); + } + break; #ifdef GGML_USE_CUBLAS - case GGML_BACKEND_GPU: - case GGML_BACKEND_GPU_SPLIT: - // old code: - //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); - - // TODO: test if this works !! - ggml_cuda_transform_tensor(cur->data, cur); - if (!use_mmap) { - free(cur->data); - } - break; + case GGML_BACKEND_GPU: + + case GGML_BACKEND_GPU_SPLIT: + // old code: + //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); + + // TODO: test if this works !! + ggml_cuda_transform_tensor(cur->data, cur); + if (!use_mmap) { + free(cur->data); + } + break; #elif defined(GGML_USE_CLBLAST) - case GGML_BACKEND_GPU: - ggml_cl_transform_tensor(cur->data, cur); - if (!use_mmap) { - free(cur->data); - } - break; + case GGML_BACKEND_GPU: + ggml_cl_transform_tensor(cur->data, cur); + if (!use_mmap) { + free(cur->data); + } + break; #endif - default: - continue; - } + default: + continue; + } - done_size += ggml_nbytes(cur); - } + done_size += ggml_nbytes(cur); + } } -}; + //}; // // load LLaMA models @@ -2099,69 +1588,69 @@ struct llama_model_loader { static std::string llama_model_arch_name(llm_arch arch) { auto it = LLM_ARCH_NAMES.find(arch); if (it == LLM_ARCH_NAMES.end()) { - return "unknown"; + return "unknown"; } return it->second; } static std::string llama_model_ftype_name(llama_ftype ftype) { if (ftype & LLAMA_FTYPE_GUESSED) { - return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; + return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; } switch (ftype) { - case LLAMA_FTYPE_ALL_F32: return "all F32"; - case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; - case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; - case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; - case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: - return "mostly Q4_1, some F16"; - case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; - case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; - case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; - - // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; - case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; - case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly 
Q4_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; - - default: return "unknown, may not work"; + case LLAMA_FTYPE_ALL_F32: return "all F32"; + case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; + case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; + case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; + case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: + return "mostly Q4_1, some F16"; + case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; + case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; + case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; + + // K-quants + case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; + + default: return "unknown, may not work"; } } static const char * llama_model_type_name(e_model type) { switch (type) { - case MODEL_1B: return "1B"; - case MODEL_3B: return "3B"; - case MODEL_7B: return "7B"; - case MODEL_8B: return "8B"; - case MODEL_13B: return "13B"; - case MODEL_15B: return "15B"; - case MODEL_30B: return "30B"; - case MODEL_34B: return "34B"; - case MODEL_40B: return "40B"; - case MODEL_65B: return "65B"; - case MODEL_70B: return "70B"; - default: return "?B"; + case MODEL_1B: return "1B"; + case MODEL_3B: return "3B"; + case MODEL_7B: return "7B"; + case MODEL_8B: return "8B"; + case MODEL_13B: return "13B"; + case MODEL_15B: return "15B"; + case MODEL_30B: return "30B"; + case MODEL_34B: return "34B"; + case MODEL_40B: return "40B"; + case MODEL_65B: return "65B"; + case MODEL_70B: return "70B"; + default: return "?B"; } } static void llm_load_arch(llama_model_loader & ml, llama_model & model) { model.arch = ml.get_arch(); if (model.arch == LLM_ARCH_UNKNOWN) { - throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'"); + throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'"); } } static void llm_load_hparams( - llama_model_loader & ml, - llama_model & model) { + llama_model_loader & ml, + llama_model & model) { struct gguf_context * ctx = ml.ctx_gguf; const auto kv = LLM_KV(model.arch); @@ -2170,13 +1659,13 @@ static void llm_load_hparams( // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { - enum gguf_type type = gguf_get_kv_type(ctx, i); - if (type == GGUF_TYPE_ARRAY) { - continue; - } - const char * name = gguf_get_key(ctx, i); - const std::string value = gguf_kv_to_str(ctx, i); - model.gguf_kv.emplace(name, value); + enum gguf_type type = gguf_get_kv_type(ctx, i); + if (type == GGUF_TYPE_ARRAY) { + continue; + } + const char * name = gguf_get_key(ctx, i); + const std::string value = gguf_kv_to_str(ctx, i); + model.gguf_kv.emplace(name, value); } // get general kv @@ -2196,11 +1685,11 @@ static void llm_load_hparams( hparams.rope_finetuned = false; GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false, - kv(LLM_KV_ROPE_SCALING_FINETUNED)); + 
kv(LLM_KV_ROPE_SCALING_FINETUNED)); hparams.n_yarn_orig_ctx = hparams.n_ctx_train; GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, - kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); + kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; @@ -2215,125 +1704,125 @@ static void llm_load_hparams( float ropescale = 0.0f; GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); if (ropescale == 0.0f) { // try the old key name - GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); } hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; // sanity check for n_rot (optional) { - hparams.n_rot = hparams.n_embd / hparams.n_head; + hparams.n_rot = hparams.n_embd / hparams.n_head; - GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); + GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); - if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) { - if (hparams.n_rot != hparams.n_embd / hparams.n_head) { - throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head)); - } - } - // gpt-neox n_rot = rotary_pct * (n_embd / n_head) - // gpt-j n_rot = rotary_dim + if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) { + if (hparams.n_rot != hparams.n_embd / hparams.n_head) { + throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head)); + } + } + // gpt-neox n_rot = rotary_pct * (n_embd / n_head) + // gpt-j n_rot = rotary_dim } // arch-specific KVs switch (model.arch) { - case LLM_ARCH_LLAMA: - { - GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - - switch (hparams.n_layer) { - case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - case 48: model.type = e_model::MODEL_34B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = hparams.n_head == hparams.n_head_kv ? 
e_model::MODEL_65B : e_model::MODEL_70B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; - case LLM_ARCH_FALCON: - { - GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); - - switch (hparams.n_layer) { - case 32: model.type = e_model::MODEL_7B; break; - case 60: model.type = e_model::MODEL_40B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; - case LLM_ARCH_BAICHUAN: - { - GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - switch (hparams.n_layer) { - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; - case LLM_ARCH_STARCODER: - { - GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); - switch (hparams.n_layer) { - case 24: model.type = e_model::MODEL_1B; break; - case 36: model.type = e_model::MODEL_3B; break; - case 42: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_15B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; - case LLM_ARCH_PERSIMMON: - { - GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); - switch (hparams.n_layer) { - case 36: model.type = e_model::MODEL_8B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; - case LLM_ARCH_REFACT: - { - GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - switch (hparams.n_layer) { - case 32: model.type = e_model::MODEL_1B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; - case LLM_ARCH_BLOOM: - { - GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); - - switch (hparams.n_layer) { - case 24: model.type = e_model::MODEL_1B; break; - case 30: - switch (hparams.n_embd) { - case 2560: model.type = e_model::MODEL_3B; break; - case 4096: model.type = e_model::MODEL_7B; break; - } break; - } - } break; - case LLM_ARCH_MPT: - { - hparams.f_clamp_kqv = 0.0f; - - GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); - GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV)); - GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS)); - - switch (hparams.n_layer) { - case 32: model.type = e_model::MODEL_7B; break; - case 48: model.type = e_model::MODEL_30B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; - case LLM_ARCH_STABLELM: - { - GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); - - switch (hparams.n_layer) { - case 32: model.type = e_model::MODEL_3B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; - - default: (void)0; + case LLM_ARCH_LLAMA: + { + GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + + switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_3B; break; + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 48: model.type = e_model::MODEL_34B; 
break; + case 60: model.type = e_model::MODEL_30B; break; + case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_FALCON: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 60: model.type = e_model::MODEL_40B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_BAICHUAN: + { + GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_STARCODER: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; + case 36: model.type = e_model::MODEL_3B; break; + case 42: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_15B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_PERSIMMON: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + switch (hparams.n_layer) { + case 36: model.type = e_model::MODEL_8B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_REFACT: + { + GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_1B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_BLOOM: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; + case 30: + switch (hparams.n_embd) { + case 2560: model.type = e_model::MODEL_3B; break; + case 4096: model.type = e_model::MODEL_7B; break; + } break; + } + } break; + case LLM_ARCH_MPT: + { + hparams.f_clamp_kqv = 0.0f; + + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV)); + GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS)); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 48: model.type = e_model::MODEL_30B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_STABLELM: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + + default: (void)0; } model.ftype = ml.ftype; @@ -2344,8 +1833,8 @@ static std::vector llama_tokenize_internal(const llama_vocab & static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); static void llm_load_vocab( - llama_model_loader & ml, - llama_model & 
model) { + llama_model_loader & ml, + llama_model & model) { auto & vocab = model.vocab; struct gguf_context * ctx = ml.ctx_gguf; @@ -2354,76 +1843,76 @@ static void llm_load_vocab( const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); if (token_idx == -1) { - throw std::runtime_error("cannot find tokenizer vocab in model file\n"); + throw std::runtime_error("cannot find tokenizer vocab in model file\n"); } const float * scores = nullptr; const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); if (score_idx != -1) { - scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + scores = (const float * ) gguf_get_arr_data(ctx, score_idx); } const int * toktypes = nullptr; const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); if (toktype_idx != -1) { - toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); } // determine vocab type { - std::string tokenizer_name; - - GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL)); - - if (tokenizer_name == "llama") { - vocab.type = LLAMA_VOCAB_TYPE_SPM; - - // default special tokens - vocab.special_bos_id = 1; - vocab.special_eos_id = 2; - vocab.special_unk_id = 0; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - } else if (tokenizer_name == "gpt2") { - vocab.type = LLAMA_VOCAB_TYPE_BPE; - - // read bpe merges and populate bpe ranks - const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); - if (merges_keyidx == -1) { - throw std::runtime_error("cannot find tokenizer merges in model file\n"); - } + std::string tokenizer_name; - const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); + GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL)); - for (int i = 0; i < n_merges; i++) { - const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); - GGML_ASSERT(codepoints_from_utf8(word).size() > 0); + if (tokenizer_name == "llama") { + vocab.type = LLAMA_VOCAB_TYPE_SPM; - std::string first; - std::string second; + // default special tokens + vocab.special_bos_id = 1; + vocab.special_eos_id = 2; + vocab.special_unk_id = 0; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + } else if (tokenizer_name == "gpt2") { + vocab.type = LLAMA_VOCAB_TYPE_BPE; - const size_t pos = word.find(' ', 1); + // read bpe merges and populate bpe ranks + const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); + if (merges_keyidx == -1) { + throw std::runtime_error("cannot find tokenizer merges in model file\n"); + } - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); - } + const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); - vocab.bpe_ranks.emplace(std::make_pair(first, second), i); - } + for (int i = 0; i < n_merges; i++) { + const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); + GGML_ASSERT(codepoints_from_utf8(word).size() > 0); - // default special tokens - vocab.special_bos_id = 11; - vocab.special_eos_id = 11; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - } else { - LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); - LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); + std::string first; + std::string second; - vocab.type = LLAMA_VOCAB_TYPE_SPM; - } + const size_t pos = word.find(' ', 1); + + if (pos 
!= std::string::npos) { + first = word.substr(0, pos); + second = word.substr(pos + 1); + } + + vocab.bpe_ranks.emplace(std::make_pair(first, second), i); + } + + // default special tokens + vocab.special_bos_id = 11; + vocab.special_eos_id = 11; + vocab.special_unk_id = -1; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + } else { + LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); + LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); + + vocab.type = LLAMA_VOCAB_TYPE_SPM; + } } const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); @@ -2431,162 +1920,162 @@ static void llm_load_vocab( vocab.id_to_token.resize(n_vocab); for (uint32_t i = 0; i < n_vocab; i++) { - std::string word = gguf_get_arr_str(ctx, token_idx, i); - GGML_ASSERT(codepoints_from_utf8(word).size() > 0); + std::string word = gguf_get_arr_str(ctx, token_idx, i); + GGML_ASSERT(codepoints_from_utf8(word).size() > 0); - vocab.token_to_id[word] = i; + vocab.token_to_id[word] = i; - auto & token_data = vocab.id_to_token[i]; - token_data.text = std::move(word); - token_data.score = scores ? scores[i] : 0.0f; - token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL; + auto & token_data = vocab.id_to_token[i]; + token_data.text = std::move(word); + token_data.score = scores ? scores[i] : 0.0f; + token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL; } GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size()); // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); } else { - const std::vector ids = llama_tokenize_internal(vocab, "\u010A", false); - GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); - vocab.linefeed_id = ids[0]; + const std::vector ids = llama_tokenize_internal(vocab, "\u010A", false); + GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); + vocab.linefeed_id = ids[0]; } // special tokens { - const std::vector> special_token_types = { - { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, - { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, - { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, - { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, - { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, - }; - for (const auto & it : special_token_types) { - const std::string & key = kv(std::get<0>(it)); - int32_t & id = std::get<1>(it), old_id = id; - - GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key); - // Must be >= -1 and < vocab size. Since the key is unsigned, -1 - // can only come from the default value, so there's no point in - // validating that. - if (size_t(id + 1) > vocab.id_to_token.size()) { - LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n", - __func__, key.c_str(), id, old_id); - id = old_id; - } - - } - - // Handle add_bos_token and add_eos_token - std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS); - int kid = gguf_find_key(ctx, key.c_str()); - enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid); - vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? 
gguf_get_val_bool(ctx, kid) : -1; - if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) { - LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str()); - } - key = kv(LLM_KV_TOKENIZER_ADD_EOS); - kid = gguf_find_key(ctx, key.c_str()); - ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid); - vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1; - if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) { - LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str()); - } + const std::vector> special_token_types = { + { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, + { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, + { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, + { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, + { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, + }; + for (const auto & it : special_token_types) { + const std::string & key = kv(std::get<0>(it)); + int32_t & id = std::get<1>(it), old_id = id; + + GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key); + // Must be >= -1 and < vocab size. Since the key is unsigned, -1 + // can only come from the default value, so there's no point in + // validating that. + if (size_t(id + 1) > vocab.id_to_token.size()) { + LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n", + __func__, key.c_str(), id, old_id); + id = old_id; + } + + } + + // Handle add_bos_token and add_eos_token + std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS); + int kid = gguf_find_key(ctx, key.c_str()); + enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid); + vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1; + if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) { + LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str()); + } + key = kv(LLM_KV_TOKENIZER_ADD_EOS); + kid = gguf_find_key(ctx, key.c_str()); + ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid); + vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1; + if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) { + LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str()); + } } // build special tokens cache { - // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type, - // and will always be correctly labeled in 'added_tokens.json' etc. - // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed - // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer - // are special tokens. - // From testing, this appears to corelate 1:1 with special tokens. - // - - // Counting special tokens and verifying in only one direction - // is sufficient to detect difference in those two sets. 
- // - uint32_t special_tokens_count_by_type = 0; - uint32_t special_tokens_count_from_verification = 0; - - bool special_tokens_definition_mismatch = false; - - for (const auto & t : vocab.token_to_id) { - const auto & token = t.first; - const auto & id = t.second; - - // Count all non-normal tokens in the vocab while iterating - if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) { - special_tokens_count_by_type++; - } - - // Skip single character tokens - if (token.length() > 1) { - bool is_tokenizable = false; - - // Split token string representation in two, in all possible ways - // and check if both halves can be matched to a valid token - for (unsigned i = 1; i < token.length();) { - const auto left = token.substr(0, i); - const auto right = token.substr(i); - - // check if we didnt partition in the middle of a utf sequence - auto utf = utf8_len(left.at(left.length() - 1)); - - if (utf == 1) { - if (vocab.token_to_id.find(left) != vocab.token_to_id.end() && - vocab.token_to_id.find(right) != vocab.token_to_id.end() ) { - is_tokenizable = true; - break; - } - i++; - } else { - // skip over the rest of multibyte utf sequence - i += utf - 1; - } - } - - if (!is_tokenizable) { - // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1 - // it's faster to re-filter them here, since there are way less candidates now - - // Calculate a total "utf" length of a token string representation - size_t utf8_str_len = 0; - for (unsigned i = 0; i < token.length();) { - utf8_str_len++; - i += utf8_len(token.at(i)); - } - - // And skip the ones which are one character - if (utf8_str_len > 1) { - // At this point what we have left are special tokens only - vocab.special_tokens_cache[token] = id; - - // Count manually found special tokens - special_tokens_count_from_verification++; - - // If this manually found special token is not marked as such, flag a mismatch - if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) { - special_tokens_definition_mismatch = true; - } - } - } - } - } - - if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) { - LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n", - __func__, - special_tokens_count_from_verification, vocab.id_to_token.size(), - special_tokens_count_by_type, vocab.id_to_token.size() - ); - } else { - LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n", - __func__, - special_tokens_count_from_verification, vocab.id_to_token.size() - ); - } + // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type, + // and will always be correctly labeled in 'added_tokens.json' etc. + // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed + // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer + // are special tokens. + // From testing, this appears to corelate 1:1 with special tokens. + // + + // Counting special tokens and verifying in only one direction + // is sufficient to detect difference in those two sets. 
+ // + uint32_t special_tokens_count_by_type = 0; + uint32_t special_tokens_count_from_verification = 0; + + bool special_tokens_definition_mismatch = false; + + for (const auto & t : vocab.token_to_id) { + const auto & token = t.first; + const auto & id = t.second; + + // Count all non-normal tokens in the vocab while iterating + if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) { + special_tokens_count_by_type++; + } + + // Skip single character tokens + if (token.length() > 1) { + bool is_tokenizable = false; + + // Split token string representation in two, in all possible ways + // and check if both halves can be matched to a valid token + for (unsigned i = 1; i < token.length();) { + const auto left = token.substr(0, i); + const auto right = token.substr(i); + + // check if we didnt partition in the middle of a utf sequence + auto utf = utf8_len(left.at(left.length() - 1)); + + if (utf == 1) { + if (vocab.token_to_id.find(left) != vocab.token_to_id.end() && + vocab.token_to_id.find(right) != vocab.token_to_id.end() ) { + is_tokenizable = true; + break; + } + i++; + } else { + // skip over the rest of multibyte utf sequence + i += utf - 1; + } + } + + if (!is_tokenizable) { + // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1 + // it's faster to re-filter them here, since there are way less candidates now + + // Calculate a total "utf" length of a token string representation + size_t utf8_str_len = 0; + for (unsigned i = 0; i < token.length();) { + utf8_str_len++; + i += utf8_len(token.at(i)); + } + + // And skip the ones which are one character + if (utf8_str_len > 1) { + // At this point what we have left are special tokens only + vocab.special_tokens_cache[token] = id; + + // Count manually found special tokens + special_tokens_count_from_verification++; + + // If this manually found special token is not marked as such, flag a mismatch + if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) { + special_tokens_definition_mismatch = true; + } + } + } + } + } + + if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) { + LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n", + __func__, + special_tokens_count_from_verification, vocab.id_to_token.size(), + special_tokens_count_by_type, vocab.id_to_token.size() + ); + } else { + LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n", + __func__, + special_tokens_count_from_verification, vocab.id_to_token.size() + ); + } } } @@ -2623,9 +2112,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); if (ml.n_bytes < GiB) { - LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } else { - LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } // general kv @@ -2641,717 +2130,743 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { } static void 
llm_load_tensors( - llama_model_loader & ml, - llama_model & model, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - bool use_mlock, - llama_progress_callback progress_callback, - void * progress_callback_user_data) { + llama_model_loader & ml, + llama_model & model, + int n_gpu_layers, + int main_gpu, + const float * tensor_split, + bool use_mlock, + llama_progress_callback progress_callback, + void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); auto & ctx = model.ctx; - auto & hparams = model.hparams; - - model.n_gpu_layers = n_gpu_layers; - - size_t ctx_size; - size_t mmapped_size; - - ml.calc_sizes(ctx_size, mmapped_size); - - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); - - // create the ggml context - { - model.buf.resize(ctx_size); - if (use_mlock) { - model.mlock_buf.init (model.buf.data); - model.mlock_buf.grow_to(model.buf.size); - } - - struct ggml_init_params params = { - /*.mem_size =*/ model.buf.size, - /*.mem_buffer =*/ model.buf.data, - /*.no_alloc =*/ ml.use_mmap, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - throw std::runtime_error(format("ggml_init() failed")); - } - } - - (void) main_gpu; - - enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; - enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; - -#ifdef GGML_USE_CUBLAS - if (ggml_cublas_loaded()) { - LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); - - llama_backend_offload = GGML_BACKEND_GPU; - llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT; - } -#elif defined(GGML_USE_CLBLAST) - LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); - llama_backend_offload = GGML_BACKEND_GPU; - llama_backend_offload_split = GGML_BACKEND_GPU; -#endif - - // prepare memory for the weights - size_t vram_weights = 0; - { - const int64_t n_embd = hparams.n_embd; - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const int64_t n_layer = hparams.n_layer; - const int64_t n_vocab = hparams.n_vocab; - - const auto tn = LLM_TN(model.arch); - switch (model.arch) { - case LLM_ARCH_LLAMA: - case LLM_ARCH_REFACT: - { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - - // output - { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } - - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT - - auto & layer = model.layers[i]; - - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } - } - } break; - case LLM_ARCH_BAICHUAN: - { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } - - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT - - auto & layer = model.layers[i]; - - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } - } - } break; - case LLM_ARCH_FALCON: - { - // TODO: CPU-only for now - - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - - // output - { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU -#ifndef _WIN32 - backend_norm = llama_backend_offload; -#else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : llama_backend_offload; -#endif // _WIN32 - - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } + auto & hparams = model.hparams; - const uint32_t n_ff = hparams.n_ff; + model.n_gpu_layers = n_gpu_layers; - const int i_gpu_start = n_layer - n_gpu_layers; + size_t ctx_size; + size_t mmapped_size; - model.layers.resize(n_layer); + ml.calc_sizes(ctx_size, mmapped_size); - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); - auto & layer = model.layers[i]; + // create the ggml context + { + model.buf.resize(ctx_size); + if (use_mlock) { + model.mlock_buf.init (model.buf.data); + model.mlock_buf.grow_to(model.buf.size); + } - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + struct ggml_init_params params( + model.buf.size, + model.buf.data, + - if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { - layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); - layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); + ml.use_mmap ); - if (backend == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(layer.attn_norm_2); - vram_weights += ggml_nbytes(layer.attn_norm_2_b); - } - } + model.ctx = ggml_init(params); + if (!model.ctx) { + throw std::runtime_error(format("ggml_init() failed")); + } + } - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + (void) main_gpu; - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; + enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } - } - } break; - case LLM_ARCH_STARCODER: - { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, 
"weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); +#ifdef GGML_USE_CUBLAS + if (ggml_cublas_loaded()) { + LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); - // output - { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; + llama_backend_offload = GGML_BACKEND_GPU; + llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT; + } +#elif defined(GGML_USE_CLBLAST) + LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); + llama_backend_offload = GGML_BACKEND_GPU; + llama_backend_offload_split = GGML_BACKEND_GPU; +#endif - if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU + // prepare memory for the weights + size_t vram_weights = 0; + { + const int64_t n_embd = hparams.n_embd; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_layer = hparams.n_layer; + const int64_t n_vocab = hparams.n_vocab; + + const auto tn = LLM_TN(model.arch); + switch (model.arch) { + case LLM_ARCH_LLAMA: + case LLM_ARCH_REFACT: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = llama_backend_offload; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } - - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT - - auto & layer = model.layers[i]; - - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); - - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + backend_output = llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights00 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output_norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights01 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights03 '%ld'\n", vram_weights); + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + } + } + } break; + case LLM_ARCH_BAICHUAN: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = llama_backend_offload; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; +#endif // _WIN32 - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + - ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + - ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) + - ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b); - } - } - } break; - case LLM_ARCH_PERSIMMON: - { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + backend_output = llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights04 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output_norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights05 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights06 '%ld'\n", vram_weights); + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + } + } + } break; + case LLM_ARCH_FALCON: + { + // TODO: CPU-only for now + + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = llama_backend_offload; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; +#endif // _WIN32 - { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; + backend_output = llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights07 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output_norm); + fprintf(stderr, "vram_weights08 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights09 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { + layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); + layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); + + if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights10 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(layer.attn_norm_2); + fprintf(stderr, "vram_weights11 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(layer.attn_norm_2_b); + } + } + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights12 '%ld'\n", vram_weights); + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + } + } + } break; + case LLM_ARCH_STARCODER: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = llama_backend_offload; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; +#endif // _WIN32 - if (n_gpu_layers > int(n_layer)) { + backend_output = llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights13 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights14 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + + if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights15 '%ld'\n", vram_weights); + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b); + } + } + } break; + case LLM_ARCH_PERSIMMON: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { #ifdef GGML_USE_CUBLAS - if (n_gpu_layers > int(n_layer + 1)) { - LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n", - __func__, n_layer + 1); - throw std::runtime_error("Persimmon CUDA offload failed"); - } + if (n_gpu_layers > int(n_layer + 1)) { + LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n", + __func__, n_layer + 1); + throw std::runtime_error("Persimmon CUDA offload failed"); + } #endif - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = llama_backend_offload; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } - - const uint32_t n_ff = hparams.n_ff; - const int i_gpu_start = n_layer - n_gpu_layers; - model.layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; - auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); - layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); - } - } break; - case LLM_ARCH_BLOOM: - { - // TODO: CPU-only for now - - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); - - // output - { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU + backend_output = 
llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights16 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output_norm); + fprintf(stderr, "vram_weights17 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights18 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + const int i_gpu_start = n_layer - n_gpu_layers; + model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; + auto & layer = model.layers[i]; + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); + layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); + layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); + layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + } + } break; + case LLM_ARCH_BLOOM: + { + // TODO: CPU-only for now + + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); + model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data 
copying + // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = llama_backend_offload; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } - - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT - - auto & layer = model.layers[i]; - - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); - - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + - ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + - ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); - } - } - } break; - case LLM_ARCH_MPT: - { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - - // output - { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but 
keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU + backend_output = llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights19 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output_norm); + fprintf(stderr, "vram_weights20 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights21 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + + if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights22 '%ld'\n", vram_weights); + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); + } + } + } break; + case LLM_ARCH_MPT: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + 
// on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = llama_backend_offload; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } - - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT - - auto & layer = model.layers[i]; - - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + - ggml_nbytes(layer.wqkv) + - ggml_nbytes(layer.wo) + - ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.ffn_down) + - ggml_nbytes(layer.ffn_up); - } - } - } break; - case LLM_ARCH_STABLELM: - { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - - // output - { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU + backend_output = llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights23 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output_norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights24 '%ld'\n", vram_weights); + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i 
< n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights25 '%ld'\n", vram_weights); + vram_weights += + ggml_nbytes(layer.attn_norm) + + ggml_nbytes(layer.wqkv) + + ggml_nbytes(layer.wo) + + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_down) + + ggml_nbytes(layer.ffn_up); + } + } + } break; + case LLM_ARCH_STABLELM: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = llama_backend_offload; + backend_norm = llama_backend_offload; #else - backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : llama_backend_offload; #endif // _WIN32 - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } + backend_output = llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } - const uint32_t n_ff = hparams.n_ff; + const uint32_t n_ff = hparams.n_ff; - const int i_gpu_start = n_layer - n_gpu_layers; + const int i_gpu_start = n_layer - n_gpu_layers; - model.layers.resize(n_layer); + model.layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - /* - llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ] - */ - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (uint32_t i = 0; i < n_layer; ++i) { + /* + llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ] + */ + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT - auto & layer = model.layers[i]; + auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } - } - } break; + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + } + } + } break; - default: - throw std::runtime_error("unknown architecture"); - } + default: + throw std::runtime_error("unknown architecture"); + } } ml.done_getting_tensors(); // print memory requirements { - // this is the total memory required to run the inference - size_t mem_required = - ctx_size + - mmapped_size - vram_weights; // weights in VRAM not in memory + // this is the total memory required to run the inference + size_t mem_required = + ctx_size + + 
mmapped_size - vram_weights; // weights in VRAM not in memory - LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0); #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); - if (n_gpu_layers > (int) hparams.n_layer) { - LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); - } + LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); + if (n_gpu_layers > (int) hparams.n_layer) { + LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); + } #ifdef GGML_USE_CUBLAS - const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = hparams.n_layer + 3; + const int max_backend_supported_layers = hparams.n_layer + 3; + const int max_offloadable_layers = hparams.n_layer + 3; #elif GGML_USE_CLBLAST - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; #endif // GGML_USE_CUBLAS - LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); + LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); #else - (void) n_gpu_layers; + (void) n_gpu_layers; #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) } // populate `tensors_by_name` for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i)); - model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); + struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i)); + model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } (void) tensor_split; #ifdef GGML_USE_CUBLAS { - ggml_cuda_set_tensor_split(tensor_split); + ggml_cuda_set_tensor_split(tensor_split); } #endif ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? 
&model.mlock_mmap : NULL); if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); + progress_callback(1.0f, progress_callback_user_data); } model.mapping = std::move(ml.mapping); @@ -3363,32 +2878,32 @@ static void llm_load_tensors( static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { - llama_model_loader ml(fname, params.use_mmap); + llama_model_loader ml(fname, params.use_mmap); - model.hparams.vocab_only = params.vocab_only; + model.hparams.vocab_only = params.vocab_only; - llm_load_arch (ml, model); - llm_load_hparams(ml, model); - llm_load_vocab (ml, model); + llm_load_arch (ml, model); + llm_load_hparams(ml, model); + llm_load_vocab (ml, model); - llm_load_print_meta(ml, model); + llm_load_print_meta(ml, model); - if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { - throw std::runtime_error("vocab size mismatch"); - } + if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { + throw std::runtime_error("vocab size mismatch"); + } - if (params.vocab_only) { - LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return true; - } + if (params.vocab_only) { + LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); + return true; + } - llm_load_tensors( - ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, - params.progress_callback, params.progress_callback_user_data - ); + llm_load_tensors( + ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + params.progress_callback, params.progress_callback_user_data + ); } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return false; + LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); + return false; } return true; @@ -3398,52 +2913,28 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con // llm_build // -using llm_build_cb = std::function; - -enum llm_rope_type { - LLM_ROPE, - LLM_ROPE_NEOX, - LLM_ROPE_GLM, -}; - -enum llm_ffn_op_type { - LLM_FFN_SILU, - LLM_FFN_GELU, - LLM_FFN_RELU, - LLM_FFN_RELU_SQR, -}; - -enum llm_ffn_gate_type { - LLM_FFN_SEQ, - LLM_FFN_PAR, // ffn_gate is parallel to ffn_up -}; - -enum llm_norm_type { - LLM_NORM, - LLM_NORM_RMS, -}; static struct ggml_tensor * llm_build_inp_embd( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_batch & batch, - struct ggml_tensor * tok_embd, - const llm_build_cb & cb) { + struct ggml_context * ctx, + const llama_hparams & hparams, + const llama_batch & batch, + struct ggml_tensor * tok_embd, + const llm_build_cb & cb) { const int64_t n_embd = hparams.n_embd; struct ggml_tensor * inpL; if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); - cb(inp_tokens, "inp_tokens", -1); + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + cb(inp_tokens, "inp_tokens", -1); - inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); + inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); } else { #ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); + GGML_ASSERT(false && "not implemented"); #endif - inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); + inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); } return inpL; @@ -3457,11 +2948,11 @@ static void llm_build_k_shift( const llama_cparams & cparams, const llama_kv_cache & kv, struct ggml_cgraph * graph, - 
llm_rope_type type, - int64_t n_ctx, - int64_t n_rot, - float freq_base, - float freq_scale, + llm_rope_type type, + int64_t n_ctx, + int64_t n_rot, + float freq_base, + float freq_scale, const llm_build_cb & cb) { const int64_t n_layer = hparams.n_layer; const int64_t n_head_kv = hparams.n_head_kv; @@ -3481,39 +2972,39 @@ static void llm_build_k_shift( int rope_type = 0; switch (type) { - case LLM_ROPE: rope_type = 0; break; - case LLM_ROPE_NEOX: rope_type = 2; break; - case LLM_ROPE_GLM: rope_type = 4; break; + case LLM_ROPE: rope_type = 0; break; + case LLM_ROPE_NEOX: rope_type = 2; break; + case LLM_ROPE_GLM: rope_type = 4; break; } for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions - ggml_rope_custom_inplace(ctx, - ggml_view_3d(ctx, kv.k, - n_rot, n_head_kv, n_ctx, - ggml_element_size(kv.k)*n_embd_head, - ggml_element_size(kv.k)*n_embd_gqa, - ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), - K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(graph, tmp); + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx, + ggml_view_3d(ctx, kv.k, + n_rot, n_head_kv, n_ctx, + ggml_element_size(kv.k)*n_embd_head, + ggml_element_size(kv.k)*n_embd_gqa, + ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), + K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(graph, tmp); } } static void llm_build_kv_store( - struct ggml_context * ctx, - const llama_hparams & hparams, + struct ggml_context * ctx, + const llama_hparams & hparams, const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int64_t n_ctx, - int32_t n_tokens, - int32_t kv_head, - const llm_build_cb & cb, - int64_t il) { + struct ggml_cgraph * graph, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int64_t n_ctx, + int32_t n_tokens, + int32_t kv_head, + const llm_build_cb & cb, + int64_t il) { const int64_t n_embd_gqa = hparams.n_embd_gqa(); // compute the transposed [n_tokens, n_embd] V matrix @@ -3522,12 +3013,12 @@ static void llm_build_kv_store( cb(v_cur_t, "v_cur_t", il); struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa, - (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head)); cb(k_cache_view, "k_cache_view", il); struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv.v), - (il*n_ctx)*ggml_element_size(kv.v)*n_embd_gqa + kv_head*ggml_element_size(kv.v)); + ( n_ctx)*ggml_element_size(kv.v), + (il*n_ctx)*ggml_element_size(kv.v)*n_embd_gqa + kv_head*ggml_element_size(kv.v)); cb(v_cache_view, "v_cache_view", il); // important: storing RoPE-ed version of K in the KV cache! 
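// A minimal illustrative sketch (not from the llama.cpp sources): the byte offsets behind the
// k_cache_view / v_cache_view calls in llm_build_kv_store above, written out as plain arithmetic.
// It assumes the layout those views imply: K is stored per layer as a contiguous
// [n_embd_gqa, n_ctx] block, V is stored transposed as [n_ctx, n_embd_gqa];
// elt_size stands for ggml_element_size(kv.k) resp. ggml_element_size(kv.v).
#include <cstddef>
#include <cstdint>

// first byte written into the K cache for this batch: skip il full layers
// (n_ctx rows of n_embd_gqa elements), then kv_head rows within the layer
static size_t k_cache_offset(size_t elt_size, int64_t n_embd_gqa, int64_t n_ctx,
                             int64_t il, int32_t kv_head) {
    return elt_size * n_embd_gqa * (il * n_ctx + kv_head);
}

// first byte of the V view: skip il transposed layers, then kv_head columns;
// each row of the transposed layout has stride n_ctx * elt_size, matching the
// nb1 argument passed to ggml_view_2d above
static size_t v_cache_offset(size_t elt_size, int64_t n_embd_gqa, int64_t n_ctx,
                             int64_t il, int32_t kv_head) {
    return elt_size * n_embd_gqa * (il * n_ctx) + elt_size * kv_head;
}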
@@ -3536,48 +3027,48 @@ static void llm_build_kv_store( } static struct ggml_tensor * llm_build_norm( - struct ggml_context * ctx, - struct ggml_tensor * cur, - const llama_hparams & hparams, - struct ggml_tensor * mw, - struct ggml_tensor * mb, - llm_norm_type type, - const llm_build_cb & cb, - int il) { + struct ggml_context * ctx, + struct ggml_tensor * cur, + const llama_hparams & hparams, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + const llm_build_cb & cb, + int il) { switch (type) { - case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break; } if (mw || mb) { - cb(cur, "norm", il); + cb(cur, "norm", il); } if (mw) { - cur = ggml_mul(ctx, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } + cur = ggml_mul(ctx, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } } if (mb) { - cur = ggml_add(ctx, cur, mb); + cur = ggml_add(ctx, cur, mb); } return cur; } static struct ggml_tensor * llm_build_ffn( - struct ggml_context * ctx, - struct ggml_tensor * cur, - struct ggml_tensor * up, - struct ggml_tensor * up_b, - struct ggml_tensor * gate, - struct ggml_tensor * gate_b, - struct ggml_tensor * down, - struct ggml_tensor * down_b, - llm_ffn_op_type type_op, - llm_ffn_gate_type type_gate, + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, const llm_build_cb & cb, int il) { struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur); @@ -3736,45 +3227,10 @@ static struct ggml_tensor * llm_build_kqv( return cur; } -struct llm_build_context { - const llama_model & model; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_batch & batch; - const llama_kv_cache & kv_self; - - const int64_t n_embd; - const int64_t n_layer; - const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) - const int64_t n_head; - const int64_t n_head_kv; - const int64_t n_embd_head; - const int64_t n_embd_gqa; - - const float freq_base; - const float freq_scale; - const float ext_factor; - const float attn_factor; - const float beta_fast; - const float beta_slow; - const float norm_eps; - const float norm_rms_eps; - - const int32_t n_tokens; - const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) - const int32_t kv_head; // index of where we store new KV data in the cache - const int32_t n_orig_ctx; - - const bool do_rope_shift; - - const llm_build_cb & cb; - - llama_buffer & buf_compute; - - struct ggml_context * ctx0 = nullptr; +// struct llm_build_context { // TODO: consider making the entire interface noexcept - llm_build_context( +llm_build_context::llm_build_context( llama_context & lctx, const llama_batch & batch, const llm_build_cb & cb, @@ -3811,24 +3267,27 @@ struct llm_build_context { // all initializations should be done in init() } - void init() { - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; +void llm_build_context::init() { + struct ggml_init_params params( + //.mem_size = + buf_compute.size, + //.mem_buffer = + buf_compute.data, + 
//.no_alloc = + true + ); ctx0 = ggml_init(params); } - void free() { + void llm_build_context::free() { if (ctx0) { ggml_free(ctx0); ctx0 = nullptr; } } - struct ggml_cgraph * build_llama() { + struct ggml_cgraph * llm_build_context::build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3940,7 +3399,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_baichuan() { +struct ggml_cgraph * llm_build_context::build_baichuan() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; @@ -4060,7 +3519,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_falcon() { +struct ggml_cgraph * llm_build_context::build_falcon() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; @@ -4182,7 +3641,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_starcoder() { +struct ggml_cgraph * llm_build_context::build_starcoder() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; @@ -4281,7 +3740,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_persimmon() { + struct ggml_cgraph * llm_build_context::build_persimmon() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_rot = n_embd_head / 2; @@ -4491,7 +3950,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_refact() { +struct ggml_cgraph * llm_build_context::build_refact() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; @@ -4582,7 +4041,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_bloom() { +struct ggml_cgraph * llm_build_context::build_bloom() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; @@ -4676,7 +4135,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_mpt() { +struct ggml_cgraph * llm_build_context::build_mpt() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_tensor * cur; @@ -4775,7 +4234,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_stablelm() { +struct ggml_cgraph * llm_build_context::build_stablelm() { struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; @@ -4887,27 +4346,18 @@ struct llm_build_context { return gf; } -}; + // // tensor offloading helpers // // TODO: will be removed with backend v2 -enum llm_offload_func_e { - OFFLOAD_FUNC_NOP, - OFFLOAD_FUNC, - OFFLOAD_FUNC_KQ, - OFFLOAD_FUNC_V, - OFFLOAD_FUNC_NR, - OFFLOAD_FUNC_EMB, - OFFLOAD_FUNC_OUT, -}; // TODO: will be removed with backend v2 -struct llm_offload_trie { - struct node { - ~node() { +//struct llm_offload_trie { +// struct node { +llm_offload_trie::node::~node() { for (int i = 0; i < 256; ++i) { if (children[i]) { delete children[i]; @@ -4915,28 +4365,28 @@ struct llm_offload_trie { } } - node * children[256] = { nullptr }; - llm_offload_func_e func = OFFLOAD_FUNC_NOP; - }; +// node * children[256] = { nullptr }; +// llm_offload_func_e func = OFFLOAD_FUNC_NOP; +// }; - llm_offload_trie() { +llm_offload_trie::llm_offload_trie() { root = new node; } - llm_offload_trie(const std::unordered_map & map) { - root = new node; - - for (const auto & kv : map) { - add(kv.first, kv.second); - } - } +llm_offload_trie::llm_offload_trie(const 
std::unordered_map & map) { + root = new node; + + for (const auto & kv : map) { + add(kv.first, kv.second); + } +} - ~llm_offload_trie() { - delete root; - } +llm_offload_trie::~llm_offload_trie() { + delete root; +} - void add(const char * name, llm_offload_func_e func) { - node * cur = root; +void llm_offload_trie::add(const char * name, llm_offload_func_e func) { + node * cur = root; for (int i = 0; ; ++i) { const uint8_t c = name[i]; @@ -4955,7 +4405,7 @@ struct llm_offload_trie { cur->func = func; } - llm_offload_func_e find(const char * name) const { +llm_offload_func_e llm_offload_trie::find(const char * name) const { const node * cur = root; for (int i = 0; ; ++i) { @@ -4975,8 +4425,8 @@ struct llm_offload_trie { return cur->func; } - node * root = nullptr; -}; +// node * root = nullptr; +//}; // TODO: will be removed with backend v2 static const std::unordered_map k_offload_map = { @@ -5588,8 +5038,8 @@ static int llama_decode_internal( // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} + //ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} // extract logits // TODO: do not compute and extract logits if only embeddings are needed @@ -5710,13 +5160,6 @@ static void llama_unescape_whitespace(std::string & word) { replace_all(word, "\xe2\x96\x81", " "); } -struct llm_symbol { - using index = int; - index prev; - index next; - const char * text; - size_t n; -}; static_assert(std::is_trivially_copyable::value, "llm_symbol is not trivially copyable"); @@ -5724,24 +5167,16 @@ static_assert(std::is_trivially_copyable::value, "llm_symbol is not // original implementation: // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 -struct llm_bigram_spm { - struct comparator { - bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) { - return (l.score < r.score) || (l.score == r.score && l.left > r.left); - } - }; - using queue_storage = std::vector; - using queue = std::priority_queue; - llm_symbol::index left; - llm_symbol::index right; - float score; - size_t size; -}; -struct llm_tokenizer_spm { - llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {} +bool llm_bigram_spm::comparator::operator()(llm_bigram_spm & l, llm_bigram_spm & r) { + return (l.score < r.score) || (l.score == r.score && l.left > r.left); +} + + +// struct llm_tokenizer_spm { +llm_tokenizer_spm::llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {} - void tokenize(const std::string & text, std::vector & output) { +void llm_tokenizer_spm::tokenize(const std::string & text, std::vector & output) { // split string into utf8 chars int index = 0; size_t offs = 0; @@ -5799,8 +5234,8 @@ struct llm_tokenizer_spm { } } -private: - void resegment(llm_symbol & symbol, std::vector & output) { +//private: +void llm_tokenizer_spm::resegment(llm_symbol & symbol, std::vector & output) { auto text = std::string(symbol.text, symbol.n); auto token = vocab.token_to_id.find(text); @@ -5825,7 +5260,7 @@ struct llm_tokenizer_spm { resegment(symbols[p->second.second], output); } - void try_add_bigram(int left, int right) { +void llm_tokenizer_spm::try_add_bigram(int left, int right) { if (left == -1 || right == -1) { return; } @@ -5855,13 +5290,6 @@ struct llm_tokenizer_spm { rev_merge[text] = std::make_pair(left, right); } - const llama_vocab & vocab; - - std::vector symbols; - llm_bigram_spm::queue work_queue; - - std::map> rev_merge; -}; // BPE tokenizer // adapted from 
https://github.com/cmp-nct/ggllm.cpp [MIT License] @@ -5869,26 +5297,15 @@ struct llm_tokenizer_spm { // TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused -struct llm_bigram_bpe { - struct comparator { - bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const { - return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); - } - }; - using queue_storage = std::vector; - using queue = std::priority_queue; - llm_symbol::index left; - llm_symbol::index right; - std::string text; - int rank; - size_t size; -}; +bool llm_bigram_bpe::comparator::operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const { + return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); +} -struct llm_tokenizer_bpe { - llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {} +//struct llm_tokenizer_bpe { +llm_tokenizer_bpe::llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {} - void tokenize(const std::string & text, std::vector & output) { + void llm_tokenizer_bpe::tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; auto word_collection = bpe_gpt2_preprocess(text); @@ -5989,8 +5406,8 @@ struct llm_tokenizer_bpe { } } -private: - void add_new_bigram(int left, int right) { +//private: +void llm_tokenizer_bpe::add_new_bigram(int left, int right) { if (left == -1 || right == -1) { return; } @@ -6017,7 +5434,7 @@ struct llm_tokenizer_bpe { work_queue.push(bigram); } - std::vector bpe_gpt2_preprocess(const std::string & text) { + std::vector llm_tokenizer_bpe::bpe_gpt2_preprocess(const std::string & text) { std::vector bpe_words; std::vector bpe_encoded_words; @@ -6156,28 +5573,17 @@ struct llm_tokenizer_bpe { return bpe_encoded_words; } - const llama_vocab & vocab; - - std::vector symbols; - std::vector symbols_final; - llm_bigram_bpe::queue work_queue; -}; - -typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{ - FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, - FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT -} FRAGMENT_BUFFER_VARIANT_TYPE; -struct fragment_buffer_variant{ - fragment_buffer_variant(llama_vocab::id _token) +//struct fragment_buffer_variant{ +fragment_buffer_variant::fragment_buffer_variant(llama_vocab::id _token) : type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN), token(_token), raw_text(_dummy), offset(0), length(0){} - fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) +fragment_buffer_variant::fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) : type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), token((llama_vocab::id)-1), @@ -6189,13 +5595,6 @@ struct fragment_buffer_variant{ GGML_ASSERT( offset + length <= raw_text.length() ); } - const FRAGMENT_BUFFER_VARIANT_TYPE type; - const llama_vocab::id token; - const std::string _dummy; - const std::string & raw_text; - const uint64_t offset; - const uint64_t length; -}; // #define PRETOKENIZERDEBUG @@ -6294,6 +5693,32 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< } } +// struct + +bool llama_hparams::operator!=(const llama_hparams & other) const { + if (this->vocab_only != other.vocab_only) return true; + if (this->n_vocab != other.n_vocab) return true; + if (this->n_ctx_train != other.n_ctx_train) return true; + if (this->n_embd != other.n_embd) return true; + if (this->n_head != other.n_head) return true; + if (this->n_head_kv != other.n_head_kv) return true; + if (this->n_layer != other.n_layer) return true; + if (this->n_rot != other.n_rot) return 
true; + if (this->n_ff != other.n_ff) return true; + if (this->rope_finetuned != other.rope_finetuned) return true; + if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; + + const float EPSILON = 1e-9; + + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; + + return false; + } + + static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) { std::vector output; @@ -6375,24 +5800,6 @@ static std::vector llama_tokenize_internal(const llama_vocab & // grammar - internal // -struct llama_partial_utf8 { - uint32_t value; // bit value so far (unshifted) - int n_remain; // num bytes remaining; -1 indicates invalid sequence -}; - -struct llama_grammar { - const std::vector> rules; - std::vector> stacks; - - // buffer for partially generated UTF-8 sequence from accepted tokens - llama_partial_utf8 partial_utf8; -}; - -struct llama_grammar_candidate { - size_t index; - const uint32_t * code_points; - llama_partial_utf8 partial_utf8; -}; // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. @@ -6715,7 +6122,8 @@ struct llama_grammar * llama_grammar_init( for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) { vec_rules[i].push_back(*pos); } - vec_rules[i].push_back({LLAMA_GRETYPE_END, 0}); + llama_grammar_element ge(LLAMA_GRETYPE_END,0); + vec_rules[i].push_back(ge); } // loop over alternates of start rule to build initial stacks @@ -7323,46 +6731,56 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar // Beam search // -struct llama_beam { - std::vector tokens; - float p; // Cumulative beam probability (renormalized relative to all beams) - bool eob; // Initialize end-of-beam to false. Callback sets this to true. - // Sort beams by probability. In case of ties, prefer beams at eob. - bool operator<(const llama_beam & rhs) const { +// llama_beam { + +bool llama_beam::operator<(const llama_beam & rhs) const { return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob); } // Shift off first n tokens and discard them. - void shift_tokens(const size_t n) { +void llama_beam::shift_tokens(const size_t n) { if (n) { std::copy(tokens.begin() + n, tokens.end(), tokens.begin()); tokens.resize(tokens.size() - n); } } - llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; } -}; +llama_beam_view llama_beam::view() const { + llama_beam_view bv = { + .tokens =tokens.data(), + .n_tokens= tokens.size(), + .p=p, + .eob=eob + }; + return bv; + } + // A struct for calculating logit-related info. 
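// A minimal illustrative sketch (not from the llama.cpp sources): the numerically stable
// softmax that llama_logit_info encodes below. The maximum logit is subtracted before
// exponentiation so the sum cannot overflow, and
// probability_from_logit(l) == exp(l - max_l) / sum_j exp(l_j - max_l).
// Toy helper only; it assumes a non-empty logits vector.
#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

static std::vector<float> softmax_stable(const std::vector<float> & logits) {
    const float max_l = *std::max_element(logits.begin(), logits.end());
    const float sum   = std::accumulate(logits.begin(), logits.end(), 0.0f,
        [max_l](float acc, float l) { return acc + std::exp(l - max_l); });
    const float normalizer = 1.0f / sum;

    std::vector<float> probs(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = normalizer * std::exp(logits[i] - max_l);
    }
    return probs;
}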
-struct llama_logit_info { - const float * const logits; - const int n_vocab; - const float max_l; - const float normalizer; - struct sum_exp { - float max_l; - float operator()(float sum, float l) const { return sum + std::exp(l - max_l); } - }; - llama_logit_info(llama_context * ctx) +//struct llama_logit_info { +// const float * const logits; +// const int n_vocab; +// const float max_l; +// const float normalizer; +// struct sum_exp { +// float max_l; +// float operator()(float sum, float l) const { return sum + std::exp(l - max_l); } +// }; +llama_logit_info::llama_logit_info(llama_context * ctx) : logits(llama_get_logits(ctx)) , n_vocab(llama_n_vocab(llama_get_model(ctx))) , max_l(*std::max_element(logits, logits + n_vocab)) , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l})) { } - llama_token_data get_token_data(const llama_token token_id) const { +llama_token_data llama_logit_info::get_token_data(const llama_token token_id) const { constexpr auto p = std::numeric_limits::quiet_NaN(); // never used - return {token_id, logits[token_id], p}; + llama_token_data dd( + token_id, + logits[token_id], + p + ); + return dd; } // Return top k token_data by logit. - std::vector top_k(size_t k) { +std::vector llama_logit_info::top_k(size_t k) { std::vector min_heap; // min-heap by logit const llama_token k_min = std::min(static_cast(k), n_vocab); min_heap.reserve(k_min); @@ -7381,26 +6799,15 @@ struct llama_logit_info { } return min_heap; } - float probability_from_logit(float logit) const { +float llama_logit_info::probability_from_logit(float logit) const { return normalizer * std::exp(logit - max_l); } -}; -struct llama_beam_search_data { - llama_context * ctx; - size_t n_beams; - int n_past; - int n_predict; - std::vector beams; - std::vector next_beams; - // Re-calculated on each loop iteration - size_t common_prefix_length; +//struct llama_beam_search_data { - // Used to communicate to/from callback on beams state. - std::vector beam_views; - llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict) +llama_beam_search_data::llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict) : ctx(ctx) , n_beams(n_beams) , n_past(n_past) @@ -7411,7 +6818,7 @@ struct llama_beam_search_data { } // Collapse beams to a single beam given by index. - void collapse_beams(const size_t beam_idx) { +void llama_beam_search_data::collapse_beams(const size_t beam_idx) { if (0u < beam_idx) { std::swap(beams[0], beams[beam_idx]); } @@ -7423,7 +6830,7 @@ struct llama_beam_search_data { // * Gather elements until the vector is full, then call std::make_heap() on it. // * If the heap is full and a new element is found that should be included, pop the // least element to the back(), replace it with the new, then push it into the heap. - void fill_next_beams_by_top_probabilities(llama_beam & beam) { +void llama_beam_search_data::fill_next_beams_by_top_probabilities(llama_beam & beam) { // Min-heaps use a greater-than comparator. const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; }; if (beam.eob) { @@ -7478,7 +6885,7 @@ struct llama_beam_search_data { // Find common_prefix_length based on beams. // Requires beams is not empty. 
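// A minimal illustrative sketch (not from the llama.cpp sources): the longest common prefix
// over several token sequences, which is what find_common_prefix_length below computes for
// the active beams so the shared tokens only need to be evaluated once.
// Toy types only (int stands in for llama_token).
#include <algorithm>
#include <cstddef>
#include <vector>

static size_t common_prefix_length(const std::vector<std::vector<int>> & seqs) {
    if (seqs.empty()) {
        return 0;
    }
    size_t len = seqs[0].size();
    for (const auto & s : seqs) {
        len = std::min(len, s.size());
    }
    for (size_t pos = 0; pos < len; ++pos) {
        for (size_t i = 1; i < seqs.size(); ++i) {
            if (seqs[i][pos] != seqs[0][pos]) {
                return pos;
            }
        }
    }
    return len;
}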
- size_t find_common_prefix_length() { +size_t llama_beam_search_data::find_common_prefix_length() { size_t common_prefix_length = beams[0].tokens.size(); for (size_t i = 1 ; i < beams.size() ; ++i) { common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size()); @@ -7494,12 +6901,18 @@ struct llama_beam_search_data { // Construct beams_state to send back to caller via the callback function. // Side effect: set common_prefix_length = find_common_prefix_length(); - llama_beams_state get_beams_state(const bool last_call) { +llama_beams_state llama_beam_search_data::get_beams_state(const bool last_call) { for (size_t i = 0 ; i < beams.size() ; ++i) { beam_views[i] = beams[i].view(); } common_prefix_length = find_common_prefix_length(); - return {beam_views.data(), beams.size(), common_prefix_length, last_call}; + llama_beams_state a = { + .beam_views=beam_views.data(), + .n_beams = beams.size(), + .common_prefix_length=common_prefix_length, + .last_call=last_call + }; + return a; } // Loop: @@ -7507,7 +6920,7 @@ struct llama_beam_search_data { // * any of the beams have not yet reached end-of-beam (eob), AND // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence // (since all other beam probabilities can only decrease) - void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) { +void llama_beam_search_data::loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) { beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob. const auto not_eob = [](const llama_beam & beam) { return !beam.eob; }; for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) && @@ -7534,25 +6947,25 @@ struct llama_beam_search_data { // As beams grow, the cumulative probabilities decrease. // Renormalize them to avoid floating point underflow. - static void renormalize_beam_probabilities(std::vector & beams) { +void llama_beam_search_data::renormalize_beam_probabilities(std::vector & beams) { const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; }; const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p); std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; }); } // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering. - size_t top_beam_index() { +size_t llama_beam_search_data::top_beam_index() { return std::max_element(beams.begin(), beams.end()) - beams.begin(); } // Copy (p,eob) for each beam which may have been changed by the callback. 
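// A minimal illustrative sketch (not from the llama.cpp sources): the rescaling performed by
// renormalize_beam_probabilities above. Cumulative beam probabilities only shrink as beams
// grow, so they are renormalized to sum to 1.0 to avoid floating point underflow; only their
// relative order matters for the search.
#include <numeric>
#include <vector>

static void renormalize(std::vector<float> & p) {
    const float sum = std::accumulate(p.begin(), p.end(), 0.0f);
    if (sum <= 0.0f) {
        return; // degenerate input; leave the values untouched
    }
    const float inv_sum = 1.0f / sum;
    for (float & x : p) {
        x *= inv_sum;
    }
}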
- void update_beams_from_beam_views() { +void llama_beam_search_data::update_beams_from_beam_views() { for (size_t i = 0 ; i < beams.size() ; ++i) { beams[i].p = beam_views[i].p; beams[i].eob = beam_views[i].eob; } } -}; + void llama_beam_search(llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, @@ -7578,23 +6991,6 @@ struct no_init { no_init() { /* do nothing */ } }; -struct quantize_state_internal { - const llama_model & model; - const llama_model_quantize_params * params; - - int n_attention_wv = 0; - int n_feed_forward_w2 = 0; - int i_attention_wv = 0; - int i_feed_forward_w2 = 0; - - int n_k_quantized = 0; - int n_fallback = 0; - - quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) - : model(model) - , params(params) - {} -}; static void llama_convert_tensor_internal( struct ggml_tensor * tensor, std::vector> & output, std::vector & workers, @@ -8326,14 +7722,14 @@ static int llama_apply_lora_from_file_internal( // struct llama_model_params llama_model_default_params() { struct llama_model_params result = { - /*.n_gpu_layers =*/ 0, - /*.main_gpu =*/ 0, - /*.tensor_split =*/ nullptr, - /*.progress_callback =*/ nullptr, - /*.progress_callback_user_data =*/ nullptr, - /*.vocab_only =*/ false, - /*.use_mmap =*/ true, - /*.use_mlock =*/ false, + .n_gpu_layers = 0, + .main_gpu = 0, + .tensor_split = nullptr, + .progress_callback = nullptr, + .progress_callback_user_data = nullptr, + .vocab_only = false, + .use_mmap = true, + .use_mlock = false, }; #ifdef GGML_USE_METAL @@ -8345,23 +7741,23 @@ struct llama_model_params llama_model_default_params() { struct llama_context_params llama_context_default_params() { struct llama_context_params result = { - /*.seed =*/ LLAMA_DEFAULT_SEED, - /*.n_ctx =*/ 512, - /*.n_batch =*/ 512, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default - /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, - /*.rope_freq_base =*/ 0.0f, - /*.rope_freq_scale =*/ 0.0f, - /*.yarn_ext_factor =*/ -1.0f, - /*.yarn_attn_factor =*/ 1.0f, - /*.yarn_beta_fast =*/ 32.0f, - /*.yarn_beta_slow =*/ 1.0f, - /*.yarn_orig_ctx =*/ 0, - /*.mul_mat_q =*/ true, - /*.f16_kv =*/ true, - /*.logits_all =*/ false, - /*.embedding =*/ false, + .seed = LLAMA_DEFAULT_SEED, + .n_ctx = 512, + .n_batch = 512, + .n_threads = GGML_DEFAULT_N_THREADS, // TODO: better default + .n_threads_batch = GGML_DEFAULT_N_THREADS, + .rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED, + .rope_freq_base = 0.0f, + .rope_freq_scale = 0.0f, + .yarn_ext_factor = -1.0f, + .yarn_attn_factor = 1.0f, + .yarn_beta_fast = 32.0f, + .yarn_beta_slow = 1.0f, + .yarn_orig_ctx = 0, + .mul_mat_q = true, + .f16_kv = true, + .logits_all = false, + .embedding = false, }; return result; @@ -8369,12 +7765,12 @@ struct llama_context_params llama_context_default_params() { struct llama_model_quantize_params llama_model_quantize_default_params() { struct llama_model_quantize_params result = { - /*.nthread =*/ 0, - /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, - /*.allow_requantize =*/ false, - /*.quantize_output_tensor =*/ true, - /*.only_copy =*/ false, - /*.pure =*/ false, + .nthread = 0, + .ftype = LLAMA_FTYPE_MOSTLY_Q5_1, + .allow_requantize = false, + .quantize_output_tensor = true, + .only_copy = false, + .pure = false, }; return result; @@ -8397,7 +7793,11 @@ void llama_backend_init(bool numa) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct 
ggml_init_params params( + 0, + NULL, + false + ); struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } @@ -8847,45 +8247,32 @@ size_t llama_get_state_size(const struct llama_context * ctx) { return s_total; } -// llama_context_data -struct llama_data_context { - virtual void write(const void * src, size_t size) = 0; - virtual size_t get_size_written() = 0; - virtual ~llama_data_context() = default; -}; -struct llama_data_buffer_context : llama_data_context { - uint8_t * ptr; - size_t size_written = 0; - llama_data_buffer_context(uint8_t * p) : ptr(p) {} + llama_data_buffer_context::llama_data_buffer_context(uint8_t * p) : ptr(p) {} - void write(const void * src, size_t size) override { - memcpy(ptr, src, size); - ptr += size; - size_written += size; - } +void llama_data_buffer_context::write(const void * src, size_t size) { + memcpy(ptr, src, size); + ptr += size; + size_written += size; +} - size_t get_size_written() override { - return size_written; - } -}; +size_t llama_data_buffer_context::get_size_written() { + return size_written; +} -struct llama_data_file_context : llama_data_context { - llama_file * file; - size_t size_written = 0; - llama_data_file_context(llama_file * f) : file(f) {} + +llama_data_file_context::llama_data_file_context(llama_file * f) : file(f) {} - void write(const void * src, size_t size) override { - file->write_raw(src, size); - size_written += size; - } +void llama_data_file_context::write(const void * src, size_t size) { + file->write_raw(src, size); + size_written += size; +} - size_t get_size_written() override { - return size_written; - } -}; +size_t llama_data_file_context::get_size_written() { + return size_written; +} /** copy state data into either a buffer or file depending on the passed in context * @@ -8968,7 +8355,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { const size_t elt_size = ggml_element_size(kv_self.k); - ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); + ggml_init_params ip( + //.mem_size = + 6*ggml_tensor_overhead() + ggml_graph_overhead(), + //.mem_buffer = + NULL, + //.no_alloc = /* no_alloc */ + true + ); + + ggml_context * cpy_ctx = ggml_init( ip); ggml_cgraph * gf = ggml_new_graph(cpy_ctx); ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); @@ -9096,7 +8492,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const size_t elt_size = ggml_element_size(kv_self.k); - ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); + ggml_init_params ip( + //.mem_size= + 6*ggml_tensor_overhead() + ggml_graph_overhead(), + //.mem_buffer= + NULL, + //.no_alloc= + true ); + + ggml_context * cpy_ctx = ggml_init(ip); ggml_cgraph * gf = ggml_new_graph(cpy_ctx); ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); @@ -9257,7 +8661,18 @@ int llama_eval_embd( int n_past) { llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); - llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch( + n_tokens, + nullptr, + embd, + nullptr, + nullptr, + nullptr, + nullptr, + n_past, + 1, + 0 + ); const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { @@ -9277,22 +8692,32 @@ struct llama_batch llama_batch_get_one( int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - return { - 
/*n_tokens =*/ n_tokens, - /*tokens =*/ tokens, - /*embd =*/ nullptr, - /*pos =*/ nullptr, - /*n_seq_id =*/ nullptr, - /*seq_id =*/ nullptr, - /*logits =*/ nullptr, - /*all_pos_0 =*/ pos_0, - /*all_pos_1 =*/ 1, - /*all_seq_id =*/ seq_id, - }; + llama_batch b( + n_tokens, + tokens, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + pos_0, + 1, + seq_id); + return b; } struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { - llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; + llama_batch batch( + /* .n_tokens = */ 0, + /* .token */ (llama_token *)nullptr, + /* .embd= */ (float *)nullptr, + /* .pos= */ (llama_pos *)nullptr, + /* .n_seq_id= */ (int32_t *)nullptr, + /* .seq_id= */ (llama_seq_id **)nullptr, + /* .logits= */ (int8_t *)nullptr, + /* .all_pos_0= */ 0, + /* .all_pos_1= */ 0 , + /* .all_seq_id= */ 0); if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); @@ -9489,16 +8914,15 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch struct llama_timings llama_get_timings(struct llama_context * ctx) { struct llama_timings result = { - /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, - /*.t_end_ms =*/ 1.00 * ggml_time_ms(), - /*.t_load_ms =*/ 1e-3 * ctx->t_load_us, - /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us, - /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us, - /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us, - - /*.n_sample =*/ std::max(1, ctx->n_sample), - /*.n_p_eval =*/ std::max(1, ctx->n_p_eval), - /*.n_eval =*/ std::max(1, ctx->n_eval), + .t_start_ms = 1e-3 * ctx->t_start_us, + .t_end_ms = 1.00 * ggml_time_ms(), + .t_load_ms = 1e-3 * ctx->t_load_us, + .t_sample_ms = 1e-3 * ctx->t_sample_us, + .t_p_eval_ms = 1e-3 * ctx->t_p_eval_us, + .t_eval_ms = 1e-3 * ctx->t_eval_us, + .n_sample = std::max(1, ctx->n_sample), + .n_p_eval = std::max(1, ctx->n_p_eval), + .n_eval = std::max(1, ctx->n_eval), }; return result; @@ -9618,3 +9042,43 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, fputs(text, stderr); fflush(stderr); } + + +// LLM_TN +LLM_TN::LLM_TN(llm_arch arch) : arch(arch) {} + + +std::string LLM_TN::operator()(llm_tensor tensor) const { + return LLM_TENSOR_NAMES[arch].at(tensor); + } + + std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix) const { + return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix; + } + + std::string LLM_TN::operator()(llm_tensor tensor, int bid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); + } + + std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix, int bid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." 
+ suffix; + } + +std::string LLM_KV::operator()(llm_kv kv) const { + return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str()); +} + + +llama_context::~llama_context() { +#ifdef GGML_USE_METAL + if (ctx_metal) { + ggml_metal_free(ctx_metal); + } +#endif + if (alloc) { + ggml_allocr_free(alloc); + } + } +llama_state::llama_state(){ + log_callback= llama_log_callback_default; +} diff --git a/llama.h b/llama.h index 70e8fda4bf1b3..fa430896c6562 100644 --- a/llama.h +++ b/llama.h @@ -50,7 +50,7 @@ #endif #ifdef __cplusplus -extern "C" { +//extern "C" { #endif // @@ -115,12 +115,20 @@ extern "C" { }; typedef struct llama_token_data { + llama_token_data( llama_token id, float logit, float p): + id( id),logit(logit),p(p){ } llama_token id; // token id float logit; // log-odds of the token float p; // probability of the token } llama_token_data; typedef struct llama_token_data_array { + llama_token_data_array(llama_token_data * data, + size_t size, + bool sorted): + data(data), + size(size), + sorted(sorted){} llama_token_data * data; size_t size; bool sorted; @@ -139,6 +147,29 @@ extern "C" { // - logits : if zero, the logits for the respective token will not be output // typedef struct llama_batch { + + llama_batch(int32_t n_tokens, + llama_token * token, + float * embd, + llama_pos * pos, + int32_t * n_seq_id, + llama_seq_id ** seq_id, + int8_t * logits, + llama_pos all_pos_0, + llama_pos all_pos_1, + llama_seq_id all_seq_id + ) : + n_tokens(n_tokens), + token(token), + embd(embd), + pos(pos), + n_seq_id(n_seq_id), + seq_id(seq_id), + logits(logits), + all_pos_0(all_pos_0), + all_pos_1(all_pos_1), + all_seq_id(all_seq_id) {} + int32_t n_tokens; llama_token * token; @@ -174,7 +205,7 @@ extern "C" { bool use_mlock; // force system to keep model in RAM }; - struct llama_context_params { + struct llama_context_params{ uint32_t seed; // RNG seed, -1 for random uint32_t n_ctx; // text context, 0 = from model uint32_t n_batch; // prompt processing maximum batch size @@ -238,6 +269,10 @@ extern "C" { }; typedef struct llama_grammar_element { + llama_grammar_element( enum llama_gretype type, + uint32_t value // Unicode code point or rule ID + ):type(type), value(value){} + llama_grammar_element( ):type(llama_gretype(0)), value(0){} enum llama_gretype type; uint32_t value; // Unicode code point or rule ID } llama_grammar_element; @@ -776,7 +811,7 @@ extern "C" { LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); #ifdef __cplusplus -} +//} #endif // Internal API to be implemented by llama.cpp and used by tests/benchmarks only @@ -793,4 +828,8 @@ const std::vector> & llama_internal #endif // LLAMA_API_INTERNAL + + #endif // LLAMA_H + + diff --git a/print.hpp b/print.hpp new file mode 100644 index 0000000000000..b9a75fbff0f3a --- /dev/null +++ b/print.hpp @@ -0,0 +1,553 @@ +#include +#include "llama.h" +#include "ggml-internal.hpp" +#include "llama-internal.hpp" + +REFL_TYPE(ggml_init_params ) +REFL_END + +REFL_TYPE(ggml_opt_params::ggml_adam) +REFL_END + +REFL_TYPE(ggml_opt_params::ggml_lbfgs) +REFL_END + + +REFL_TYPE(ggml_opt_context::ggml_grad ) +REFL_END + +REFL_TYPE(gpt_params ) + +REFL_FIELD( seed ) +REFL_FIELD( n_threads) +REFL_FIELD( n_threads_batch) +REFL_FIELD( n_predict ) +REFL_FIELD( n_ctx ) +REFL_FIELD( n_batch) +REFL_FIELD( n_keep ) +REFL_FIELD( n_draft) +REFL_FIELD( n_chunks ) +REFL_FIELD( n_parallel) +REFL_FIELD( n_sequences) +REFL_FIELD( p_accept ) +REFL_FIELD( p_split ) +REFL_FIELD( n_gpu_layers) +REFL_FIELD( n_gpu_layers_draft) 
+REFL_FIELD( main_gpu ) +REFL_FIELD( tensor_split) +REFL_FIELD( n_beams ) +REFL_FIELD(rope_freq_base) +REFL_FIELD( rope_freq_scale ) +REFL_FIELD( yarn_ext_factor ) +REFL_FIELD( yarn_attn_factor ) +REFL_FIELD( yarn_beta_fast ) +REFL_FIELD( yarn_beta_slow ) +REFL_FIELD( yarn_orig_ctx) +REFL_FIELD( rope_scaling_type) +REFL_FIELD( sparams) +REFL_FIELD(model ) +REFL_FIELD(model_draft ) +REFL_FIELD(model_alias) +REFL_FIELD(prompt ) +REFL_FIELD(prompt_file ) +REFL_FIELD(path_prompt_cache ) +REFL_FIELD(input_prefix ) +REFL_FIELD(input_suffix ) +REFL_FIELD( antiprompt) +REFL_FIELD(logdir ) +REFL_FIELD( lora_adapter) +REFL_FIELD(lora_base ) +REFL_FIELD( ppl_stride ) +REFL_FIELD( ppl_output_type ) +REFL_FIELD( hellaswag ) +REFL_FIELD( hellaswag_tasks ) +REFL_FIELD( mul_mat_q ) +REFL_FIELD( memory_f16) +REFL_FIELD( random_prompt ) +REFL_FIELD( use_color ) +REFL_FIELD( interactive ) +REFL_FIELD( chatml ) +REFL_FIELD( prompt_cache_all ) +REFL_FIELD( prompt_cache_ro ) +REFL_FIELD( embedding ) +REFL_FIELD( escape ) +REFL_FIELD( interactive_first ) +REFL_FIELD( multiline_input ) +REFL_FIELD( simple_io ) +REFL_FIELD( cont_batching ) +REFL_FIELD( input_prefix_bos ) +REFL_FIELD( ignore_eos ) +REFL_FIELD( instruct ) +REFL_FIELD( logits_all ) +REFL_FIELD( use_mmap) +REFL_FIELD( use_mlock ) +REFL_FIELD( numa ) +REFL_FIELD( verbose_prompt ) +REFL_FIELD( infill ) +REFL_FIELD(mmproj ) +REFL_FIELD( image) + +REFL_END + +REFL_TYPE(llama_sampling_params) +REFL_END + +REFL_TYPE(llm_arch) +REFL_END + +REFL_TYPE(llama_sampling_context ) +REFL_FIELD( params) +REFL_FIELD( mirostat_mu) +REFL_FIELD( grammar) +REFL_FIELD( parsed_grammar) +REFL_FIELD( prev) +REFL_FIELD( cur) +REFL_END + +REFL_TYPE(llama_token_data ) +REFL_END + + +REFL_TYPE(llama_token_data_array ) +REFL_END + +REFL_TYPE(llama_batch ) +REFL_END + + +REFL_TYPE(ggml_object) + REFL_FIELD(offs) +REFL_END + +REFL_TYPE(ggml_tensor) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(ggml_cplan) + REFL_FIELD(work_size) +REFL_END + +REFL_TYPE(ggml_hash_set) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(ggml_cgraph) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(ggml_scratch) + REFL_FIELD(offs) +REFL_END + +REFL_TYPE(ggml_compute_params) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(ggml_opt_params) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(ggml_opt_context) + REFL_FIELD(ctx) +REFL_END + +REFL_TYPE(gguf_init_params) +REFL_END + +REFL_TYPE(ggml_something) + REFL_FIELD(type_name) +REFL_END + +REFL_TYPE(ggml_context) + REFL_FIELD(mem_size) +REFL_FIELD(mem_buffer) +REFL_FIELD(mem_buffer_owned) +REFL_FIELD( no_alloc) +REFL_FIELD( no_alloc_save) +REFL_FIELD( n_objects) +REFL_FIELD( objects_begin) +REFL_FIELD( objects_end) +REFL_FIELD( scratch) +REFL_FIELD( scratch_save) + +REFL_END + +REFL_TYPE(ggml_context_container) + REFL_FIELD(used) + REFL_FIELD(context) +REFL_END + + REFL_TYPE(ggml_numa_node) + REFL_FIELD(cpus) + REFL_FIELD(n_cpus) + REFL_END + + REFL_TYPE(ggml_numa_nodes) + REFL_FIELD(nodes) + REFL_FIELD(n_nodes) + REFL_END + + REFL_TYPE(ggml_state) + REFL_FIELD(contexts) + REFL_FIELD(numa) + REFL_END + + REFL_TYPE(gguf_str) + REFL_FIELD(n) + REFL_FIELD(data) + REFL_END + + REFL_TYPE(ggml_map_custom1_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) + REFL_END + +REFL_TYPE(ggml_map_custom2_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) +REFL_END + +REFL_TYPE(ggml_map_custom3_op_params) + REFL_FIELD(fun) + REFL_FIELD(n_tasks) +REFL_END + +REFL_TYPE(hash_map) + REFL_FIELD(set) + REFL_FIELD(vals) +REFL_END +REFL_TYPE(ggml_compute_state_shared) + REFL_FIELD(cgraph) + REFL_FIELD(cplan) 
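// A minimal illustrative sketch (not part of print.hpp): how the REFL_TYPE / REFL_FIELD /
// REFL_END registrations above are meant to be consumed. This assumes the refl-cpp
// single-header library (refl.hpp), which these macros and refl::util::for_each appear to
// come from; the struct and field names below are invented for the example.
#include <iostream>
#include <refl.hpp>

struct toy_params {
    int   n_ctx          = 512;
    float rope_freq_base = 10000.0f;
};

REFL_TYPE(toy_params)
REFL_FIELD(n_ctx)
REFL_FIELD(rope_freq_base)
REFL_END

static void dump_toy_params(const toy_params & p) {
    constexpr auto type = refl::reflect<toy_params>();
    refl::util::for_each(type.members, [&](auto member) {
        // member.name is a compile-time string; member(p) reads the field value
        std::cout << member.name.c_str() << " = " << member(p) << "\n";
    });
}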
+REFL_END +REFL_TYPE(ggml_compute_state) + REFL_FIELD(thrd) + REFL_FIELD(ith) +REFL_END +REFL_TYPE(ggml_lbfgs_iteration_data) + REFL_FIELD(alpha) + REFL_FIELD(ys) +REFL_END + +REFL_TYPE(gguf_kv) + REFL_FIELD(key) + REFL_FIELD(type) +REFL_END + +REFL_TYPE(gguf_header) + REFL_FIELD(magic) + REFL_FIELD(version) +REFL_END + +REFL_TYPE(gguf_tensor_info) + REFL_FIELD(name) + REFL_FIELD(n_dims) +REFL_END + +REFL_TYPE(gguf_context) + REFL_FIELD(header) + REFL_FIELD(kv) +REFL_END + +REFL_TYPE(gguf_buf) + REFL_FIELD(data) + REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_model_params) + REFL_FIELD(n_gpu_layers) +REFL_END +REFL_TYPE(llama_context_params) + REFL_FIELD(seed) +REFL_END +REFL_TYPE(llama_model_quantize_params) + REFL_FIELD(nthread) +REFL_END + +REFL_TYPE(llama_grammar_element) +REFL_END + +REFL_TYPE(llama_timings) + REFL_FIELD(t_start_ms) +REFL_END +REFL_TYPE(llama_beam_view) + REFL_FIELD(tokens) +REFL_END + +REFL_TYPE(llama_beams_state) + REFL_FIELD(beam_views) +REFL_END + +REFL_TYPE(ggml_backend) +REFL_END + +REFL_TYPE(ggml_backend_buffer) +REFL_END + +REFL_TYPE(ggml_allocr) +REFL_END + +REFL_TYPE(ggml_tallocr) +REFL_END + +REFL_TYPE(ggml_gallocr) +REFL_END + + +REFL_TYPE(llama_buffer) +REFL_FIELD(data) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_file) +REFL_FIELD(fp) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_mmap) +REFL_FIELD(addr) +REFL_FIELD(size) +REFL_END + + +REFL_TYPE(llama_mlock) + REFL_FIELD(addr) + REFL_FIELD(size) +REFL_END + +REFL_TYPE(llama_state) + REFL_FIELD(log_callback) + REFL_FIELD(log_callback_user_data) + REFL_END + + +REFL_TYPE(llama_hparams) + REFL_FIELD(vocab_only) + REFL_FIELD(n_vocab) + REFL_END + + +REFL_TYPE(llama_cparams) + REFL_FIELD(n_ctx) + REFL_FIELD(n_batch) +REFL_END + +REFL_TYPE(llama_layer) + REFL_FIELD(attn_norm) + REFL_FIELD(attn_norm_b) +REFL_END + +REFL_TYPE(llama_kv_cell) + REFL_FIELD(pos) + REFL_FIELD(delta) +REFL_END + +REFL_TYPE(llama_kv_cache) + REFL_FIELD(has_shift) + REFL_FIELD(head) + REFL_END + +REFL_TYPE(e_model) +REFL_END + +REFL_TYPE(llama_ftype) +REFL_END + +REFL_TYPE(llama_model) + REFL_FIELD(type) + REFL_FIELD(arch) +REFL_FIELD(ftype ) + +REFL_FIELD( name ) + + REFL_FIELD( hparams ) +REFL_FIELD( vocab) + +REFL_FIELD( tok_embd) +REFL_FIELD( pos_embd) +REFL_FIELD( tok_norm) +REFL_FIELD( tok_norm_b) + +REFL_FIELD( output_norm) +REFL_FIELD( output_norm_b) +REFL_FIELD( output) + +REFL_FIELD( layers) + +REFL_FIELD( n_gpu_layers) + + REFL_FIELD( gguf_kv) //unordered map + REFL_FIELD( ctx) + REFL_FIELD( buf) + REFL_FIELD( mapping) //std::unique_ptr +REFL_FIELD( mlock_buf) +REFL_FIELD( mlock_mmap) +REFL_FIELD( tensors_by_name) + REFL_FIELD( t_load_us) +REFL_FIELD( t_start_us) + +REFL_END + +REFL_TYPE(llama_vocab) + REFL_END + + REFL_TYPE(grammar_parser::parse_state) + REFL_END + +REFL_TYPE(llama_context) +REFL_FIELD( cparams) +//REFL_FIELD(model) +REFL_FIELD(kv_self) + REFL_FIELD(rng) //random numbers +REFL_FIELD(has_evaluated_once ) +REFL_FIELD( t_start_us) +REFL_FIELD( t_load_us) + REFL_FIELD( t_sample_us ) +REFL_FIELD( t_p_eval_us ) + REFL_FIELD( t_eval_us) +REFL_FIELD( n_sample ) +REFL_FIELD( n_p_eval ) + REFL_FIELD( n_eval ) +REFL_FIELD( logits) +REFL_FIELD( logits_all ) +REFL_FIELD( embedding) +REFL_FIELD( work_buffer) + REFL_FIELD( buf_compute) + REFL_FIELD( buf_alloc) +REFL_FIELD( alloc ) + +#ifdef GGML_USE_METAL +REFL_FIELD( ctx_metal ) +#endif + +#ifdef GGML_USE_MPI +REFL_FIELD( ctx_mpi ) + +#endif +REFL_END + +REFL_TYPE(llama_model_loader) + REFL_FIELD(n_kv) + REFL_FIELD(n_tensors) +REFL_END + 
+REFL_TYPE(llm_build_context) +// REFL_FIELD(model) cannot create pointer to reference member ‘llm_build_context::model’ +// REFL_FIELD(hparams) cannot create pointer to reference member ‘llm_build_context::hparams’ +REFL_END + +REFL_TYPE(llm_offload_trie) +REFL_END + +REFL_TYPE(llm_symbol) + REFL_FIELD(prev) +REFL_END + +REFL_TYPE(llm_bigram_spm) +REFL_END + +REFL_TYPE(llm_tokenizer_spm) +REFL_END + +REFL_TYPE(llm_bigram_bpe) +REFL_END + +REFL_TYPE(llm_tokenizer_bpe) +REFL_END + + +REFL_TYPE(fragment_buffer_variant) +REFL_END + + +REFL_TYPE(llama_partial_utf8) + REFL_FIELD(value) + REFL_FIELD(n_remain) +REFL_END + + +REFL_TYPE(llama_grammar) + REFL_FIELD(rules) + REFL_FIELD(stacks) +REFL_END + + +REFL_TYPE(llama_grammar_candidate) + REFL_FIELD(index) + REFL_FIELD(code_points) +REFL_END + + +REFL_TYPE(llama_beam) + REFL_FIELD(tokens) + REFL_FIELD(p) +REFL_END + + +REFL_TYPE(llama_logit_info) + REFL_FIELD(logits) + REFL_FIELD(n_vocab) +REFL_END + +REFL_TYPE(llama_beam_search_data) + REFL_FIELD(ctx) + REFL_FIELD(n_beams) +REFL_END + + +REFL_TYPE(quantize_state_internal) +// REFL_FIELD(model) + REFL_FIELD(params) +REFL_FIELD( n_attention_wv ) +REFL_FIELD( n_feed_forward_w2 ) + REFL_FIELD( i_attention_wv ) + REFL_FIELD( i_feed_forward_w2 ) +REFL_FIELD( n_k_quantized ) +REFL_FIELD( n_fallback ) + +REFL_END + +REFL_TYPE(llama_data_context) +REFL_END + +REFL_TYPE(llama_data_buffer_context) + REFL_FIELD(ptr) +REFL_END + +REFL_TYPE(llama_data_file_context) + REFL_FIELD(file) +REFL_END + +template +constexpr auto get_value_type_name(const T t) noexcept +{ + return t.value_type; +} + +// // A generic function to print out the fields of any object +template +void print_fields(const T& t) { + refl::runtime::debug(std::cout, t); + constexpr auto type = refl::reflect(); + + constexpr auto membertype = refl::member_list(); + + constexpr auto members = get_members(type); + std::cout << "DEBUG Type: " << type.name.c_str() << "\n"; + std::cout << "DEBUG Type2: " << typeid(membertype).name() << "\n"; + std::cout << "DEBUG Type3: " << typeid(members).name() << "\n"; + refl::util::for_each(members, [&](auto member) { + //using member_t = decltype(member::value_type); + //typename type3 = member::value_type; + //typename trait::remove_qualifiers_t::value_type>; + //constexpr auto type2 = refl::reflect(type3); + //std::cout << "Auto:" << foo <<"\n"; + std::cout << "Auto:" << member.name <<"\n"; + //std::cout << "DEBUG Type2: " << typeid(member_t).name() << "\n"; + //std::cout << "DEBUG Type2: " << type2.name.c_str() << "\n"; + }); + std::cout << "\n"; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c8b4bc254f4c6..28f6254630010 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -46,6 +46,6 @@ llama_build_and_test_executable(test-grad0.cpp) # SLOW llama_build_and_test_executable(test-rope.cpp) # dummy executable - not installed -get_filename_component(TEST_TARGET test-c.c NAME_WE) -add_executable(${TEST_TARGET} test-c.c) +get_filename_component(TEST_TARGET test-c.cpp NAME_WE) +add_executable(${TEST_TARGET} test-c.cpp) target_link_libraries(${TEST_TARGET} PRIVATE llama) diff --git a/tests/test-c.c b/tests/test-c.cpp similarity index 100% rename from tests/test-c.c rename to tests/test-c.cpp diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index a2459a2867c5c..a58e555622fcf 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -115,11 +115,11 @@ int main(int argc, char * argv[]) { generate_data(1.0, test_data2.size(), test_data2.data()); // 
Initialize GGML, ensures float conversion tables are initialized - struct ggml_init_params ggml_params = { + struct ggml_init_params ggml_params( /* .mem_size = */ 1*1024, /* .mem_buffer = */ NULL, - /* .no_alloc = */ true, - }; + /* .no_alloc = */ true + ); struct ggml_context * ctx = ggml_init(ggml_params); int num_failed = 0; diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 88fac0e23106b..dccfe087b415b 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -261,11 +261,11 @@ int main(int argc, char * argv[]) { // Initialize GGML, ensures float conversion tables are initialized - struct ggml_init_params ggml_params = { + struct ggml_init_params ggml_params( /* .mem_size = */ 1*1024, /* .mem_buffer = */ NULL, - /* .no_alloc = */ true, - }; + /* .no_alloc = */ true + ); struct ggml_context * ctx = ggml_init(ggml_params); for (int i = 0; i < GGML_TYPE_COUNT; i++) { diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 26c1f42dc0e95..e1d92cdd4655b 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -124,11 +124,11 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * } int main(int /*argc*/, const char ** /*argv*/) { - struct ggml_init_params params = { + struct ggml_init_params params( /* .mem_size = */ 128*1024*1024, /* .mem_buffer = */ NULL, - /* .no_alloc = */ false, - }; + /* .no_alloc = */ false + ); std::vector<uint8_t> work_buffer;
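// A minimal illustrative sketch (not from the llama.cpp sources): the initialization pattern
// the converted tests above follow. Once a struct gains a user-declared constructor it is no
// longer an aggregate, so the C-style brace initializer with /* .field = */ comments is
// replaced by a direct constructor call taking the same arguments. Toy type only; the real
// ggml_init_params is declared in ggml.h.
#include <cstddef>

struct toy_init_params {
    toy_init_params(size_t mem_size, void * mem_buffer, bool no_alloc)
        : mem_size(mem_size), mem_buffer(mem_buffer), no_alloc(no_alloc) {}
    size_t mem_size;
    void * mem_buffer;
    bool   no_alloc;
};

int main() {
    // before (aggregate):       toy_init_params p = { 1024, NULL, true };
    // after (constructor call), mirroring the converted tests:
    toy_init_params p(
        /* .mem_size   = */ 1024,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ true);
    return p.no_alloc ? 0 : 1;
}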