diff --git a/CMakeLists.txt b/CMakeLists.txt
index f32df5fe52335e..0ddcead86f0259 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,34 @@
 cmake_minimum_required(VERSION 3.13) # for add_link_options
 project("llama.cpp" C CXX)
 
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+if (NOT MSVC)
+  set(cuda_flags -Wno-pedantic)
+endif()
+set(LLAMA_CUBLAS ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(LLAMA_CUDA_F16 ON)
+set(LLAMA_ACCELERATE ON)
+set(LLAMA_K_QUANTS ON)
+
+#-DLLAMA_NATIVE=off
+set(LLAMA_AVX ON)
+set(LLAMA_AVX2 OFF)
+set(LLAMA_AVX512 OFF)
+set(LLAMA_FMA OFF)
+set(LLAMA_F16C OFF)
+set(CMAKE_CUDA_FLAGS "--verbose") #
+set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+set(CUDACXX /usr/local/cuda-12.3/bin/nvcc)
+set(CMAKE_CUDA_COMPILER /usr/local/cuda-12.3/bin/nvcc)
+set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-12.3)
+#GGML_USE_CUBLAS
+
+#set(CMAKE_EXE_LINKER_FLAGS -pg)
+#set(CMAKE_SHARED_LINKER_FLAGS -pg)
+
+set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)
+
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
@@ -44,7 +70,7 @@ endif()
 
 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
+option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
 option(LLAMA_LTO "llama: enable link time optimization" OFF)
 
 # debug
@@ -77,9 +103,9 @@ endif()
 
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_BLAS "llama: use BLAS" ON)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS "llama: use CUDA" OFF)
+option(LLAMA_CUBLAS "llama: use CUDA" ON)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
@@ -104,7 +130,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
 # Compile flags
 #
 
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
@@ -230,7 +256,12 @@ if (LLAMA_BLAS)
         message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
         add_compile_options(${BLAS_LINKER_FLAGS})
-        add_compile_definitions(GGML_USE_OPENBLAS)
+
+        # from https://github.com/NVIDIA/cutlass
+        make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp")
+        set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags})
+
+        # add_compile_definitions(GGML_USE_OPENBLAS)
         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()
@@ -272,6 +303,7 @@ if (LLAMA_CUBLAS)
         endif()
         add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+
         if (DEFINED LLAMA_CUDA_DMMV_Y)
             add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
         endif()
@@ -390,14 +422,15 @@ endif()
 
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
-        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        # -Wpedantic
+        set(warning_flags -Wall -Wextra -Wcast-qual -Wno-unused-function)
         set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
-        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn -fpermissive)
         set(host_cxx_flags "")
 
     if (CMAKE_C_COMPILER_ID MATCHES "Clang")
         set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-        set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
+        set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi -fpermissive)
 
         if (
             (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
@@ -407,30 +440,27 @@ if (LLAMA_ALL_WARNINGS)
         endif()
     elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
         set(c_flags ${c_flags} -Wdouble-promotion)
-        set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
+        set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds -fpermissive)
 
         if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-            set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
+            set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation -fpermissive)
         endif()
         if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-            set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
+            set(host_cxx_flags ${host_cxx_flags} -Wextra-semi -fpermissive)
         endif()
     endif()
     else()
         # todo : msvc
     endif()
 
-    set(c_flags ${c_flags} ${warning_flags})
-    set(cxx_flags ${cxx_flags} ${warning_flags})
+    set(c_flags ${c_flags} -save-temps --verbose ${warning_flags})
+    set(cxx_flags ${cxx_flags} -fpermissive -save-temps --verbose ${warning_flags})
     add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
                         "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
                         "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
 
 endif()
 
-if (NOT MSVC)
-    set(cuda_flags -Wno-pedantic)
-endif()
 set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
 
 list(JOIN host_cxx_flags " " cuda_host_flags)  # pass host compiler flags as a single argument
@@ -438,6 +468,9 @@ if (NOT cuda_host_flags STREQUAL "")
     set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
 endif()
 
+#
+set(cuda_flags --verbose -G ${cuda_flags})
+
 add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
 
 if (WIN32)
@@ -485,8 +518,10 @@ if (NOT MSVC)
             add_link_options(-static-libgcc -static-libstdc++)
         endif()
     endif()
+    add_link_options("-Wl,-Map=${TARGET}.map")
+
     if (LLAMA_GPROF)
-        add_compile_options(-pg)
+        add_compile_options(-pg)
     endif()
 endif()
@@ -645,13 +680,13 @@ if (GGML_USE_CPU_HBM)
 endif()
 
 add_library(ggml OBJECT
-            ggml.c
+            ggml.cpp
             ggml.h
-            ggml-alloc.c
+            ggml-alloc.cpp
             ggml-alloc.h
-            ggml-backend.c
+            ggml-backend.cpp
             ggml-backend.h
-            ggml-quants.c
+            ggml-quants.cpp
             ggml-quants.h
             ${GGML_SOURCES_CUDA}   ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
diff --git a/Makefile b/Makefile
index a6d2c2ec0f380e..240744ea2c50e4 100644
--- a/Makefile
+++ b/Makefile
@@ -116,7 +116,7 @@ endif
 # keep standard at C11 and C++11
 MK_CPPFLAGS = -I. -Icommon
 MK_CFLAGS   = -std=c11   -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CXXFLAGS = -std=c++17 -fPIC -fpermissive
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -538,16 +538,16 @@ $(info )
 #
 
 ggml.o: ggml.c ggml.h ggml-cuda.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 
diff --git a/README.md b/README.md
index e14886737121bb..d3b92a75a73731 100644
--- a/README.md
+++ b/README.md
@@ -696,7 +696,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
 
 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
 
-For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community; please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
 
 ### Instruction mode with Alpaca
 
diff --git a/README.org b/README.org
new file mode 100644
index 00000000000000..4f7092f93d82f2
--- /dev/null
+++ b/README.org
@@ -0,0 +1,1039 @@
+This README shows how to run Mistral with llama.cpp and how to collect CUDA profiling data with the Nsight Systems profiler (nsys).
+
+First, dump the PTX embedded in the binary:
+
+#+begin_src sh :results verbatim :exports both
+  /home/mdupont/2023/11/07/nvidia-cuda-toolkit-11.5.1/amd64/cuda_cuobjdump/bin/cuobjdump --dump-ptx ./build/bin/main > ./build/bin/main.ptx
+#+end_src
+
+Now, to run llama.cpp under nsys with a model downloaded from Ollama, we can do it like this:
+
+#+begin_src sh :results verbatim :exports both
+  sudo /opt/nvidia/nsight-systems/2023.2.3/bin/nsys profile --show-output=true --trace=cuda,nvtx,cublas,cublas-verbose,cusparse,cusparse-verbose,mpi,oshmem,ucx,osrt,cudnn,opengl,opengl-annotations,openacc,openmp,nvvideo --sample=process-tree --cudabacktrace=all ./build/bin/main -m ~/.ollama/models/blobs/sha256:6ae28029995007a3ee8d0b8556d50f3b59b831074cf19c84de87acf51fb54054 -f prompt.org
+#+end_src
+
+#+RESULTS:
+#+begin_example
+This readme is showing how to use mistral using llama.cpp and cuda profiling nsys to collect data.
+ +,#+begin_src sh :results verbatim :exports both + /home/mdupont/2023/11/07/nvidia-cuda-toolkit-11.5.1/amd64/cuda_cuobjdump/bin/cuobjdump --dump-ptx ./build/bin/main > ./build/bin/main.ptx +#end_example + + Now to run llama.cpp with model downloaded from ollama we can do it like this + +,#+begin_src sh :results verbatim :exports both + sudo /opt/nvidia/nsight-systems/2023.2.3/bin/nsys profile --show-output=true --trace=cuda,nvtx,cublas,cublas-verbose,cusparse,cusparse-verbose,mpi,oshmem,ucx,osrt,cudnn,opengl,opengl-annotations,openacc,openmp,nvvideo --sample=process-tree --cudabacktrace=all ./build/bin/main -m ~/.ollama/models/blobs/sha256:6ae28029995007a3ee8d0b8556d50f3b59b831074cf19c84de87acf51fb54054 -f README.org +,#+end_src + + Here we can see the data collected by nsys: + + ,#+begin_example data + ===nsys=== + ====/path/to/bin/main=== + + ===Profile Summary===== + Total Samples = 30956 + Sample Rate = 16.102757 Hz + + CPU Samples: + Instructions Executed = 6469108233 + Flops Executed = 6145482438.736761 + Floats Executed = 20133734308.689648 + Memory Accesses = 309559 + Register Accesses = 102771 + Branch Taken = 149 + Branch Missed = 378 + Static Branchs Executed = 17 + Dynamic Branchs Executed = 5 + GPU Samples: + Instructions Executed = 163111268848 + Flops Executed = 15056925654.22184 + Floats Executed = 20133734308.689648 + Memory Accesses = 172190 + Register Accesses = 43252 + Branch Taken = 29 + Branch Missed = 393 + Static Branchs Executed = 2 + Dynamic Branchs Executed = 6 + ===Profile Details===== + ====/path/to/bin/main=== + ====Total Samples===== + Instructions Executed = 179422513688 + Flops Executed = 30190359948.90951 + Floats Executed = 20133734308.689648 + Memory Accesses = 481749 + Register Accesses = 146023 + Branch Taken = 162 + Branch Missed = 415 + Static Branchs Executed = 17 + Dynamic Branchs Executed = 5 + ====Instruction Details===== + + ====Memory Access Details===== + + ====Register Access Details===== + + ====Branching Details===== + + ====/path/to/bin/main=== + ====Function Calls===== + Function Name | Samples | Flops Executed + + ====Function Returns===== + Function Name | Samples | Flops Executed + + ====Code Coverage===== + + ====Heap Usage===== + + ====Stack Usage===== + +#include +#include +#include "gtest/gtest.h" +using namespace testing; +class TestMyCode : public Test { +protected: + // Set up any needed data or environment variables before each test case. +}; +TEST_F(TestMyCode, TestCase1) { + // Test code for TestCase1 goes here. +} +TEST_F(TestMyCode, TestCase2) { + // Test code for TestCase2 goes here. 
+} +int main() { + InitGoogleTest(); + RunAllTests(new MySuite()); + CleanUpGoogleTest(); + return EXIT_SUCCESS; +}Generating '/tmp/nsys-report-d862.qdstrm' + [1/1] [0% ] report7.nsys-rep [1/1] [0% ] report7.nsys-rep [1/1] [===========50% ] report7.nsys-rep [1/1] [========================100%] report7.nsys-rep [1/1] [0% ] report7.nsys-rep [1/1] [5% ] report7.nsys-rep [1/1] [7% ] report7.nsys-rep [1/1] [9% ] report7.nsys-rep [1/1] [10% ] report7.nsys-rep [1/1] [12% ] report7.nsys-rep [1/1] [14% ] report7.nsys-rep [1/1] [=15% ] report7.nsys-rep [1/1] [=17% ] report7.nsys-rep [1/1] [==19% ] report7.nsys-rep [1/1] [==21% ] report7.nsys-rep [1/1] [===22% ] report7.nsys-rep [1/1] [===24% ] report7.nsys-rep [1/1] [====26% ] report7.nsys-rep [1/1] [====27% ] report7.nsys-rep [1/1] [=====29% ] report7.nsys-rep [1/1] [=====31% ] report7.nsys-rep [1/1] [=====32% ] report7.nsys-rep [1/1] [======34% ] report7.nsys-rep [1/1] [=======36% ] report7.nsys-rep [1/1] [=======37% ] report7.nsys-rep [1/1] [=======39% ] report7.nsys-rep [1/1] [========41% ] report7.nsys-rep [1/1] [========42% ] report7.nsys-rep [1/1] [=========44% ] report7.nsys-rep [1/1] [=========45% ] report7.nsys-rep [1/1] [==========47% ] report7.nsys-rep [1/1] [==========48% ] report7.nsys-rep [1/1] [==========49% ] report7.nsys-rep [1/1] [===========50% ] report7.nsys-rep [1/1] [===========51% ] report7.nsys-rep [1/1] [===========52% ] report7.nsys-rep [1/1] [===========53% ] report7.nsys-rep [1/1] [============54% ] report7.nsys-rep [1/1] [============55% ] report7.nsys-rep [1/1] [============56% ] report7.nsys-rep [1/1] [============57% ] report7.nsys-rep [1/1] [=============58% ] report7.nsys-rep [1/1] [=============59% ] report7.nsys-rep [1/1] [=============60% ] report7.nsys-rep [1/1] [==============61% ] report7.nsys-rep [1/1] [==============62% ] report7.nsys-rep [1/1] [==============63% ] report7.nsys-rep [1/1] [==============64% ] report7.nsys-rep [1/1] [===============65% ] report7.nsys-rep [1/1] [===============66% ] report7.nsys-rep [1/1] [===============67% ] report7.nsys-rep [1/1] [================68% ] report7.nsys-rep [1/1] [================69% ] report7.nsys-rep [1/1] [================70% ] report7.nsys-rep [1/1] [================71% ] report7.nsys-rep [1/1] [=================72% ] report7.nsys-rep [1/1] [=================73% ] report7.nsys-rep [1/1] [=================74% ] report7.nsys-rep [1/1] [==================75% ] report7.nsys-rep [1/1] [==================76% ] report7.nsys-rep [1/1] [==================77% ] report7.nsys-rep [1/1] [==================78% ] report7.nsys-rep [1/1] [===================79% ] report7.nsys-rep [1/1] [===================80% ] report7.nsys-rep [1/1] [===================81% ] report7.nsys-rep [1/1] [===================82% ] report7.nsys-rep [1/1] [====================83% ] report7.nsys-rep [1/1] [====================84% ] report7.nsys-rep [1/1] [====================85% ] report7.nsys-rep [1/1] [=====================86% ] report7.nsys-rep [1/1] [=====================87% ] report7.nsys-rep [1/1] [=====================88% ] report7.nsys-rep [1/1] [=====================89% ] report7.nsys-rep [1/1] [======================90% ] report7.nsys-rep [1/1] [======================91% ] report7.nsys-rep [1/1] [======================92% ] report7.nsys-rep [1/1] [=======================93% ] report7.nsys-rep [1/1] [=======================94% ] report7.nsys-rep [1/1] [=======================95% ] report7.nsys-rep [1/1] [=======================96% ] report7.nsys-rep [1/1] [========================97% ] 
report7.nsys-rep [1/1] [========================98% ] report7.nsys-rep [1/1] [========================99% ] report7.nsys-rep [1/1] [========================100%] report7.nsys-rep [1/1] [========================100%] report7.nsys-rep +Generated: + /mnt/data1/2023/11/09/llama.cpp/report7.nsys-rep +#+end_example +Log start +main: build = 1503 (5519834) +main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu +main: seed = 1699536977 +ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no +ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes +ggml_init_cublas: found 1 CUDA devices: + Device 0: NVIDIA GeForce RTX 3080 Ti, compute capability 8.6 +llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/mdupont/.ollama/models/blobs/sha256:6ae28029995007a3ee8d0b8556d50f3b59b831074cf19c84de87acf51fb54054 (version GGUF V2) +llama_model_loader: - tensor 0: token_embd.weight q4_0 [ 4096, 32000, 1, 1 ] +llama_model_loader: - tensor 1: blk.0.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 2: blk.0.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 3: blk.0.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 4: blk.0.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 7: blk.0.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 10: blk.1.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 11: blk.1.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 12: blk.1.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 13: blk.1.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 14: blk.1.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 15: blk.1.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 16: blk.1.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 19: blk.2.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 20: blk.2.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 21: blk.2.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 22: blk.2.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 23: blk.2.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 24: blk.2.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 25: blk.2.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 28: blk.3.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 29: blk.3.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 30: blk.3.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 31: blk.3.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 32: blk.3.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 
33: blk.3.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 34: blk.3.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 37: blk.4.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 38: blk.4.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 39: blk.4.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 40: blk.4.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 41: blk.4.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 42: blk.4.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 43: blk.4.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 46: blk.5.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 47: blk.5.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 48: blk.5.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 49: blk.5.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 50: blk.5.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 51: blk.5.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 52: blk.5.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 55: blk.6.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 56: blk.6.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 57: blk.6.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 58: blk.6.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 59: blk.6.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 60: blk.6.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 61: blk.6.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 64: blk.7.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 65: blk.7.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 66: blk.7.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 67: blk.7.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 68: blk.7.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 69: blk.7.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 70: blk.7.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 73: blk.8.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 74: blk.8.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 75: blk.8.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 76: blk.8.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - 
tensor 77: blk.8.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 78: blk.8.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 79: blk.8.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 82: blk.9.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 83: blk.9.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 84: blk.9.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 85: blk.9.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 86: blk.9.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 87: blk.9.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 88: blk.9.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 91: blk.10.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 92: blk.10.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 93: blk.10.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 94: blk.10.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 95: blk.10.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 96: blk.10.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 97: blk.10.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 100: blk.11.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 101: blk.11.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 102: blk.11.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 103: blk.11.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 104: blk.11.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 105: blk.11.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 106: blk.11.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 109: blk.12.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 110: blk.12.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 111: blk.12.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 112: blk.12.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 113: blk.12.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 114: blk.12.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 115: blk.12.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 118: blk.13.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 119: blk.13.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 120: 
blk.13.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 121: blk.13.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 122: blk.13.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 123: blk.13.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 124: blk.13.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 127: blk.14.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 128: blk.14.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 129: blk.14.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 130: blk.14.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 131: blk.14.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 132: blk.14.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 133: blk.14.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 136: blk.15.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 137: blk.15.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 138: blk.15.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 139: blk.15.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 140: blk.15.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 141: blk.15.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 142: blk.15.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 145: blk.16.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 146: blk.16.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 147: blk.16.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 148: blk.16.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 149: blk.16.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 150: blk.16.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 151: blk.16.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 154: blk.17.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 155: blk.17.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 156: blk.17.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 157: blk.17.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 158: blk.17.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 159: blk.17.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 160: blk.17.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] 
+llama_model_loader: - tensor 163: blk.18.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 164: blk.18.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 165: blk.18.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 166: blk.18.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 167: blk.18.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 168: blk.18.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 169: blk.18.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 172: blk.19.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 173: blk.19.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 174: blk.19.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 175: blk.19.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 176: blk.19.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 177: blk.19.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 178: blk.19.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 181: blk.20.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 182: blk.20.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 183: blk.20.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 184: blk.20.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 185: blk.20.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 186: blk.20.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 187: blk.20.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 190: blk.21.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 191: blk.21.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 192: blk.21.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 193: blk.21.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 194: blk.21.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 195: blk.21.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 196: blk.21.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 199: blk.22.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 200: blk.22.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 201: blk.22.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 202: blk.22.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 203: blk.22.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 204: blk.22.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 205: blk.22.ffn_down.weight q4_0 [ 
14336, 4096, 1, 1 ] +llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 208: blk.23.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 209: blk.23.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 210: blk.23.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 211: blk.23.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 212: blk.23.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 213: blk.23.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 214: blk.23.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 217: blk.24.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 218: blk.24.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 219: blk.24.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 220: blk.24.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 221: blk.24.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 222: blk.24.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 223: blk.24.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 226: blk.25.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 227: blk.25.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 228: blk.25.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 229: blk.25.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 230: blk.25.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 231: blk.25.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 232: blk.25.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 235: blk.26.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 236: blk.26.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 237: blk.26.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 238: blk.26.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 239: blk.26.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 240: blk.26.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 241: blk.26.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 244: blk.27.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 245: blk.27.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 246: blk.27.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 247: blk.27.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 248: 
blk.27.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 249: blk.27.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 250: blk.27.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 253: blk.28.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 254: blk.28.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 255: blk.28.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 256: blk.28.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 257: blk.28.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 258: blk.28.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 259: blk.28.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 262: blk.29.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 263: blk.29.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 264: blk.29.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 265: blk.29.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 266: blk.29.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 267: blk.29.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 268: blk.29.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 271: blk.30.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 272: blk.30.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 273: blk.30.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 275: blk.30.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 276: blk.30.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 277: blk.30.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 280: blk.31.attn_q.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 281: blk.31.attn_k.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 282: blk.31.attn_v.weight q4_0 [ 4096, 1024, 1, 1 ] +llama_model_loader: - tensor 283: blk.31.attn_output.weight q4_0 [ 4096, 4096, 1, 1 ] +llama_model_loader: - tensor 284: blk.31.ffn_gate.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 285: blk.31.ffn_up.weight q4_0 [ 4096, 14336, 1, 1 ] +llama_model_loader: - tensor 286: blk.31.ffn_down.weight q4_0 [ 14336, 4096, 1, 1 ] +llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ] +llama_model_loader: - tensor 290: output.weight q6_K [ 4096, 32000, 1, 1 ] +llama_model_loader: - kv 0: 
general.architecture str +llama_model_loader: - kv 1: general.name str +llama_model_loader: - kv 2: llama.context_length u32 +llama_model_loader: - kv 3: llama.embedding_length u32 +llama_model_loader: - kv 4: llama.block_count u32 +llama_model_loader: - kv 5: llama.feed_forward_length u32 +llama_model_loader: - kv 6: llama.rope.dimension_count u32 +llama_model_loader: - kv 7: llama.attention.head_count u32 +llama_model_loader: - kv 8: llama.attention.head_count_kv u32 +llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 +llama_model_loader: - kv 10: llama.rope.freq_base f32 +llama_model_loader: - kv 11: general.file_type u32 +llama_model_loader: - kv 12: tokenizer.ggml.model str +llama_model_loader: - kv 13: tokenizer.ggml.tokens arr +llama_model_loader: - kv 14: tokenizer.ggml.scores arr +llama_model_loader: - kv 15: tokenizer.ggml.token_type arr +llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 +llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 +llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 +llama_model_loader: - kv 19: general.quantization_version u32 +llama_model_loader: - type f32: 65 tensors +llama_model_loader: - type q4_0: 225 tensors +llama_model_loader: - type q6_K: 1 tensors +llm_load_vocab: special tokens definition check successful ( 259/32000 ). +llm_load_print_meta: format = GGUF V2 +llm_load_print_meta: arch = llama +llm_load_print_meta: vocab type = SPM +llm_load_print_meta: n_vocab = 32000 +llm_load_print_meta: n_merges = 0 +llm_load_print_meta: n_ctx_train = 32768 +llm_load_print_meta: n_embd = 4096 +llm_load_print_meta: n_head = 32 +llm_load_print_meta: n_head_kv = 8 +llm_load_print_meta: n_layer = 32 +llm_load_print_meta: n_rot = 128 +llm_load_print_meta: n_gqa = 4 +llm_load_print_meta: f_norm_eps = 0.0e+00 +llm_load_print_meta: f_norm_rms_eps = 1.0e-05 +llm_load_print_meta: f_clamp_kqv = 0.0e+00 +llm_load_print_meta: f_max_alibi_bias = 0.0e+00 +llm_load_print_meta: n_ff = 14336 +llm_load_print_meta: rope scaling = linear +llm_load_print_meta: freq_base_train = 10000.0 +llm_load_print_meta: freq_scale_train = 1 +llm_load_print_meta: n_yarn_orig_ctx = 32768 +llm_load_print_meta: rope_finetuned = unknown +llm_load_print_meta: model type = 7B +llm_load_print_meta: model ftype = mostly Q4_0 +llm_load_print_meta: model params = 7.24 B +llm_load_print_meta: model size = 3.83 GiB (4.54 BPW) +llm_load_print_meta: general.name = mistralai +llm_load_print_meta: BOS token = 1 '' +llm_load_print_meta: EOS token = 2 '' +llm_load_print_meta: UNK token = 0 '' +llm_load_print_meta: LF token = 13 '<0x0A>' +llm_load_tensors: ggml ctx size = 0.11 MB +llm_load_tensors: using CUDA for GPU acceleration +llm_load_tensors: mem required = 3917.97 MB +llm_load_tensors: offloading 0 repeating layers to GPU +llm_load_tensors: offloaded 0/35 layers to GPU +llm_load_tensors: VRAM used: 0.00 MB +.................................................................................................. 
+llama_new_context_with_model: n_ctx = 512 +llama_new_context_with_model: freq_base = 10000.0 +llama_new_context_with_model: freq_scale = 1 +llama_new_context_with_model: kv self size = 64.00 MB +llama_build_graph: non-view tensors processed: 740/740 +llama_new_context_with_model: compute buffer total size = 79.63 MB +llama_new_context_with_model: VRAM scratch buffer: 73.00 MB +llama_new_context_with_model: total VRAM used: 73.00 MB (model: 0.00 MB, context: 73.00 MB) + +system_info: n_threads = 12 / 24 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | +sampling: + repeat_last_n = 64, repeat_penalty = 1.100, frequency_penalty = 0.000, presence_penalty = 0.000 + top_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +generate: n_ctx = 512, n_batch = 512, n_predict = -1, n_keep = 0 + + + [end of text] + +llama_print_timings: load time = 245.80 ms +llama_print_timings: sample time = 6.71 ms / 52 runs ( 0.13 ms per token, 7748.47 tokens per second) +llama_print_timings: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_print_timings: eval time = 5098.77 ms / 52 runs ( 98.05 ms per token, 10.20 tokens per second) +llama_print_timings: total time = 5161.43 ms +Log end +[ Babel evaluation exited with code 0 ] + + +#+begin_src sh :results verbatim :exports both + /opt/nvidia/nsight-systems/2023.2.3/bin/nsys stats report7.nsys-rep +#+end_src + +#+RESULTS: +#+begin_example +Generating SQLite file report7.sqlite from report7.nsys-rep +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/nvtx_sum.py]... + + ,** NVTX Range Summary (nvtx_sum): + + Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Style Range + -------- --------------- --------- ----------- ----------- --------- ---------- ----------- ------- ------------------------- + 71.3 91,261,248 2,048 44,561.2 34,700.0 33,179 17,628,931 388,774.9 PushPop cuBLAS:cublasSgemm_v2 + 21.8 27,939,877 225 124,177.2 53,143.0 27,935 15,965,566 1,060,852.9 PushPop cuBLAS:cublasGemmEx + 6.3 8,036,669 1 8,036,669.0 8,036,669.0 8,036,669 8,036,669 0.0 PushPop cuBLAS:cublasCreate_v2 + 0.6 742,488 2,273 326.7 221.0 150 18,693 509.1 PushPop cuBLAS:cublasSetStream_v2 + 0.0 7,419 2 3,709.5 3,709.5 142 7,277 5,045.2 PushPop cuBLAS:cublasGetProperty + 0.0 207 1 207.0 207.0 207 207 0.0 PushPop cuBLAS:cublasSetMathMode + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/osrt_sum.py]... 
+ + ,** OS Runtime Summary (osrt_sum): + + Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + -------- --------------- --------- ---------------- ---------------- -------------- -------------- ------------ ---------------------- + 49.8 98,748,705,227 995 99,244,929.9 100,207,029.0 3,076 145,062,709 9,535,006.2 poll + 38.9 77,113,391,701 1 77,113,391,701.0 77,113,391,701.0 77,113,391,701 77,113,391,701 0.0 pthread_cond_wait + 10.8 21,505,984,622 43 500,139,177.3 500,139,962.0 500,071,147 500,199,879 31,487.9 pthread_cond_timedwait + 0.2 408,111,147 5,966 68,406.2 1,002.5 19 66,331,209 1,803,864.3 fflush + 0.2 371,330,137 585 634,752.4 4,055.0 202 106,687,209 7,290,173.5 ioctl + 0.1 100,181,277 29 3,454,526.8 6,438.0 1,135 93,195,838 17,278,903.4 mmap + 0.0 58,243,121 12 4,853,593.4 8,691.5 2,231 58,158,033 16,786,545.6 munmap + 0.0 2,653,253 4 663,313.3 354,810.5 157 1,943,475 915,833.7 fwrite + 0.0 2,281,929 66,070 34.5 22.0 21 648,878 2,531.0 fread + 0.0 831,597 27 30,799.9 6,749.0 3,478 474,236 89,505.1 mmap64 + 0.0 599,699 9 66,633.2 38,958.0 4,556 206,867 71,500.9 sem_timedwait + 0.0 235,180 37 6,356.2 1,564.0 689 114,711 18,945.1 fopen + 0.0 134,278 466 288.2 217.0 155 10,542 532.5 fputs + 0.0 132,740 3 44,246.7 45,080.0 41,640 46,020 2,305.8 pthread_create + 0.0 88,594 44 2,013.5 1,668.5 861 3,993 920.3 open64 + 0.0 26,380 29 909.7 524.0 385 3,325 826.9 fclose + 0.0 21,411 56 382.3 24.0 22 20,033 2,673.7 fgets + 0.0 16,310 62 263.1 120.0 80 2,821 481.5 fcntl + 0.0 15,596 16 974.8 764.0 145 5,352 1,249.5 read + 0.0 12,287 6 2,047.8 1,692.5 618 4,230 1,338.0 open + 0.0 9,178 11 834.4 570.0 301 1,485 475.1 write + 0.0 7,860 2 3,930.0 3,930.0 2,653 5,207 1,806.0 socket + 0.0 7,589 3 2,529.7 2,328.0 775 4,486 1,863.7 pipe2 + 0.0 6,039 1 6,039.0 6,039.0 6,039 6,039 0.0 connect + 0.0 4,874 2 2,437.0 2,437.0 1,626 3,248 1,146.9 fopen64 + 0.0 1,674 1 1,674.0 1,674.0 1,674 1,674 0.0 pthread_cond_signal + 0.0 1,026 7 146.6 164.0 89 212 53.8 dup + 0.0 871 1 871.0 871.0 871 871 0.0 bind + 0.0 415 1 415.0 415.0 415 415 0.0 listen + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_api_sum.py]... 
+ + ,** CUDA API Summary (cuda_api_sum): + + Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + -------- --------------- --------- ------------ ----------- --------- ------------- ------------ --------------------------------------------- + 33.3 3,915,363,238 289 13,547,969.7 9,484,112.0 19,820 32,587,408 13,784,976.3 cudaDeviceSynchronize + 33.3 3,915,338,614 289 13,547,884.5 9,484,033.0 19,749 32,587,319 13,784,970.8 cudaDeviceSynchronize + 11.0 1,289,319,560 7,108 181,389.9 4,874.0 1,971 1,248,737,939 14,811,400.1 cudaLaunchKernel + 10.9 1,288,680,251 7,108 181,300.0 4,784.0 1,922 1,248,737,696 14,811,398.3 cudaLaunchKernel + 4.3 504,516,347 3,747 134,645.4 4,250.0 2,925 11,642,362 664,161.4 cudaMemcpyAsync + 4.3 504,111,303 3,747 134,537.3 4,161.0 2,862 11,641,970 664,125.5 cudaMemcpyAsync + 2.0 237,836,979 8 29,729,622.4 1,076.0 972 237,827,936 84,084,416.4 cudaStreamCreateWithFlags + 0.2 24,762,935 4 6,190,733.8 5,975,786.0 463,322 12,348,041 6,245,573.4 cudaMallocHost + 0.2 24,762,567 4 6,190,641.8 5,975,703.0 463,182 12,347,979 6,245,578.8 cudaMallocHost + 0.1 9,415,273 8 1,176,909.1 147,189.5 1,509 4,594,906 1,935,033.5 cudaFreeHost + 0.1 9,410,395 8 1,176,299.4 146,459.0 1,278 4,592,920 1,934,725.0 cudaFreeHost + 0.1 7,195,101 2 3,597,550.5 3,597,550.5 1,072,705 6,122,396 3,570,670.7 cudaFree + 0.1 7,194,827 2 3,597,413.5 3,597,413.5 1,072,563 6,122,264 3,570,677.8 cudaFree + 0.1 7,147,578 1,536 4,653.4 4,177.0 3,552 58,008 2,635.3 cudaMemcpy2DAsync + 0.1 6,938,748 1,536 4,517.4 4,042.0 3,425 57,847 2,634.2 cudaMemcpy2DAsync + 0.0 4,765,427 13,477 353.6 256.0 150 7,184 215.8 cudaStreamGetCaptureInfo_v2_v11030 + 0.0 2,473,305 17 145,488.5 72,327.0 2,246 539,857 166,286.6 cudaMalloc + 0.0 2,470,534 17 145,325.5 72,203.0 2,181 539,649 166,184.6 cudaMalloc + 0.0 2,469,464 2,273 1,086.4 946.0 841 4,801 417.9 cudaEventRecord + 0.0 2,304,122 2,273 1,013.7 873.0 771 4,723 417.2 cudaEventRecord + 0.0 1,179,270 161 7,324.7 7,423.0 5,556 11,078 902.4 cudaMemsetAsync + 0.0 1,157,594 161 7,190.0 7,289.0 5,437 10,922 896.7 cudaMemsetAsync + 0.0 363,729 166 2,191.1 2,186.0 730 6,634 535.8 cudaOccupancyMaxActiveBlocksPerMultiprocessor + 0.0 93,899 766 122.6 102.0 63 553 63.3 cuGetProcAddress_v2 + 0.0 30,972 1 30,972.0 30,972.0 30,972 30,972 0.0 cudaGetDeviceProperties_v2_v12000 + 0.0 9,674 18 537.4 224.0 203 4,209 947.6 cudaEventCreateWithFlags + 0.0 6,163 2 3,081.5 3,081.5 2,878 3,285 287.8 cudaEventQuery + 0.0 5,973 2 2,986.5 2,986.5 2,776 3,197 297.7 cudaEventQuery + 0.0 1,239 3 413.0 152.0 76 1,011 519.3 cuModuleGetLoadingMode + 0.0 1,162 2 581.0 581.0 400 762 256.0 cudaGetDriverEntryPoint_v11030 + 0.0 960 2 480.0 480.0 360 600 169.7 cuInit + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 
+ + ,** CUDA GPU Kernel Summary (cuda_gpu_kern_sum): + + Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + -------- --------------- --------- ------------ ----------- --------- ---------- ------------ ---------------------------------------------------------------------------------------------------- + 94.3 3,661,170,403 224 16,344,510.7 8,861,904.0 2,199,256 30,836,845 12,771,357.3 void dequantize_block<(int)32, (int)2, &dequantize_q4_0, __half>(const void *, T4 *, int) + 2.7 103,018,305 225 457,859.1 346,527.0 333,855 1,230,427 271,927.9 void dequantize_block<(int)1, (int)1, &convert_f32, __half>(const void *, T4 *, int) + 1.1 44,414,363 161 275,865.6 345,439.0 110,432 804,285 138,253.6 ampere_h16816gemm_256x128_ldg8_stages_32x3_tn + 1.1 43,348,510 2,273 19,071.1 6,944.0 6,784 619,070 49,609.4 void dequantize_block<(int)1, (int)1, &convert_f16, float>(const void *, T4 *, int) + 0.4 16,973,438 2,048 8,287.8 8,671.5 7,360 10,304 693.3 void cutlass::Kernel(T1::Params) + 0.1 5,584,460 1 5,584,460.0 5,584,460.0 5,584,460 5,584,460 0.0 void dequantize_block_q6_K<__half>(const void *, T1 *) + 0.1 4,481,001 2,048 2,188.0 2,271.5 1,663 3,360 484.2 void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, float, float, float, (bool)1, (boo… + 0.1 1,946,648 64 30,416.4 30,176.0 29,664 34,720 977.1 ampere_h16816gemm_128x128_ldg8_stages_64x3_tn + 0.0 340,796 64 5,324.9 5,312.0 5,184 6,048 162.5 void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __half, __half, __half, __half, (bool)1, … + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_mem_time_sum.py]... + + ,** CUDA GPU MemOps Summary (by Time) (cuda_gpu_mem_time_sum): + + Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation + -------- --------------- ----- --------- -------- -------- ---------- ----------- ------------------ + 82.7 538,012,483 3,010 178,741.7 13,488.0 5,120 11,313,305 646,615.9 [CUDA memcpy HtoD] + 17.2 112,106,788 2,273 49,321.1 22,495.0 7,999 1,823,129 143,689.5 [CUDA memcpy DtoH] + 0.0 66,112 161 410.6 384.0 352 1,152 82.8 [CUDA memset] + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_mem_size_sum.py]... + + ,** CUDA GPU MemOps Summary (by Size) (cuda_gpu_mem_size_sum): + + Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation + ---------- ----- -------- -------- -------- -------- ----------- ------------------ + 6,729.069 3,010 2.236 0.192 0.096 107.520 6.567 [CUDA memcpy HtoD] + 2,884.992 2,273 1.269 0.562 0.192 48.000 3.775 [CUDA memcpy DtoH] + 0.063 161 0.000 0.000 0.000 0.002 0.000 [CUDA memset] + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/openmp_sum.py]... +SKIPPED: report7.sqlite does not contain OpenMP event data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/opengl_khr_range_sum.py]... +SKIPPED: report7.sqlite does not contain KHR Extension (KHR_DEBUG) data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/opengl_khr_gpu_range_sum.py]... +SKIPPED: report7.sqlite does not contain GPU KHR Extension (KHR_DEBUG) data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/vulkan_marker_sum.py]... +SKIPPED: report7.sqlite does not contain Vulkan Debug Extension (Vulkan Debug Util) data. 
+ +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/vulkan_gpu_marker_sum.py]... +SKIPPED: report7.sqlite does not contain GPU Vulkan Debug Extension (GPU Vulkan Debug markers) data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/dx11_pix_sum.py]... +SKIPPED: report7.sqlite does not contain DX11 CPU debug markers. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/dx12_gpu_marker_sum.py]... +SKIPPED: report7.sqlite does not contain DX12 GPU debug markers. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/dx12_pix_sum.py]... +SKIPPED: report7.sqlite does not contain DX12 CPU debug markers. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/wddm_queue_sum.py]... +SKIPPED: report7.sqlite does not contain WDDM context data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/um_sum.py]... +SKIPPED: report7.sqlite does not contain CUDA Unified Memory CPU page faults data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/um_total_sum.py]... +SKIPPED: report7.sqlite does not contain CUDA Unified Memory CPU page faults data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/um_cpu_page_faults_sum.py]... +SKIPPED: report7.sqlite does not contain CUDA Unified Memory CPU page faults data. + +Processing [report7.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/openacc_sum.py]... +SKIPPED: report7.sqlite does not contain OpenACC event data. + +#+end_example + +#+begin_src sh + /opt/nvidia/nsight-systems/2023.2.3/bin/nsys export -t json report7.nsys-rep +#+end_src + +#+RESULTS: + +#+begin_src sh + /opt/nvidia/nsight-systems/2023.2.3/bin/nsys export -t hdf report7.nsys-rep + /opt/nvidia/nsight-systems/2023.2.3/bin/nsys export -t json report7.nsys-rep + # jq . 
./report12.json > report12.jq +#+end_src + +#+RESULTS: + + +#+begin_src sh :results verbatim :exports both +python ./reporthd5_callchains.py ./report7.h5 +#+end_src + +#+RESULTS: +#+begin_example +./report2.h5 +./report2.h5 +('0x7f70ac50663f|721|MOD:321/opt/nvidia/nsight-systems/2023.2.3/target-linux-x64/libcupti.so.12.2|DEP:0', 17) +('0x7f70ac508958|717|MOD:321/opt/nvidia/nsight-systems/2023.2.3/target-linux-x64/libcupti.so.12.2|DEP:1', 17) +('0x7f70af680966|722|MOD:235/usr/lib/x86_64-linux-gnu/libcuda.so.545.23.06|DEP:2', 17) +('cudaFreeHost|636|MOD:206/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 8) +('ggml_cuda_host_free|637|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 8) +('llama_new_context_with_model|647|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 6) +('llama_init_from_gpt_params(gpt_params&)|521|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 6) +('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 6) +('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 6) +('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 6) +('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 6) +('cudaMallocHost|778|MOD:206/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 4) +('ggml_cuda_host_malloc|779|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 4) +('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 4) +('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:7', 4) +('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 4) +('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 4) +('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 3) +('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 3) +('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:10', 3) +('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 3) +('0x7f70d54421b0|728|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 3) +('0x7f70d50aa9bd|729|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 3) +('llama_free|848|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 3) +('cublasCreate_v2|499|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 2) +('ggml_init_cublas|422|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 2) +('ggml_init|316|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 2) +('llama_backend_init|317|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 2) +('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 2) +('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:11', 2) +('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:12', 2) +('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:13', 2) +('llm_load_tensors(llama_model_loader&, llama_model&, int, int, float const*, bool, void (*)(float, votrunc|638|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 2) +('llama_load_model_from_file|520|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 2) +('llama_init_from_gpt_params(gpt_params&)|521|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 2) 
+('0x7f70d5442978|723|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 1)
+('cublasCreate_v2|499|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 1)
+('ggml_init_cublas|422|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 1)
+('ggml_init|316|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 1)
+('llama_backend_init|317|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 1)
+('0x7f70b46e9dc8|724|MOD:215/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:3', 1)
+('0x7f70b16d9e24|725|MOD:215/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 1)
+('0x7f70b16da79b|726|MOD:215/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 1)
+('cublasLtCtxInit|510|MOD:215/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 1)
+('cublasCreate_v2|499|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:7', 1)
+('ggml_init_cublas|422|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 1)
+('ggml_init|316|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 1)
+('llama_backend_init|317|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 1)
+('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 1)
+('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:12', 1)
+('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:13', 1)
+('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:14', 1)
+('0x7f70d50aa20b|730|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1)
+('0x7f70d50aa22e|731|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1)
+('cublasCreate_v2|499|MOD:208/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1)
+('ggml_init_cublas|422|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 1)
+('ggml_init|316|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 1)
+('llama_backend_init|317|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 1)
+('main|155|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 1)
+('__libc_start_call_main|318|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:10', 1)
+('__libc_start_main@@GLIBC_2|319|MOD:169/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:11', 1)
+('_start|320|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:12', 1)
+('llama_free_model|805|MOD:192/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 1)
+#+end_example
+
+* mistral eval
+
+This is a table of performance metrics for a program that performs several operations on an NVIDIA GPU through the CUDA runtime. The operations are:
+
+* `cudaDeviceSynchronize`: blocks the calling host thread until all work previously submitted to the GPU has completed. It is a full host-device synchronization point.
+* `cudaLaunchKernel`: launches a kernel (a small GPU function) on the device. In this trace two different kernels are launched, likely with different parameters or data inputs.
+* `cudaMemcpyAsync`: copies memory between the CPU and the GPU asynchronously with respect to the host: the call returns immediately and the copy is ordered within its stream, so other work can proceed concurrently.
+* `cudaStreamCreateWithFlags`: creates a new CUDA stream, a queue used to order and overlap operations on the GPU. In this trace a single stream is created with some flags set.
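+
+The sketch below shows how these four calls typically fit together. It is a minimal standalone illustration, not code from llama.cpp; the kernel ~scale~, the buffer size, and the launch configuration are invented for the example:
+
+#+begin_src c++
+// minimal sketch: the four CUDA runtime calls seen in the trace above
+#include <cuda_runtime.h>
+#include <cstdio>
+
+__global__ void scale(float * x, float s, int n) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) x[i] *= s;
+}
+
+int main() {
+    const int n = 1 << 20;
+    const size_t bytes = n * sizeof(float);
+
+    // pinned host memory, so cudaMemcpyAsync can overlap with other work
+    // (llama.cpp wraps this in ggml_cuda_host_malloc / ggml_cuda_host_free)
+    float * h = nullptr;
+    cudaMallocHost(&h, bytes);
+    for (int i = 0; i < n; ++i) h[i] = 1.0f;
+
+    float * d = nullptr;
+    cudaMalloc(&d, bytes);
+
+    // a stream orders the copies and the kernel without blocking the host
+    cudaStream_t stream;
+    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+
+    cudaMemcpyAsync(d, h, bytes, cudaMemcpyHostToDevice, stream);  // returns immediately
+    scale<<<(n + 255) / 256, 256, 0, stream>>>(d, 2.0f, n);        // compiles to a cudaLaunchKernel call
+    cudaMemcpyAsync(h, d, bytes, cudaMemcpyDeviceToHost, stream);
+
+    // block the host until everything submitted to the device has finished
+    cudaDeviceSynchronize();
+    printf("h[0] = %f\n", h[0]);
+
+    cudaStreamDestroy(stream);
+    cudaFree(d);
+    cudaFreeHost(h);
+    return 0;
+}
+#+end_src
+
+The same pattern is visible in the callchains: pinned allocations via ~ggml_cuda_host_malloc~, stream-ordered copies and kernel launches under ~ggml_cuda_op_mul_mat~, and ~cudaDeviceSynchronize~ wherever the host has to wait for results.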
+
+
+#+begin_src sh :results verbatim :exports both
+python ./reporthd5_callchains.py ./report7.h5
+#+end_src
+
+#+RESULTS:
+#+begin_example
+./report7.h5
+./report7.h5
+('0x7fbb4530663f|697|MOD:296/opt/nvidia/nsight-systems/2023.2.3/target-linux-x64/libcupti.so.12.2|DEP:0', 15147)
+('0x7fbb45308958|693|MOD:296/opt/nvidia/nsight-systems/2023.2.3/target-linux-x64/libcupti.so.12.2|DEP:1', 15147)
+('0x7fbb48480966|698|MOD:231/usr/lib/x86_64-linux-gnu/libcuda.so.545.23.06|DEP:2', 15147)
+('0x7fbb4d5057a8|3059|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:3', 4385)
+('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 4036)
+('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 4036)
+('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 4036)
+('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:7', 4032)
+('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 4032)
+('cudaMemcpyAsync|724|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 3747)
+('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 2731)
+('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 2731)
+('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 2731)
+('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 2731)
+('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 2725)
+('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:10', 2725)
+('cudaLaunchKernel|744|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 2723)
+('0x7fbb6e25d785|3070|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 2273)
+('0x7fbb6deab1d7|3071|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 2273)
+('0x7fbb6deac192|3072|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 2273)
+('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:13', 2273)
+('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:14', 2273)
+('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:15', 2273)
+('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:16', 2273)
+('void dequantize_block<1, 1, &(convert_f16(void const*, int, int, __half2&)), float>(void const*, flotrunc|2841|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 2273)
+('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:17', 2272)
+('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:18', 2272)
+('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:14', 2211) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:15', 2211) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:16', 2211) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:17', 2211) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:18', 2210) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:19', 2210) +('0x7fbb6deaa8b2|3073|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 2112) +('0x7fbb4c77794d|3084|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 2048) +('0x7fbb4c7db69a|3085|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 2048) +('0x7fbb4afd0fc9|3086|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 2048) +('0x7fbb4a4f5b71|3087|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:7', 2048) +('0x7fbb4a62697b|3088|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:8', 2048) +('cublasLtSSSMatmul|2823|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:9', 2048) +('0x7fbb6de4cb15|3089|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 2048) +('0x7fbb6de4ef46|3090|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 2048) +('0x7fbb6df65abd|3091|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 2048) +('cublasSgemm_v2|2827|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:13', 2048) +('0x7fbb4ad4b256|3092|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 2048) +('0x7fbb4afd1133|3093|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 2048) +('0x7fbb4a4f5b71|3087|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 2048) +('0x7fbb4a62697b|3088|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:7', 2048) +('cublasLtSSSMatmul|2823|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:8', 2048) +('0x7fbb6de4cb15|3089|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:9', 2048) +('0x7fbb6de4ef46|3090|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 2048) +('0x7fbb6df65abd|3091|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 2048) +('cublasSgemm_v2|2827|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 2048) +('0x7fbb6de4cb48|3094|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:7', 2048) +('0x7fbb6de4ef46|3090|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:8', 2048) +('0x7fbb6df65abd|3091|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:9', 2048) +('cublasSgemm_v2|2827|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 2048) +('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, 
floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 2048) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:12', 2048) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:13', 2048) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:14', 2048) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:15', 2048) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:16', 2048) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 1542) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 1542) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 1542) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 1539) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 1539) +('cudaMemcpy2DAsync|2915|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 1536) +('ggml_cuda_cpy_tensor_2d(void*, ggml_tensor const*, long, long, long, long, CUstream_st*)|2916|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 1536) +('cudaDeviceSynchronize|2772|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 289) +('void dequantize_block<1, 1, &(convert_f32(void const*, int, int, __half2&)), __half>(void const*, __trunc|3047|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 225) +('0x7fbb4acae2f1|3062|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 225) +('0x7fbb4acb0dda|3063|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:7', 225) +('0x7fbb4a4f3471|3064|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:8', 225) +('0x7fbb4a62af2f|3065|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:9', 225) +('cublasLtHHHMatmul|2759|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:10', 225) +('0x7fbb6de43905|3066|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 225) +('0x7fbb6de45d36|3055|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 225) +('0x7fbb6de2dfbe|3056|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:13', 225) +('0x7fbb6e0c5ef4|3057|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:14', 225) +('0x7fbb6e0c6380|3058|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:15', 225) +('cublasGemmEx|3039|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:16', 225) +('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:17', 225) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:18', 225) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:19', 225) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:20', 225) 
+('0x7fbb6de43938|3074|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:7', 225)
+('0x7fbb6de45d36|3055|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:8', 225)
+('0x7fbb6de2dfbe|3056|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:9', 225)
+('0x7fbb6e0c5ef4|3057|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 225)
+('0x7fbb6e0c6380|3058|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 225)
+('cublasGemmEx|3039|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 225)
+('void dequantize_block<32, 2, &(dequantize_q4_0(void const*, int, int, __half2&)), __half>(void consttrunc|745|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 224)
+('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:21', 224)
+('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:22', 224)
+('0x7fbb6de45d36|3055|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:9', 163)
+('0x7fbb6de2dfbe|3056|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 163)
+('0x7fbb6e0c5ef4|3057|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 163)
+('0x7fbb6e0c6380|3058|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 163)
+('cublasGemmEx|3039|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:13', 163)
+('0x7fbb4d503e43|3078|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:3', 161)
+('0x7fbb4acb13e3|3079|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 161)
+('0x7fbb4a4f3471|3064|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 161)
+('0x7fbb4a62af2f|3065|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 161)
+('cublasLtHHHMatmul|2759|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:7', 161)
+('0x7fbb6de43905|3066|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:8', 161)
+('0x7fbb4d4468ad|3081|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 161)
+('0x7fbb4d4468cd|3082|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 161)
+('0x7fbb6deaa85f|3083|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 161)
+('0x7fbb4d44430d|3060|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 64)
+('0x7fbb4d44432d|3061|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 64)
+('0x7fbb4ad41fd2|3067|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 64)
+('0x7fbb4acb0e84|3068|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 64)
+('0x7fbb4a4f3471|3064|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 64)
+('0x7fbb4a62af2f|3065|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:7', 64)
+('cublasLtHHHMatmul|2759|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:8', 64)
+('0x7fbb6de43905|3066|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:9', 64)
+('0x7fbb6de45d36|3055|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:10', 64) +('0x7fbb6de2dfbe|3056|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:11', 64) +('0x7fbb6e0c5ef4|3057|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:12', 64) +('0x7fbb6e0c6380|3058|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:13', 64) +('cublasGemmEx|3039|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:14', 64) +('ggml_cuda_op_mul_mat_cublas(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, char const*, floattrunc|746|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:15', 64) +('ggml_cuda_op_mul_mat(ggml_tensor const*, ggml_tensor const*, ggml_tensor*, void (*)(ggml_tensor construnc|725|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:16', 64) +('ggml_cuda_compute_forward|726|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:17', 64) +('ggml_graph_compute_thread|637|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:18', 64) +('start_thread|350|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:19', 63) +('__GI___clone3|351|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:20', 63) +('cudaMalloc|703|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 14) +('ggml_cuda_pool_malloc(unsigned long, unsigned long*)|2855|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 14) +('cudaFreeHost|613|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 8) +('ggml_cuda_host_free|614|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 8) +('llama_new_context_with_model|628|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 6) +('llama_init_from_gpt_params(gpt_params&)|523|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 6) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 6) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 6) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 6) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 6) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 6) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 6) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 6) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:12', 6) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:13', 6) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:14', 6) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:15', 6) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:16', 6) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 5) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:12', 5) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:13', 5) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:14', 5) +('cudaMallocHost|3009|MOD:207/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcudart.so.12.3.52|DEP:3', 4) +('ggml_cuda_host_malloc|3010|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 4) 
+('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 4) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 4) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 4) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 4) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 4) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:7', 4) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:8', 4) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 4) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 3) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:9', 3) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:10', 3) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 3) +('0x7fbb6e2421b0|704|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 3) +('0x7fbb6deaa9bd|705|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 3) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 3) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 3) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 3) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:11', 3) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:12', 3) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:13', 3) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:14', 3) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:15', 3) +('llama_free|3928|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 3) +('cublasCreate_v2|442|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 2) +('ggml_init_cublas|443|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 2) +('ggml_init|291|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 2) +('llama_backend_init|292|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 2) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 2) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:11', 2) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:12', 2) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:13', 2) +('llm_load_tensors(llama_model_loader&, llama_model&, int, int, float const*, bool, void (*)(float, votrunc|615|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 2) +('llama_load_model_from_file|521|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 2) +('llama_init_from_gpt_params(gpt_params&)|523|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 2) +('0x7fbb6e23e8db|3049|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 2) +('0x7fbb6deaae8b|3050|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 2) 
+('0x7fbb6deac55b|3051|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 2) +('0x7fbb6de43264|3053|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:7', 2) +('0x7fbb6de43c6c|3054|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:8', 2) +('0x7fbb6e242978|699|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:3', 1) +('cublasCreate_v2|442|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:4', 1) +('ggml_init_cublas|443|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 1) +('ggml_init|291|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 1) +('llama_backend_init|292|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 1) +('0x7fbb4d4e9dc8|700|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:3', 1) +('0x7fbb4a4d9e24|701|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:4', 1) +('0x7fbb4a4da79b|702|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:5', 1) +('cublasLtCtxInit|456|MOD:216/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so.12.3.2.9|DEP:6', 1) +('cublasCreate_v2|442|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:7', 1) +('ggml_init_cublas|443|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 1) +('ggml_init|291|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 1) +('llama_backend_init|292|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:10', 1) +('0x7fbb6deaa20b|706|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1) +('0x7fbb6deaa22e|707|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1) +('cublasCreate_v2|442|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:5', 1) +('ggml_init_cublas|443|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:6', 1) +('ggml_init|291|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:7', 1) +('llama_backend_init|292|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:8', 1) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:9', 1) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:10', 1) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:11', 1) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:12', 1) +('0x7fbb6deaa5dc|3052|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 1) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:18', 1) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:19', 1) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:20', 1) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:21', 1) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:22', 1) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:23', 1) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:24', 1) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:25', 1) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:21', 1) +('ggml_graph_compute_helper(std::vector >&, 
ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:22', 1) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:23', 1) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:24', 1) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:25', 1) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:26', 1) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:27', 1) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:28', 1) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:19', 1) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:20', 1) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:21', 1) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:22', 1) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:23', 1) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:24', 1) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:25', 1) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:26', 1) +('ggml_graph_compute|639|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:17', 1) +('ggml_graph_compute_helper(std::vector >&, ggml_cgraph*,trunc|640|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:18', 1) +('llama_decode_internal(llama_context&, llama_batch)|633|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:19', 1) +('llama_decode|634|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:20', 1) +('main|158|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:21', 1) +('__libc_start_call_main|293|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:22', 1) +('__libc_start_main@@GLIBC_2|294|MOD:170/usr/lib/x86_64-linux-gnu/libc.so.6|DEP:23', 1) +('_start|295|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:24', 1) +('0x7fbb6deaa582|3076|MOD:209/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublas.so.12.3.2.9|DEP:6', 1) +('void dequantize_block_q6_K<__half>(void const*, __half*)|3698|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:4', 1) +('llama_free_model|3899|MOD:193/mnt/data1/2023/11/09/llama.cpp/build/bin/main|DEP:5', 1) +#+end_example + + +nm /mnt/data1/2023/11/09/llama.cpp/build/bin/main >main.nm + + +grep libcuda report7.gron -C10 > cudareport.txt +grep -C1000 libcuda report7.jq > cuda.txt diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 31ec8cade19be1..de24283ab80206 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -659,7 +659,7 @@ int main(int argc, char ** argv) { if (input_echo) { for (auto id : embd) { const std::string token_str = llama_token_to_piece(ctx, id); - printf("%s", token_str.c_str()); + printf("TOKEN:%s\n", token_str.c_str()); if (embd.size() > 1) { input_tokens.push_back(id); @@ -850,6 +850,9 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + // dont dump core + //int *ptr = 0; *ptr = 1; + if (ctx_guidance) { llama_free(ctx_guidance); } llama_free(ctx); llama_free_model(model); diff --git a/ggml-alloc.c b/ggml-alloc.cpp similarity index 100% rename from ggml-alloc.c rename to ggml-alloc.cpp diff --git 
a/ggml-backend.c b/ggml-backend.cpp similarity index 100% rename from ggml-backend.c rename to ggml-backend.cpp diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 50e03de5007472..b6d39cc29f4dbc 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7623,12 +7623,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 #endif // debug helpers - //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); - //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); - //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); - //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); - //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); + // printf("JSON: { \"data\":{ \"src0\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"src1\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"dst\" : { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}}}\n", + // src0->name, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], + // ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name, + // src1->name, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name, + // dst->name, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], ggml_is_contiguous(dst), ggml_is_transposed(dst), ggml_type_name(dst->type), dst->name + // ); if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch @@ -8056,9 +8056,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ if (tensor->op == GGML_OP_MUL_MAT) { if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { -#ifndef NDEBUG + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); -#endif + return false; } } diff --git a/ggml-impl.h b/ggml-impl.h index 06c07339e92699..1bf20a4af39850 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -22,7 +22,7 @@ extern "C" { #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #define static_assert(cond, msg) _Static_assert(cond, msg) #else -#define static_assert(cond, msg) struct global_scope_noop_trick + //#define static_assert(cond, msg) struct global_scope_noop_trick #endif #endif diff --git a/ggml-mpi.c b/ggml-mpi.cpp similarity index 100% rename from ggml-mpi.c rename to ggml-mpi.cpp diff --git a/ggml-quants.c b/ggml-quants.cpp similarity index 93% rename from ggml-quants.c rename to ggml-quants.cpp index 7285d5f7fbcc00..a084f66c9c5860 100644 --- a/ggml-quants.c +++ 
b/ggml-quants.cpp @@ -5,7 +5,7 @@ #include #include #include - +#include #ifdef __ARM_NEON // if YCM cannot find , make a symbolic link to it, for example: @@ -425,7 +425,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif // reference implementation for deterministic creation of model files -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { +void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -462,11 +462,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict } } -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { +void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q4_0_reference(x, y, k); } -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { +void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k) { const int qk = QK4_1; assert(k % qk == 0); @@ -503,11 +503,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict } } -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { +void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q4_1_reference(x, y, k); } -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { +void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -551,11 +551,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict } } -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { +void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q5_0_reference(x, y, k); } -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { +void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k) { const int qk = QK5_1; assert(k % qk == 0); @@ -599,12 +599,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict } } -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { +void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q5_1_reference(x, y, k); } // reference implementation for deterministic creation of model files -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { +void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -629,12 +629,12 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict } } -void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0 * restrict y = vy; + block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -818,7 +818,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { } // reference implementation for deterministic creation of model files -void quantize_row_q8_1_reference(const float * restrict x, 
block_q8_1 * restrict y, int k) { +void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k) { assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -853,11 +853,11 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict } } -void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; - block_q8_1 * restrict y = vy; + block_q8_1 * __restrict__ y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1071,7 +1071,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { #endif } -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -1091,7 +1091,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int } } -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK4_1; assert(k % qk == 0); @@ -1112,7 +1112,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int } } -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -1138,7 +1138,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int } } -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK5_1; assert(k % qk == 0); @@ -1165,7 +1165,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int } } -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k) { static const int qk = QK8_0; assert(k % qk == 0); @@ -1195,7 +1195,7 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { +static float make_qx_quants(int n, int nmax, const float * __restrict__ x, int8_t * __restrict__ L, int rmse_type) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1259,7 +1259,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * __restrict__ x, int8_t * __restrict__ L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1318,7 +1318,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * __restrict__ x, uint8_t * __restrict__ L, float * __restrict__ the_min, int ntry, float alpha) { float min = 
x[0]; float max = x[0]; @@ -1361,8 +1361,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * __restrict__ x, const float * __restrict__ weights, + uint8_t * __restrict__ L, float * __restrict__ the_min, uint8_t * __restrict__ Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -1443,7 +1443,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f } #if QK_K == 256 -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -1455,7 +1455,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { +void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1532,7 +1532,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { +void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1578,15 +1578,15 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int } } -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ vy, int k) { quantize_row_q2_K_reference(x, vy, k); } -size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q2_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + block_q2_K * __restrict__ y = (block_q2_K *)dst + j/QK_K; quantize_row_q2_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q2_K)); @@ -1594,7 +1594,7 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n //========================= 3-bit (de)-quantization -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { +void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1708,7 +1708,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } #if QK_K == 256 -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1722,8 +1722,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const 
uint8_t * restrict hm = x[i].hmask; + const uint8_t * __restrict__ q = x[i].qs; + const uint8_t * __restrict__ hm = x[i].hmask; uint8_t m = 1; memcpy(aux, x[i].scales, 12); @@ -1758,7 +1758,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } } #else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); assert(QK_K == 64); const int nb = k / QK_K; @@ -1767,8 +1767,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * __restrict__ q = x[i].qs; + const uint8_t * __restrict__ hm = x[i].hmask; const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); @@ -1791,15 +1791,15 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } #endif -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ vy, int k) { quantize_row_q3_K_reference(x, vy, k); } -size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q3_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + block_q3_K * __restrict__ y = (block_q3_K *)dst + j/QK_K; quantize_row_q3_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q3_K)); @@ -1807,7 +1807,7 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { +void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1914,7 +1914,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { +void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1953,18 +1953,18 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int } } -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK_K == 0); - block_q4_K * restrict y = vy; + block_q4_K * __restrict__ y = vy; quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q4_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + block_q4_K * __restrict__ y = (block_q4_K *)dst + j/QK_K; quantize_row_q4_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q4_K)); @@ -1972,7 +1972,7 @@ size_t ggml_quantize_q4_K(const float * restrict 
src, void * restrict dst, int n // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { +void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2042,8 +2042,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * __restrict__ qh = y[i].qh; + uint8_t * __restrict__ ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -2090,8 +2090,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * __restrict__ qh = y[i].qh; + uint8_t * __restrict__ ql = y[i].qs; memset(qh, 0, QK_K/8); for (int j = 0; j < 32; ++j) { @@ -2114,7 +2114,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { +void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2143,7 +2143,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } #else float d = GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict s = x[i].scales; + const int8_t * __restrict__ s = x[i].scales; for (int l = 0; l < 8; ++l) { y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); @@ -2159,18 +2159,18 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } } -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK_K == 0); - block_q5_K * restrict y = vy; + block_q5_K * __restrict__ y = vy; quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q5_K(const float * __restrict__ src, void * __restrict__ dst, int n, int k, int64_t * __restrict__ hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + block_q5_K * __restrict__ y = (block_q5_K *)dst + j/QK_K; quantize_row_q5_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q5_K)); @@ -2178,7 +2178,7 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { +void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2228,8 +2228,8 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * __restrict__ ql = y[i].ql; + uint8_t * __restrict__ qh = y[i].qh; #if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -2260,7 +2260,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { +void 
dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2268,9 +2268,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict ql = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict sc = x[i].scales; + const uint8_t * __restrict__ ql = x[i].ql; + const uint8_t * __restrict__ qh = x[i].qh; + const int8_t * __restrict__ sc = x[i].scales; #if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { @@ -2307,9 +2307,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int } } -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ vy, int k) { assert(k % QK_K == 0); - block_q6_K * restrict y = vy; + block_q6_K * __restrict__ y = vy; quantize_row_q6_K_reference(x, y, k); } @@ -2318,7 +2318,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + block_q6_K * __restrict__ y = (block_q6_K *)dst + j/QK_K; quantize_row_q6_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q6_K)); @@ -2326,7 +2326,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * //===================================== Q8_K ============================================== -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { +void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2363,7 +2363,7 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2374,7 +2374,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int } } -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { +void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k) { quantize_row_q8_K_reference(x, y, k); } @@ -2423,14 +2423,15 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { + //fprintf(stderr, "%s: n:%d s:%f vx:%p vy:%p\n", __func__, n,*s, vx, vy); const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q4_0 * __restrict__ x = vx; + const block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2439,10 +2440,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q4_0 * __restrict__ x0 = &x[i + 
0]; + const block_q4_0 * __restrict__ x1 = &x[i + 1]; + const block_q8_0 * __restrict__ y0 = &y[i + 0]; + const block_q8_0 * __restrict__ y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2733,14 +2734,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, #endif } -void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_1_q8_1(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q4_1 * __restrict__ x = vx; + const block_q8_1 * __restrict__ y = vy; // TODO: add WASM SIMD #if defined(__ARM_NEON) @@ -2752,10 +2753,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i + 0]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q4_1 * __restrict__ x0 = &x[i + 0]; + const block_q4_1 * __restrict__ x1 = &x[i + 1]; + const block_q8_1 * __restrict__ y0 = &y[i + 0]; + const block_q8_1 * __restrict__ y1 = &y[i + 1]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; @@ -2893,15 +2894,15 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_0_q8_0(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_0); - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q5_0 * __restrict__ x = vx; + const block_q8_0 * __restrict__ y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2916,10 +2917,10 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q5_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q5_0 * __restrict__ x0 = &x[i]; + const block_q5_0 * __restrict__ x1 = &x[i + 1]; + const block_q8_0 * __restrict__ y0 = &y[i]; + const block_q8_0 * __restrict__ y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3000,8 +3001,8 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q8_0 * restrict y0 = &y[i]; + const block_q5_0 * __restrict__ x0 = &x[i]; + const block_q8_0 * __restrict__ y0 = &y[i]; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3199,15 +3200,15 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_1_q8_1(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) { const int qk = QK8_1; const 
@@ -2733,14 +2734,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
 #endif
 }
 
-void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_1_q8_1(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
     assert(n % qk == 0);
 
-    const block_q4_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
+    const block_q4_1 * __restrict__ x = vx;
+    const block_q8_1 * __restrict__ y = vy;
 
     // TODO: add WASM SIMD
 #if defined(__ARM_NEON)
@@ -2752,10 +2753,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
     assert(nb % 2 == 0); // TODO: handle odd nb
 
     for (int i = 0; i < nb; i += 2) {
-        const block_q4_1 * restrict x0 = &x[i + 0];
-        const block_q4_1 * restrict x1 = &x[i + 1];
-        const block_q8_1 * restrict y0 = &y[i + 0];
-        const block_q8_1 * restrict y1 = &y[i + 1];
+        const block_q4_1 * __restrict__ x0 = &x[i + 0];
+        const block_q4_1 * __restrict__ x1 = &x[i + 1];
+        const block_q8_1 * __restrict__ y0 = &y[i + 0];
+        const block_q8_1 * __restrict__ y1 = &y[i + 1];
 
         summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
@@ -2893,15 +2894,15 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
 #endif
 }
 
-void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_0_q8_0(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
     assert(n % qk == 0);
     assert(qk == QK5_0);
 
-    const block_q5_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
+    const block_q5_0 * __restrict__ x = vx;
+    const block_q8_0 * __restrict__ y = vy;
 
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -2916,10 +2917,10 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
     assert(nb % 2 == 0); // TODO: handle odd nb
 
     for (int i = 0; i < nb; i += 2) {
-        const block_q5_0 * restrict x0 = &x[i];
-        const block_q5_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i];
-        const block_q8_0 * restrict y1 = &y[i + 1];
+        const block_q5_0 * __restrict__ x0 = &x[i];
+        const block_q5_0 * __restrict__ x1 = &x[i + 1];
+        const block_q8_0 * __restrict__ y0 = &y[i];
+        const block_q8_0 * __restrict__ y1 = &y[i + 1];
 
         const uint8x16_t m4b = vdupq_n_u8(0x0F);
@@ -3000,8 +3001,8 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
     // TODO: check if unrolling this is better
     for (int i = 0; i < nb; ++i) {
-        const block_q5_0 * restrict x0 = &x[i];
-        const block_q8_0 * restrict y0 = &y[i];
+        const block_q5_0 * __restrict__ x0 = &x[i];
+        const block_q8_0 * __restrict__ y0 = &y[i];
 
         const v128_t m4b = wasm_i8x16_splat(0x0F);
@@ -3199,15 +3200,15 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
 #endif
 }
 
-void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_1_q8_1(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
     assert(n % qk == 0);
     assert(qk == QK5_1);
 
-    const block_q5_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
+    const block_q5_1 * __restrict__ x = vx;
+    const block_q8_1 * __restrict__ y = vy;
 
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -3225,10 +3226,10 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
     assert(nb % 2 == 0); // TODO: handle odd nb
 
     for (int i = 0; i < nb; i += 2) {
-        const block_q5_1 * restrict x0 = &x[i];
-        const block_q5_1 * restrict x1 = &x[i + 1];
-        const block_q8_1 * restrict y0 = &y[i];
-        const block_q8_1 * restrict y1 = &y[i + 1];
+        const block_q5_1 * __restrict__ x0 = &x[i];
+        const block_q5_1 * __restrict__ x1 = &x[i + 1];
+        const block_q8_1 * __restrict__ y0 = &y[i];
+        const block_q8_1 * __restrict__ y1 = &y[i + 1];
 
         const uint8x16_t m4b = vdupq_n_u8(0x0F);
@@ -3314,8 +3315,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
     // TODO: check if unrolling this is better
     for (int i = 0; i < nb; ++i) {
-        const block_q5_1 * restrict x0 = &x[i];
-        const block_q8_1 * restrict y0 = &y[i];
+        const block_q5_1 * __restrict__ x0 = &x[i];
+        const block_q8_1 * __restrict__ y0 = &y[i];
 
         summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
@@ -3518,14 +3519,14 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
 #endif
 }
 
-void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q8_0_q8_0(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
     assert(n % qk == 0);
 
-    const block_q8_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
+    const block_q8_0 * __restrict__ x = vx;
+    const block_q8_0 * __restrict__ y = vy;
 
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -3534,10 +3535,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
     assert(nb % 2 == 0); // TODO: handle odd nb
 
     for (int i = 0; i < nb; i += 2) {
-        const block_q8_0 * restrict x0 = &x[i + 0];
-        const block_q8_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
+        const block_q8_0 * __restrict__ x0 = &x[i + 0];
+        const block_q8_0 * __restrict__ x1 = &x[i + 1];
+        const block_q8_0 * __restrict__ y0 = &y[i + 0];
+        const block_q8_0 * __restrict__ y1 = &y[i + 1];
 
         const int8x16_t x0_0 = vld1q_s8(x0->qs);
         const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
@@ -3642,10 +3643,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
 }
 
 #if QK_K == 256
-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q2_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
 
-    const block_q2_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q2_K * __restrict__ x = vx;
+    const block_q8_K * __restrict__ y = vy;
 
     const int nb = n / QK_K;
@@ -3667,9 +3668,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        const uint8_t * restrict sc = x[i].scales;
+        const uint8_t * __restrict__ q2 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
+        const uint8_t * __restrict__ sc = x[i].scales;
 
         const uint8x16_t mins_and_scales = vld1q_u8(sc);
         const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
@@ -3746,8 +3747,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q2 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
         const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
@@ -3813,8 +3814,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q2 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         // load mins and scales from block_q2_K.scales[QK_K/16]
         const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
@@ -4035,10 +4036,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q2_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
 
-    const block_q2_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q2_K * __restrict__ x = vx;
+    const block_q8_K * __restrict__ y = vy;
 
     const int nb = n / QK_K;
@@ -4061,9 +4062,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * (float)x[i].d;
         const float dmin = -y[i].d * (float)x[i].dmin;
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+        const uint8_t * __restrict__ q2 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
+        const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales;
 
         aux32[0] = sc[0] & 0x0f0f0f0f;
         aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
@@ -4114,8 +4115,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     __m256 acc = _mm256_setzero_ps();
 
     uint32_t ud, um;
-    const uint8_t * restrict db = (const uint8_t *)&ud;
-    const uint8_t * restrict mb = (const uint8_t *)&um;
+    const uint8_t * __restrict__ db = (const uint8_t *)&ud;
+    const uint8_t * __restrict__ mb = (const uint8_t *)&um;
 
     float summs = 0;
@@ -4126,10 +4127,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q2 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+        const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales;
         ud = (sc[0] >> 0) & 0x0f0f0f0f;
         um = (sc[0] >> 4) & 0x0f0f0f0f;
@@ -4166,8 +4167,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     __m256 acc = _mm256_setzero_ps();
 
     uint32_t ud, um;
-    const uint8_t * restrict db = (const uint8_t *)&ud;
-    const uint8_t * restrict mb = (const uint8_t *)&um;
+    const uint8_t * __restrict__ db = (const uint8_t *)&ud;
+    const uint8_t * __restrict__ mb = (const uint8_t *)&um;
 
     float summs = 0;
@@ -4178,10 +4179,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q2 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+        const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales;
         ud = (sc[0] >> 0) & 0x0f0f0f0f;
         um = (sc[0] >> 4) & 0x0f0f0f0f;
@@ -4227,9 +4228,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * (float)x[i].d;
         const float dmin = -y[i].d * (float)x[i].dmin;
 
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+        const uint8_t * __restrict__ q2 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
+        const uint32_t * __restrict__ sc = (const uint32_t *)x[i].scales;
 
         aux32[0] = sc[0] & 0x0f0f0f0f;
         aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
@@ -4311,14 +4312,14 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #if QK_K == 256
-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q3_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
     assert(n % QK_K == 0);
 
     const uint32_t kmask1 = 0x03030303;
     const uint32_t kmask2 = 0x0f0f0f0f;
 
-    const block_q3_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q3_K * __restrict__ x = vx;
+    const block_q8_K * __restrict__ y = vy;
 
     const int nb = n / QK_K;
@@ -4346,9 +4347,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict qh = x[i].hmask;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q3 = x[i].qs;
+        const uint8_t * __restrict__ qh = x[i].hmask;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
@@ -4454,8 +4455,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q3 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         // Set up scales
         memcpy(aux, x[i].scales, 12);
@@ -4559,8 +4560,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q3 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         // Set up scales
         aux = (const uint32_t *)x[i].scales;
@@ -4694,9 +4695,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict qh = x[i].hmask;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q3 = x[i].qs;
+        const uint8_t * __restrict__ qh = x[i].hmask;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         memcpy(aux, x[i].scales, 12);
         utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
@@ -4806,11 +4807,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q3 = x[i].qs;
+        const uint8_t * __restrict__ hm = x[i].hmask;
+        const int8_t  * __restrict__ q8 = y[i].qs;
         memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
+        int8_t * __restrict__ a = aux8;
         uint8_t m = 1;
         for (int j = 0; j < QK_K; j += 128) {
             for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
@@ -4855,11 +4856,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q3_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
    assert(n % QK_K == 0);
 
-    const block_q3_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q3_K * __restrict__ x = vx;
+    const block_q8_K * __restrict__ y = vy;
 
     const int nb = n / QK_K;
@@ -4947,8 +4948,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q3 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const uint16_t a = *(const uint16_t *)x[i].scales;
         aux16[0] = a & 0x0f0f;
@@ -5018,8 +5019,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q3 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const uint16_t a = *(const uint16_t *)x[i].scales;
         aux16[0] = a & 0x0f0f;
@@ -5098,8 +5099,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     for (int i = 0; i < nb; ++i) {
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q3 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const uint16_t a = *(const uint16_t *)x[i].scales;
         aux16[0] = a & 0x0f0f;
@@ -5173,10 +5174,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        const int8_t  * restrict q8 = y[i].qs;
-        int8_t * restrict a = aux8;
+        const uint8_t * __restrict__ q3 = x[i].qs;
+        const uint8_t * __restrict__ hm = x[i].hmask;
+        const int8_t  * __restrict__ q8 = y[i].qs;
+        int8_t * __restrict__ a = aux8;
         for (int l = 0; l < 8; ++l) {
             a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4);
             a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 0 : 4);
@@ -5213,11 +5214,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #if QK_K == 256
-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
     assert(n % QK_K == 0);
 
-    const block_q4_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q4_K * __restrict__ x = vx;
+    const block_q8_K * __restrict__ y = vy;
 
     const int nb = n / QK_K;
@@ -5262,8 +5263,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * scales = (const uint8_t *)utmp;
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
@@ -5334,8 +5335,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         utmp[2] = uaux;
         utmp[0] &= kmask1;
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
@@ -5393,8 +5394,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         memcpy(utmp, x[i].scales, 12);
         utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -5494,8 +5495,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
         sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         vl = 32;
@@ -5548,10 +5549,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
         memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
+        int8_t * __restrict__ a = aux8;
         for (int j = 0; j < QK_K/64; ++j) {
             for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
             a += 32;
@@ -5594,11 +5595,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 }
 #else
-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
     assert(n % QK_K == 0);
 
-    const block_q4_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q4_K * __restrict__ x = vx;
+    const block_q8_K * __restrict__ y = vy;
 
     const int nb = n / QK_K;
@@ -5618,14 +5619,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
     float sum_mins = 0.f;
 
     uint16_t aux16[2];
-    const uint8_t * restrict scales = (const uint8_t *)aux16;
+    const uint8_t * __restrict__ scales = (const uint8_t *)aux16;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
-        const uint16_t * restrict a = (const uint16_t *)x[i].scales;
+        const uint16_t * __restrict__ a = (const uint16_t *)x[i].scales;
         aux16[0] = a[0] & 0x0f0f;
         aux16[1] = (a[0] >> 4) & 0x0f0f;
@@ -5698,8 +5699,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
         const __m256i q4l = _mm256_and_si256(q4bits, m4);
@@ -5744,8 +5745,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
 
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
         const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
@@ -5778,16 +5779,16 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 #elif defined __riscv_v_intrinsic
 
     uint16_t s16[2];
-    const uint8_t * restrict scales = (const uint8_t *)s16;
+    const uint8_t * __restrict__ scales = (const uint8_t *)s16;
 
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
-        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
+        const uint16_t * __restrict__ b = (const uint16_t *)x[i].scales;
         s16[0] = b[0] & 0x0f0f;
         s16[1] = (b[0] >> 4) & 0x0f0f;
@@ -5827,17 +5828,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
     memset(sums, 0, 8*sizeof(float));
 
     uint16_t s16[2];
-    const uint8_t * restrict scales = (const uint8_t *)s16;
+    const uint8_t * __restrict__ scales = (const uint8_t *)s16;
 
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        uint8_t * restrict a = aux8;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
+        uint8_t * __restrict__ a = aux8;
         for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF;
         for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4;
 
-        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
+        const uint16_t * __restrict__ b = (const uint16_t *)x[i].scales;
         s16[0] = b[0] & 0x0f0f;
         s16[1] = (b[0] >> 4) & 0x0f0f;
@@ -5861,11 +5862,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #if QK_K == 256
-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
     assert(n % QK_K == 0);
 
-    const block_q5_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q5_K * __restrict__ x = vx;
+    const block_q8_K * __restrict__ y = vy;
 
     const int nb = n / QK_K;
@@ -5911,9 +5912,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * scales = (const uint8_t *)utmp;
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q5 = x[i].qs;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
@@ -5976,8 +5977,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
     for (int i = 0; i < nb; ++i) {
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q5 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
 #if QK_K == 256
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
@@ -6065,8 +6066,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q5 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         memcpy(utmp, x[i].scales, 12);
         utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -6163,9 +6164,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         vl = 8;
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q5 = x[i].qs;
+        const uint8_t * __restrict__ hm = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
         const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
@@ -6249,11 +6250,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const uint8_t * __restrict__ hm = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
         memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
+        int8_t * __restrict__ a = aux8;
         uint8_t m = 1;
         for (int j = 0; j < QK_K/64; ++j) {
             for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
@@ -6302,11 +6303,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
     assert(n % QK_K == 0);
 
-    const block_q5_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q5_K * __restrict__ x = vx;
+    const block_q8_K * __restrict__ y = vy;
 
     const int nb = n / QK_K;
@@ -6328,9 +6329,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * (float)x[i].d;
         const int8_t * sc = x[i].scales;
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q5 = x[i].qs;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const uint8x8_t qhbits = vld1_u8(qh);
@@ -6387,8 +6388,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
     for (int i = 0; i < nb; ++i) {
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q5 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
@@ -6433,8 +6434,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
     for (int i = 0; i < nb; ++i) {
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q5 = x[i].qs;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
@@ -6490,9 +6491,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * (float)x[i].d;
         const int8_t * sc = x[i].scales;
 
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q5 = x[i].qs;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
@@ -6560,10 +6561,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-        int8_t * restrict a = aux8;
+        const uint8_t * __restrict__ q4 = x[i].qs;
+        const uint8_t * __restrict__ hm = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
+        int8_t * __restrict__ a = aux8;
         for (int l = 0; l < 32; ++l) {
             a[l+ 0] = q4[l] & 0xF;
             a[l+32] = q4[l] >> 4;
@@ -6574,7 +6575,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         }
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const int8_t * restrict sc = x[i].scales;
+        const int8_t * __restrict__ sc = x[i].scales;
 
         for (int j = 0; j < QK_K/16; ++j) {
             const float dl = d * sc[j];
@@ -6591,11 +6592,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
 #if QK_K == 256
-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q6_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
     assert(n % QK_K == 0);
 
-    const block_q6_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q6_K * __restrict__ x = vx;
+    const block_q8_K * __restrict__ y = vy;
 
     const int nb = n / QK_K;
@@ -6618,11 +6619,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const float d_all = GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q6 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
-        const int8_t * restrict scale = x[i].scales;
+        const int8_t * __restrict__ scale = x[i].scales;
 
         const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
         const int8x16_t scales = vld1q_s8(scale);
@@ -6750,9 +6751,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
@@ -6830,9 +6831,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
@@ -6942,11 +6943,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
 
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q6 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
-        const int8_t * restrict scale = x[i].scales;
+        const int8_t * __restrict__ scale = x[i].scales;
 
         size_t vl;
@@ -7030,11 +7031,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
         memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
+        int8_t * __restrict__ a = aux8;
         for (int j = 0; j < QK_K; j += 128) {
             for (int l = 0; l < 32; ++l) {
                 a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
@@ -7067,11 +7068,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q6_K_q8_K(const int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy) {
    assert(n % QK_K == 0);
 
-    const block_q6_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
+    const block_q6_K * __restrict__ x = vx;
+    const block_q8_K * __restrict__ y = vy;
 
     const int nb = n / QK_K;
@@ -7094,11 +7095,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const float d_all = (float)x[i].d;
 
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q6 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
-        const int8_t * restrict scale = x[i].scales;
+        const int8_t * __restrict__ scale = x[i].scales;
 
         int32_t isum = 0;
@@ -7157,9 +7158,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
         const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
@@ -7214,9 +7215,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
         const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
         const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
@@ -7281,11 +7282,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const float d_all = (float)x[i].d;
 
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q6 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
 
-        const int8_t * restrict scale = x[i].scales;
+        const int8_t * __restrict__ scale = x[i].scales;
 
         int32_t isum = 0;
@@ -7350,11 +7351,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sumf = 0;
 
     for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
+        const uint8_t * __restrict__ q4 = x[i].ql;
+        const uint8_t * __restrict__ qh = x[i].qh;
+        const int8_t  * __restrict__ q8 = y[i].qs;
         memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
+        int8_t * __restrict__ a = aux8;
         for (int l = 0; l < 16; ++l) {
             a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
             a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
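Review note: every ggml-quants.c edit above is one mechanical change. The C99 keyword `restrict` does not exist in C++, so to compile these sources as C++ each occurrence is rewritten as the GCC/Clang extension `__restrict__`. A sketch of the usual alternative, which this patch does not use: a single macro lets the same source build as C99, as C++ under GCC/Clang, and under MSVC (which spells the extension `__restrict`). The `GGML_RESTRICT` name below is illustrative, not something the patch introduces.

```cpp
// Portability shim for the restrict qualifier (sketch; macro name is hypothetical).
#if defined(__cplusplus)
    #if defined(_MSC_VER)
        #define GGML_RESTRICT __restrict
    #else
        #define GGML_RESTRICT __restrict__  // GCC/Clang extension
    #endif
#else
    #define GGML_RESTRICT restrict          // standard C99
#endif

// Declarations then stay identical across C and C++ translation units:
void dequantize_row_example(const float * GGML_RESTRICT x,
                            float       * GGML_RESTRICT y, int k);
```

With a shim like this, the header below would not need the blanket rewrite at all.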
diff --git a/ggml-quants.h b/ggml-quants.h
index 70c12c27465e80..2706e36ada7d3a 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
 
 // Quantization
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
-
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
-
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
-
-void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k);
+void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k);
+void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k);
+void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k);
+void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k);
+void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k);
+
+void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k);
+void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k);
+void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k);
+void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k);
+void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k);
+void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k);
+
+void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k);
+
+void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k);
 
 // Dequantization
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
-
-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k);
+
+void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k);
 
 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
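Review note before the rename that follows: once ggml.c becomes ggml.cpp, every ggml symbol it defines is compiled with C++ name mangling, and any translation unit that still expects C linkage will fail to link. The standard guard, which ggml.h already carries upstream, keeps declarations C-linkage in both worlds; a minimal sketch with one representative entry point:

```cpp
#ifdef __cplusplus
extern "C" {
#endif

// Representative ggml entry point: keeps an unmangled C symbol even when
// the defining translation unit is compiled as C++.
void quantize_row_q8_K(const float * x, void * y, int k);

#ifdef __cplusplus
}
#endif
```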
diff --git a/ggml.c b/ggml.cpp
similarity index 96%
rename from ggml.c
rename to ggml.cpp
index f92292b39c635e..2ccf51fe989b22 100644
--- a/ggml.c
+++ b/ggml.cpp
@@ -1,3 +1,7 @@
+
+// https://github.com/Neargye/magic_enum.git
+#include <magic_enum.hpp>
+
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 
 #define _USE_MATH_DEFINES // For M_PI on MSVC
@@ -86,7 +90,13 @@ static int sched_yield (void) {
 }
 #else
 #include <pthread.h>
+//#include
+#ifdef __cplusplus
+#include <atomic>
+using namespace std;
+#else
 #include <stdatomic.h>
+#endif
 
 typedef void * thread_ret_t;
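Review note on the hunk above: it swaps C11 `<stdatomic.h>` for C++ `<atomic>` behind `__cplusplus`, then relies on `using namespace std;` to make names like `atomic_int` resolve. A narrower sketch of the same shim, pulling in only the names the code needs instead of the whole std namespace (the name list is illustrative):

```cpp
#ifdef __cplusplus
    #include <atomic>
    using std::atomic_int;    // <atomic> provides these aliases directly
    using std::atomic_bool;
    using std::atomic_flag;
#else
    #include <stdatomic.h>
#endif
```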
@@ -409,195 +419,195 @@ int64_t ggml_cycles_per_ms(void) {
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
+static void ggml_vec_dot_f32(const int n, float * __restrict__ s, const float * __restrict__ x, const float * __restrict__ y);
+static void ggml_vec_dot_f16(const int n, float * __restrict__ s, ggml_fp16_t * __restrict__ x, ggml_fp16_t * __restrict__ y);
 
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_I8] = {
-        .type_name = "i8",
-        .blck_size = 1,
-        .type_size = sizeof(int8_t),
-        .is_quantized = false,
-    },
-    [GGML_TYPE_I16] = {
-        .type_name = "i16",
-        .blck_size = 1,
-        .type_size = sizeof(int16_t),
-        .is_quantized = false,
-    },
-    [GGML_TYPE_I32] = {
-        .type_name = "i32",
-        .blck_size = 1,
-        .type_size = sizeof(int32_t),
-        .is_quantized = false,
-    },
-    [GGML_TYPE_F32] = {
-        .type_name = "f32",
-        .blck_size = 1,
-        .type_size = sizeof(float),
-        .is_quantized = false,
-        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
-        .vec_dot_type = GGML_TYPE_F32,
-    },
-    [GGML_TYPE_F16] = {
-        .type_name = "f16",
-        .blck_size = 1,
-        .type_size = sizeof(ggml_fp16_t),
-        .is_quantized = false,
-        .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
-        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
-        .vec_dot_type = GGML_TYPE_F16,
-    },
-    [GGML_TYPE_Q4_0] = {
-        .type_name = "q4_0",
-        .blck_size = QK4_0,
-        .type_size = sizeof(block_q4_0),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q4_0,
-        .from_float = quantize_row_q4_0,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
-        .vec_dot = ggml_vec_dot_q4_0_q8_0,
-        .vec_dot_type = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q4_1] = {
-        .type_name = "q4_1",
-        .blck_size = QK4_1,
-        .type_size = sizeof(block_q4_1),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q4_1,
-        .from_float = quantize_row_q4_1,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
-        .vec_dot = ggml_vec_dot_q4_1_q8_1,
-        .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [4] = { // GGML_TYPE_Q4_2
-        .type_name = "DEPRECATED",
-        .blck_size = 0,
-        .type_size = 0,
-        .is_quantized = false,
-        .to_float = NULL,
-        .from_float = NULL,
-        .from_float_reference = NULL,
-        .vec_dot = NULL,
-        .vec_dot_type = GGML_TYPE_COUNT,
-    },
-    [5] = { // GGML_TYPE_Q4_3
-        .type_name = "DEPRECATED",
-        .blck_size = 0,
-        .type_size = 0,
-        .is_quantized = false,
-        .to_float = NULL,
-        .from_float = NULL,
-        .from_float_reference = NULL,
-        .vec_dot = NULL,
-        .vec_dot_type = GGML_TYPE_COUNT,
-    },
-    [GGML_TYPE_Q5_0] = {
-        .type_name = "q5_0",
-        .blck_size = QK5_0,
-        .type_size = sizeof(block_q5_0),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q5_0,
-        .from_float = quantize_row_q5_0,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
-        .vec_dot = ggml_vec_dot_q5_0_q8_0,
-        .vec_dot_type = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q5_1] = {
-        .type_name = "q5_1",
-        .blck_size = QK5_1,
-        .type_size = sizeof(block_q5_1),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q5_1,
-        .from_float = quantize_row_q5_1,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
-        .vec_dot = ggml_vec_dot_q5_1_q8_1,
-        .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q8_0] = {
-        .type_name = "q8_0",
-        .blck_size = QK8_0,
-        .type_size = sizeof(block_q8_0),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q8_0,
-        .from_float = quantize_row_q8_0,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
-        .vec_dot = ggml_vec_dot_q8_0_q8_0,
-        .vec_dot_type = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q8_1] = {
-        .type_name = "q8_1",
-        .blck_size = QK8_1,
-        .type_size = sizeof(block_q8_1),
-        .is_quantized = true,
-        .from_float = quantize_row_q8_1,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
-        .vec_dot_type = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q2_K] = {
-        .type_name = "q2_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q2_K),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q2_K,
-        .from_float = quantize_row_q2_K,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
-        .vec_dot = ggml_vec_dot_q2_K_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q3_K] = {
-        .type_name = "q3_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q3_K),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q3_K,
-        .from_float = quantize_row_q3_K,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
-        .vec_dot = ggml_vec_dot_q3_K_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q4_K] = {
-        .type_name = "q4_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q4_K),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q4_K,
-        .from_float = quantize_row_q4_K,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
-        .vec_dot = ggml_vec_dot_q4_K_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q5_K] = {
-        .type_name = "q5_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q5_K),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q5_K,
-        .from_float = quantize_row_q5_K,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
-        .vec_dot = ggml_vec_dot_q5_K_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q6_K] = {
-        .type_name = "q6_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q6_K),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_q6_K,
-        .from_float = quantize_row_q6_K,
-        .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
-        .vec_dot = ggml_vec_dot_q6_K_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q8_K] = {
-        .type_name = "q8_K",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_q8_K),
-        .is_quantized = true,
-        .from_float = quantize_row_q8_K,
-    }
+    // [GGML_TYPE_I8] = {
+    //     .type_name = "i8",
+    //     .blck_size = 1,
+    //     .type_size = sizeof(int8_t),
+    //     .is_quantized = false,
+    // },
+    // [GGML_TYPE_I16] = {
+    //     .type_name = "i16",
+    //     .blck_size = 1,
+    //     .type_size = sizeof(int16_t),
+    //     .is_quantized = false,
+    // },
+    // [GGML_TYPE_I32] = {
+    //     .type_name = "i32",
+    //     .blck_size = 1,
+    //     .type_size = sizeof(int32_t),
+    //     .is_quantized = false,
+    // },
+    // [GGML_TYPE_F32] = {
+    //     .type_name = "f32",
+    //     .blck_size = 1,
+    //     .type_size = sizeof(float),
+    //     .is_quantized = false,
+    //     .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
+    //     .vec_dot_type = GGML_TYPE_F32,
+    // },
+    // [GGML_TYPE_F16] = {
+    //     .type_name = "f16",
+    //     .blck_size = 1,
+    //     .type_size = sizeof(ggml_fp16_t),
+    //     .is_quantized = false,
+    //     .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
+    //     .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+    //     .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+    //     .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
+    //     .vec_dot_type = GGML_TYPE_F16,
+    // },
+    // [GGML_TYPE_Q4_0] = {
+    //     .type_name = "q4_0",
+    //     .blck_size = QK4_0,
+    //     .type_size = sizeof(block_q4_0),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q4_0,
+    //     .from_float = quantize_row_q4_0,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
+    //     .vec_dot = ggml_vec_dot_q4_0_q8_0,
+    //     .vec_dot_type = GGML_TYPE_Q8_0,
+    // },
+    // [GGML_TYPE_Q4_1] = {
+    //     .type_name = "q4_1",
+    //     .blck_size = QK4_1,
+    //     .type_size = sizeof(block_q4_1),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q4_1,
+    //     .from_float = quantize_row_q4_1,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
+    //     .vec_dot = ggml_vec_dot_q4_1_q8_1,
+    //     .vec_dot_type = GGML_TYPE_Q8_1,
+    // },
+    // [4] = { // GGML_TYPE_Q4_2
+    //     .type_name = "DEPRECATED",
+    //     .blck_size = 0,
+    //     .type_size = 0,
+    //     .is_quantized = false,
+    //     .to_float = NULL,
+    //     .from_float = NULL,
+    //     .from_float_reference = NULL,
+    //     .vec_dot = NULL,
+    //     .vec_dot_type = GGML_TYPE_COUNT,
+    // },
+    // [5] = { // GGML_TYPE_Q4_3
+    //     .type_name = "DEPRECATED",
+    //     .blck_size = 0,
+    //     .type_size = 0,
+    //     .is_quantized = false,
+    //     .to_float = NULL,
+    //     .from_float = NULL,
+    //     .from_float_reference = NULL,
+    //     .vec_dot = NULL,
+    //     .vec_dot_type = GGML_TYPE_COUNT,
+    // },
+    // [GGML_TYPE_Q5_0] = {
+    //     .type_name = "q5_0",
+    //     .blck_size = QK5_0,
+    //     .type_size = sizeof(block_q5_0),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q5_0,
+    //     .from_float = quantize_row_q5_0,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
+    //     .vec_dot = ggml_vec_dot_q5_0_q8_0,
+    //     .vec_dot_type = GGML_TYPE_Q8_0,
+    // },
+    // [GGML_TYPE_Q5_1] = {
+    //     .type_name = "q5_1",
+    //     .blck_size = QK5_1,
+    //     .type_size = sizeof(block_q5_1),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q5_1,
+    //     .from_float = quantize_row_q5_1,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
+    //     .vec_dot = ggml_vec_dot_q5_1_q8_1,
+    //     .vec_dot_type = GGML_TYPE_Q8_1,
+    // },
+    // [GGML_TYPE_Q8_0] = {
+    //     .type_name = "q8_0",
+    //     .blck_size = QK8_0,
+    //     .type_size = sizeof(block_q8_0),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q8_0,
+    //     .from_float = quantize_row_q8_0,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
+    //     .vec_dot = ggml_vec_dot_q8_0_q8_0,
+    //     .vec_dot_type = GGML_TYPE_Q8_0,
+    // },
+    // [GGML_TYPE_Q8_1] = {
+    //     .type_name = "q8_1",
+    //     .blck_size = QK8_1,
+    //     .type_size = sizeof(block_q8_1),
+    //     .is_quantized = true,
+    //     .from_float = quantize_row_q8_1,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
+    //     .vec_dot_type = GGML_TYPE_Q8_1,
+    // },
+    // [GGML_TYPE_Q2_K] = {
+    //     .type_name = "q2_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q2_K),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q2_K,
+    //     .from_float = quantize_row_q2_K,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
+    //     .vec_dot = ggml_vec_dot_q2_K_q8_K,
+    //     .vec_dot_type = GGML_TYPE_Q8_K,
+    // },
+    // [GGML_TYPE_Q3_K] = {
+    //     .type_name = "q3_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q3_K),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q3_K,
+    //     .from_float = quantize_row_q3_K,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
+    //     .vec_dot = ggml_vec_dot_q3_K_q8_K,
+    //     .vec_dot_type = GGML_TYPE_Q8_K,
+    // },
+    // [GGML_TYPE_Q4_K] = {
+    //     .type_name = "q4_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q4_K),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q4_K,
+    //     .from_float = quantize_row_q4_K,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
+    //     .vec_dot = ggml_vec_dot_q4_K_q8_K,
+    //     .vec_dot_type = GGML_TYPE_Q8_K,
+    // },
+    // [GGML_TYPE_Q5_K] = {
+    //     .type_name = "q5_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q5_K),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q5_K,
+    //     .from_float = quantize_row_q5_K,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
+    //     .vec_dot = ggml_vec_dot_q5_K_q8_K,
+    //     .vec_dot_type = GGML_TYPE_Q8_K,
+    // },
+    // [GGML_TYPE_Q6_K] = {
+    //     .type_name = "q6_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q6_K),
+    //     .is_quantized = true,
+    //     .to_float = (ggml_to_float_t) dequantize_row_q6_K,
+    //     .from_float = quantize_row_q6_K,
+    //     .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
+    //     .vec_dot = ggml_vec_dot_q6_K_q8_K,
+    //     .vec_dot_type = GGML_TYPE_Q8_K,
+    // },
+    // [GGML_TYPE_Q8_K] = {
+    //     .type_name = "q8_K",
+    //     .blck_size = QK_K,
+    //     .type_size = sizeof(block_q8_K),
+    //     .is_quantized = true,
+    //     .from_float = quantize_row_q8_K,
+    // }
 };
 
 // For internal test use
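Review note: the entire type_traits table is commented out above because C99 designated array initializers, `[GGML_TYPE_I8] = { ... }`, are not valid C++ in any standard (C++20 permits designated *members*, but not array indices or out-of-order fields). As written this leaves the table zero-initialized, which will break every lookup through it. One C++-compatible sketch is to populate the table once at startup; the names and fields below are trimmed stand-ins for the real ggml_type_traits_t, not the patch's code:

```cpp
#include <cstddef>
#include <cstdint>

struct traits_entry {
    const char * type_name;
    int          blck_size;
    size_t       type_size;
    bool         is_quantized;
};

enum demo_type { DEMO_TYPE_I8, DEMO_TYPE_I16, DEMO_TYPE_I32, DEMO_TYPE_COUNT };

static traits_entry type_traits_demo[DEMO_TYPE_COUNT];

// Fill the table imperatively instead of with C99 designated initializers.
static bool init_type_traits() {
    type_traits_demo[DEMO_TYPE_I8]  = { "i8",  1, sizeof(int8_t),  false };
    type_traits_demo[DEMO_TYPE_I16] = { "i16", 1, sizeof(int16_t), false };
    type_traits_demo[DEMO_TYPE_I32] = { "i32", 1, sizeof(int32_t), false };
    return true;
}

// Runs during static initialization, before main(), so lookups stay valid.
static const bool type_traits_ready = init_type_traits();
```

The same pattern scales to the full table while leaving all lookup code untouched.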
@@ -1160,7 +1170,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
+static void ggml_vec_dot_f32(const int n, float * __restrict__ s, const float * __restrict__ x, const float * __restrict__ y) {
 #ifdef GGML_SIMD
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
@@ -1197,7 +1207,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
     *s = sumf;
 }
 
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
+static void ggml_vec_dot_f16(const int n, float * __restrict__ s, ggml_fp16_t * __restrict__ x, ggml_fp16_t * __restrict__ y) {
     ggml_float sumf = 0.0;
 
 #if defined(GGML_SIMD)
@@ -1235,10 +1245,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest
 
 // compute GGML_VEC_DOT_UNROLL dot products at once
 // xs - x row stride in bytes
-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
+inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * __restrict__ s, void * __restrict__ xv, ggml_fp16_t * __restrict__ y) {
     ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
 
-    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
+    ggml_fp16_t * __restrict__ x[GGML_VEC_DOT_UNROLL];
 
     for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
         x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
@@ -1288,7 +1298,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
     }
 }
 
-inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
+inline static void ggml_vec_mad_f32(const int n, float * __restrict__ y, const float * __restrict__ x, const float v) {
 #if defined(GGML_SIMD)
     const int np = (n & ~(GGML_F32_STEP - 1));
@@ -1320,10 +1330,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 }
 
 // xs and vs are byte strides of x and v
-inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
+inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * __restrict__ y, const float * __restrict__ xv, const float * __restrict__ vv) {
 
-    const float * restrict x[GGML_VEC_MAD_UNROLL];
-    const float * restrict v[GGML_VEC_MAD_UNROLL];
+    const float * __restrict__ x[GGML_VEC_MAD_UNROLL];
+    const float * __restrict__ v[GGML_VEC_MAD_UNROLL];
 
     for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
         x[i] = (const float *) ((const char *) xv + i*xs);
@@ -2176,17 +2186,18 @@ static inline int ggml_up(int n, int m) {
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_context * ggml_init(struct ggml_init_params params) {
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    static bool is_first_call = true;
-
-    if (is_first_call) {
-        // initialize time system (required on Windows)
-        ggml_time_init();
-
-        // initialize GELU, Quick GELU, SILU and EXP F32 tables
-        {
+    struct ggml_context * ctx = NULL;
+    static bool is_first_call = true;
+    // make this function thread safe
+    ggml_critical_section_start();
+
+
+    if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
+        // initialize GELU, Quick GELU, SILU and EXP F32 tables
+        {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
             ggml_fp16_t ii;
@@ -2209,13 +2220,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
-            g_state = (struct ggml_state) {
-                /*.contexts =*/ { { 0 } },
-                /*.numa =*/ {
-                    .n_nodes = 0,
-                    .total_cpus = 0,
-                },
-            };
+            // TODOFIXME
+            // g_state = (struct ggml_state) {
+            //     /*.contexts =*/ { { 0 } },
+            //     /*.numa =*/ {
+            //         .n_nodes = 0,
+            //         .total_cpus = 0,
+            //     },
+            //};
 
             for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
                 g_state.contexts[i].used = false;
@@ -2238,7 +2250,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     }
 
     // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
+
 
     for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
         if (!g_state.contexts[i].used) {
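Review note on the ggml_init reshuffle above: hoisting `struct ggml_context * ctx = NULL;` to the top of the function is the kind of change C++ forces when C code migrates. C++ rejects any jump, `goto` or otherwise, whose target would skip over a variable's initialization; C merely leaves the variable indeterminate. A minimal self-contained illustration, assuming nothing about ggml itself:

```cpp
#include <cstdio>

int acquire(bool fail_fast) {
    void * ctx = nullptr;     // hoisted: initialization happens before any jump
    if (fail_fast) goto done;
    // Declaring `void * ctx = nullptr;` *here* instead would make the goto
    // above ill-formed in C++ ("jump bypasses variable initialization").
    ctx = &fail_fast;         // stand-in for real context acquisition
done:
    printf("ctx=%p\n", ctx);
    return ctx ? 0 : 1;
}
```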
(struct ggml_object *)(mem_buffer + cur_end); if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { @@ -2412,12 +2424,13 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml return NULL; } - *obj_new = (struct ggml_object) { - .offs = cur_end + GGML_OBJECT_SIZE, - .size = size_needed, - .next = NULL, - .type = type, - }; + // FIXME + // *obj_new = (struct ggml_object) { + // .offs = cur_end + GGML_OBJECT_SIZE, + // .size = size_needed, + // .next = NULL, + // .type = type, + // }; ggml_assert_aligned(mem_buffer + obj_new->offs); @@ -2794,6 +2807,42 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { return 0.0f; } +void ggml_tensor_checksum(const struct ggml_tensor * tensor) { + const int64_t ne = ggml_nelements(tensor) ; + float fmin=0; + float ffirst=0; + float fmax=0; + float fsum=0; + + for (int64_t j = 0; j < ne; ++j) { + float f = ggml_get_f32_1d(tensor, j); + if (j ==0) { + ffirst = f; + fmin = f; + fmax = f; + } + fsum += f; + if (f < fmin){ + fmin = f; + } + if (f >fmax){ + fmax = f; + } + } + + auto type_name = magic_enum::enum_name(tensor->type); +// color_name + fprintf(stderr, "JSON: { \"cnt\":%ld, \"first\":%f,\"max\":%f,\"min\":%f,\"sum\":%f, \"name\":\"%s\", \"type\":\"%s\"}\n", + ne, + ffirst, + fmax, + fmin, + fsum, + tensor->name, + type_name + ); +} + void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { if (!ggml_is_contiguous(tensor)) { int64_t id[4] = { 0, 0, 0, 0 }; @@ -2911,17 +2960,30 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } - case GGML_TYPE_F32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); - return ((float *)(tensor->data))[i]; - } - default: - { - GGML_ASSERT(false); - } - } + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q8_K: + case GGML_TYPE_F32: + { + //GGML_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } + + default: + { + GGML_ASSERT(false); + } + } return 0.0f; } @@ -6365,7 +6427,7 @@ static void ggml_compute_forward_dup_f16( GGML_ASSERT(false); // TODO: implement } } else { - //printf("%s: this is not optimal - fix me\n", __func__); + printf("%s: this is not optimal - fix me\n", __func__); if (dst->type == GGML_TYPE_F32) { size_t id = 0; @@ -6612,7 +6674,7 @@ static void ggml_compute_forward_dup_f32( GGML_ASSERT(false); // TODO: implement } } else { - //printf("%s: this is not optimal - fix me\n", __func__); + printf("%s: this is not optimal - fix me\n", __func__); if (dst->type == GGML_TYPE_F32) { size_t id = 0; @@ -9390,6 +9452,7 @@ static void ggml_compute_forward_mul_mat( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -9427,7 +9490,8 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - + fprintf(stderr, "%s: params_type:%d src0:%p ->data %p src1:%p ->data %p\n", __func__, params->type, (void*)src0, src0->data, (void*)src1, src1->data); + #if defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(src0, src1, dst)) { if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { @@ -9484,7 +9548,7 @@ static void 
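The new `ggml_tensor_checksum` above walks every element and emits one JSON line with count/first/min/max/sum plus the tensor name and its type name (obtained via `magic_enum::enum_name`). Below is a standalone sketch of the same reduction with illustrative names; the `double` accumulator is my assumption, the patch accumulates in `float`:

```cpp
#include <cstdint>
#include <cstdio>

struct tensor_stats { int64_t cnt; float first, fmin, fmax; double sum; };

// Reduce a float buffer to the same five numbers ggml_tensor_checksum prints.
static tensor_stats reduce_f32(const float * data, int64_t n) {
    tensor_stats st = { n, 0.0f, 0.0f, 0.0f, 0.0 };
    for (int64_t i = 0; i < n; ++i) {
        const float f = data[i];
        if (i == 0) { st.first = st.fmin = st.fmax = f; }
        if (f < st.fmin) { st.fmin = f; }
        if (f > st.fmax) { st.fmax = f; }
        st.sum += f; // double accumulator: less rounding drift than float
    }
    return st;
}

int main() {
    const float v[4] = { 1.0f, -2.5f, 3.25f, 0.0f };
    const tensor_stats st = reduce_f32(v, 4);
    printf("JSON: { \"cnt\":%lld, \"first\":%f, \"max\":%f, \"min\":%f, \"sum\":%f }\n",
           (long long) st.cnt, st.first, st.fmax, st.fmin, st.sum);
    return 0;
}
```

Printed before and after an op on both the CPU and CUDA paths, these five numbers make it cheap to spot where the two backends start to diverge. Note that the `ggml_get_f32_1d` change above also routes all quantized types through the raw `float *` read with the size assert commented out, so the checksum of a quantized tensor reinterprets block data as floats rather than dequantizing it.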
ggml_compute_forward_mul_mat( } } - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; } @@ -9518,7 +9582,7 @@ static void ggml_compute_forward_mul_mat( const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = ne11*ne12*ne13; // src1 rows - //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); // distribute the thread work across the inner or outer loop based on which one is larger @@ -9537,7 +9601,7 @@ static void ggml_compute_forward_mul_mat( const int64_t ir110 = dr1*ith1; const int64_t ir111 = MIN(ir110 + dr1, nr1); - //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); // threads with no work simply yield (not sure if it helps) if (ir010 >= ir011 || ir110 >= ir111) { @@ -13696,6 +13760,105 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// +/* const char * ggml_op_name_table [] = { */ +/* "GGML_OP_NONE", */ +/* "GGML_OP_DUP", */ +/* "GGML_OP_ADD", */ +/* "GGML_OP_ADD1", */ +/* "GGML_OP_ACC", */ +/* "GGML_OP_SUB", */ +/* "GGML_OP_MUL", */ +/* "GGML_OP_DIV", */ +/* "GGML_OP_SQR", */ +/* "GGML_OP_SQRT", */ +/* "GGML_OP_LOG", */ +/* "GGML_OP_SUM", */ +/* "GGML_OP_SUM_ROWS", */ +/* "GGML_OP_MEAN", */ +/* "GGML_OP_ARGMAX", */ +/* "GGML_OP_REPEAT", */ +/* "GGML_OP_REPEAT_BACK", */ +/* "GGML_OP_CONCAT", */ +/* "GGML_OP_SILU_BACK", */ +/* "GGML_OP_NORM", */ +/* "GGML_OP_RMS_NORM", */ +/* "GGML_OP_RMS_NORM_BACK", */ +/* "GGML_OP_GROUP_NORM", */ +/* "GGML_OP_MUL_MAT", */ +/* "GGML_OP_OUT_PROD", */ +/* "GGML_OP_SCALE", */ +/* "GGML_OP_SET", */ +/* "GGML_OP_CPY", */ +/* "GGML_OP_CONT", */ +/* "GGML_OP_RESHAPE", */ +/* "GGML_OP_VIEW", */ +/* "GGML_OP_PERMUTE", */ +/* "GGML_OP_TRANSPOSE", */ +/* "GGML_OP_GET_ROWS", */ +/* "GGML_OP_GET_ROWS_BACK", */ +/* "GGML_OP_DIAG", */ +/* "GGML_OP_DIAG_MASK_INF", */ +/* "GGML_OP_DIAG_MASK_ZERO", */ +/* "GGML_OP_SOFT_MAX", */ +/* "GGML_OP_SOFT_MAX_BACK", */ +/* "GGML_OP_ROPE", */ +/* "GGML_OP_ROPE_BACK", */ +/* "GGML_OP_ALIBI", */ +/* "GGML_OP_CLAMP", */ +/* "GGML_OP_CONV_TRANSPOSE_1D", */ +/* "GGML_OP_IM2COL", */ +/* "GGML_OP_CONV_TRANSPOSE_2D", */ +/* "GGML_OP_POOL_1D", */ +/* "GGML_OP_POOL_2D", */ +/* "GGML_OP_UPSCALE", */ +/* "GGML_OP_FLASH_ATTN", */ +/* "GGML_OP_FLASH_FF", */ +/* "GGML_OP_FLASH_ATTN_BACK", */ +/* "GGML_OP_WIN_PART", */ +/* "GGML_OP_WIN_UNPART", */ +/* "GGML_OP_GET_REL_POS", */ +/* "GGML_OP_ADD_REL_POS", */ +/* "GGML_OP_UNARY", */ +/* "GGML_OP_MAP_UNARY", */ +/* "GGML_OP_MAP_BINARY", */ +/* "GGML_OP_MAP_CUSTOM1_F32", */ +/* "GGML_OP_MAP_CUSTOM2_F32", */ +/* "GGML_OP_MAP_CUSTOM3_F32", */ +/* "GGML_OP_MAP_CUSTOM1", */ +/* "GGML_OP_MAP_CUSTOM2", */ +/* "GGML_OP_MAP_CUSTOM3", */ +/* "GGML_OP_CROSS_ENTROPY_LOSS", */ +/* "GGML_OP_CROSS_ENTROPY_LOSS_BACK", */ +/* "GGML_OP_COUNT", */ +/* }; */ + + /* enum ggml_unary_op { */ + /* GGML_UNARY_OP_ABS, */ + /* GGML_UNARY_OP_SGN, */ + /* GGML_UNARY_OP_NEG, */ + /* GGML_UNARY_OP_STEP, */ + /* GGML_UNARY_OP_TANH, */ + /* GGML_UNARY_OP_ELU, */ + /* GGML_UNARY_OP_RELU, */ + /* GGML_UNARY_OP_GELU, */ + /* GGML_UNARY_OP_GELU_QUICK, */ + /* GGML_UNARY_OP_SILU, */ + /* GGML_UNARY_OP_LEAKY */ + /* }; */ + + /* enum ggml_object_type { */ + /* GGML_OBJECT_TENSOR, */ + /* GGML_OBJECT_GRAPH, */ + /* GGML_OBJECT_WORK_BUFFER 
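The large `ggml_op_name_table` above is kept only as a comment; the patch instead derives names with magic_enum (https://github.com/Neargye/magic_enum), the header-only C++17 library already used in `ggml_tensor_checksum`, which reflects the enumerator name at compile time. A minimal sketch with a stand-in enum:

```cpp
#include <cstdio>
#include <magic_enum.hpp>

enum class demo_op { NONE, DUP, ADD, MUL_MAT }; // stand-in for ggml_op

int main() {
    const auto name = magic_enum::enum_name(demo_op::MUL_MAT); // std::string_view
    printf("op = %.*s\n", (int) name.size(), name.data());     // prints "MUL_MAT"
    return 0;
}
```

This removes the maintenance hazard of a hand-written string table silently drifting out of sync with the enum.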
 */
+ /* }; */
+
+ /* enum ggml_log_level { */
+ /*     GGML_LOG_LEVEL_ERROR = 2, */
+ /*     GGML_LOG_LEVEL_WARN = 3, */
+ /*     GGML_LOG_LEVEL_INFO = 4 */
+ /* }; */
+
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
@@ -13703,10 +13866,100 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }

+    float fmin1=0;
+    float ffirst1=0;
+    float fmax1=0;
+    float fsum1=0;
+
+    float fmin0=0;
+    float ffirst0=0;
+    float fmax0=0;
+    float fsum0=0;
+
+    float fmin2=0;
+    float ffirst2=0;
+    float fmax2=0;
+    float fsum2=0;
+
+    int64_t elem_src  = ggml_nelements(tensor->src[0]);
+    int64_t elem_src1 = 0; //ggml_nelements(tensor->src[1]);
+
+    if (tensor->src[0]) {
+        const size_t size = ggml_nbytes(tensor->src[0])/sizeof(float);
+        for (int i = 0; i < size; i++) {
+            float f = *(((float*)(tensor->src[0]->data))+i);
+        }
+    }
+
+    if (tensor->src[1]) {
+        elem_src1 = ggml_nelements(tensor->src[1]);
+        const size_t size = ggml_nbytes(tensor->src[1])/sizeof(float);
+        for (int i = 0; i < size; i++) {
+            float f = *(((float*)(tensor->src[1]->data))+i);
+            if (i ==0) {
+                ffirst1 = f;
+                fmin1 = f;
+                fmax1 = f;
+            }
+            fsum1 += f;
+            if (f < fmin1){
+                fmin1 = f;
+            }
+            if (f >fmax1){
+                fmax1 = f;
+            }
+        }
+    }
+
 #ifdef GGML_USE_CUBLAS
     bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
     if (skip_cpu) {
-        return;
+
+        if (tensor->src[1]) {
+            ggml_tensor_checksum(tensor->src[0]);
+            ggml_tensor_checksum(tensor->src[1]);
+            ggml_tensor_checksum(tensor);
+
+            /* fprintf(stderr, "JSON:{\"bop\":\"%s\",\"src\":\"%s\",\"src2\":\"%s\",\"cnt1\":%ld,\"first1\":%f,\"max1\":%f,\"min1\":%f,\"sum1\":%f,\"cnt2\":%ld,\"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f,\"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f,\"dst\":\"%s\"}\n", */
+            /* ggml_op_name_table[tensor->op], */
+            /* tensor->src[0]->name, */
+            /* tensor->src[1]->name, */
+            /* elem_src, */
+            /* ffirst0, */
+            /* fmax0, */
+            /* fmin0, */
+            /* fsum0, */
+
+            /* elem_src1, */
+            /* ffirst1, */
+            /* fmax1, */
+            /* fmin1, */
+            /* fsum1, */
+
+            /* ffirst2, */
+            /* fmax2, */
+            /* fmin2, */
+            /* fsum2, */
+
+            /* tensor->name); */
+        } else {
+            ggml_tensor_checksum(tensor->src[0]);
+            ggml_tensor_checksum(tensor);
+            /* fprintf(stderr, "JSON: { \"uop\":%d, \"src\":\"%s\", \"cnt1\":%ld, \"first1\":%f,\"max1\":%f,\"min1\":%f,\"sum1\":%f, \"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f, \"dst\":\"%s\"}\n", */
+            /* tensor->op, */
+            /* tensor->src[0]->name, */
+            /* elem_src, */
+            /* ffirst0, */
+            /* fmax0, */
+            /* fmin0, */
+            /* fsum0, */
+            /* ffirst2, */
+            /* fmax2, */
+            /* fmin2, */
+            /* fsum2, */
+            /* tensor->name); */
+        }
+        return;
     }
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
@@ -14016,6 +14269,82 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             GGML_ASSERT(false);
         } break;
     }
+
+    // now report
+    int64_t elem_dst = ggml_nelements(tensor);
+
+    const size_t size = ggml_nbytes(tensor)/sizeof(float);
+
+    for (int i = 0; i < size; i++) {
+        float f = *(((float*)(tensor->data))+i);
+        if (i ==0) {
+            ffirst2 = f;
+            fmin2 = f;
+            fmax2 = f;
+        }
+        fsum2 += f;
+        if (f < fmin2){
+            fmin2 = f;
+        }
+        if (f >fmax2){
+            fmax2 = f;
+        }
+    }
+
+    if (tensor->src[1]) {
+        ggml_tensor_checksum(tensor->src[0]);
+        ggml_tensor_checksum(tensor->src[1]);
+        ggml_tensor_checksum(tensor);
+
+        /* fprintf(stderr, 
"JSON:{\"bop\":\"%s\",\"src\":\"%s\",\"src2\":\"%s\",\"cnt1\":%ld,\"first1\":%f,\"max1\":%f,\"min1\":%f,\"sum1\":%f,\"cnt2\":%ld,\"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f,\"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f,\"cnt2\":%ld,\"dst\":\"%s\"}\n", */ + /* ggml_op_name_table[tensor->op], */ + /* tensor->src[0]->name, */ + /* tensor->src[1]->name, */ + /* elem_src, */ + /* ffirst0, */ + /* fmax0, */ + /* fmin0, */ + /* fsum0, */ + + /* elem_src1, */ + /* ffirst1, */ + /* fmax1, */ + /* fmin1, */ + /* fsum1, */ + + /* ffirst2, */ + /* fmax2, */ + /* fmin2, */ + /* fsum2, */ + + /* elem_dst, */ + /* tensor->name); */ + + + } else { + ggml_tensor_checksum(tensor->src[0]); + ggml_tensor_checksum(tensor); + + /* fprintf(stderr, "JSON: { \"uop\":%d, \"src\":\"%s\", \"cnt1\":%ld, \"first1\":%f,\"max1\":%f,\"min1\":%f,\"sum1\":%f, \"first2\":%f,\"max2\":%f,\"min2\":%f,\"sum2\":%f,\"cnt2\":%ld,\"dst\":\"%s\"}\n", */ + /* tensor->op, */ + /* tensor->src[0]->name, */ + /* // src */ + /* elem_src, */ + /* ffirst0, */ + /* fmax0, */ + /* fmin0, */ + /* fsum0, */ + + /* // dest */ + /* ffirst2, */ + /* fmax2, */ + /* fmin2, */ + /* fsum2, */ + /* elem_dst, */ + /* tensor->name); */ + + } + } //////////////////////////////////////////////////////////////////////////////// @@ -16903,7 +17232,7 @@ static enum ggml_opt_result ggml_opt_adam( const int n_accum = MAX(1, params.n_gradient_accumulation); const float accum_norm = 1.0f / (float) n_accum; - float * g = opt->adam.g->data; // gradients + float * g = (float*)opt->adam.g->data; // gradients float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment @@ -17518,36 +17847,39 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { }; } break; case GGML_OPT_LBFGS: - { - result = (struct ggml_opt_params) { - .type = GGML_OPT_LBFGS, - .graph_size = GGML_DEFAULT_GRAPH_SIZE, - .n_threads = 1, - .past = 0, - .delta = 1e-5f, - - .max_no_improvement = 0, - - .print_forward_graph = true, - .print_backward_graph = true, - - .n_gradient_accumulation = 1, - - .lbfgs = { - .m = 6, - .n_iter = 100, - .max_linesearch = 20, - - .eps = 1e-5f, - .ftol = 1e-4f, - .wolfe = 0.9f, - .min_step = 1e-20f, - .max_step = 1e+20f, - - .linesearch = GGML_LINESEARCH_DEFAULT, - }, - }; - } break; + break; + //{ + // TODO FIXME + // result = (struct ggml_opt_params) { + // .type = GGML_OPT_LBFGS, + // .graph_size = GGML_DEFAULT_GRAPH_SIZE, + // .n_threads = 1, + // .past = 0, + // .delta = 1e-5f, + + // .max_no_improvement = 0, + + // .print_forward_graph = true, + // .print_backward_graph = true, + + // .n_gradient_accumulation = 1, + + // .lbfgs = { + // .m = 6, + // .n_iter = 100, + // .max_linesearch = 20, + + // .eps = 1e-5f, + // .ftol = 1e-4f, + // .wolfe = 0.9f, + // .min_step = 1e-20f, + // .max_step = 1e+20f, + + // .linesearch = GGML_LINESEARCH_DEFAULT, + + // } + //}; + //} break; } return result; @@ -17718,7 +18050,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_0; for (int b = 0; b < n; b += k) { - block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; + block_q4_0 * __restrict__ y = (block_q4_0 *) dst + b/QK4_0; quantize_row_q4_0_reference(src + b, y, k); @@ -17741,7 +18073,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_1; for (int b = 0; b < n; b += k) { - block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; + block_q4_1 * __restrict__ y = (block_q4_1 *) dst + b/QK4_1; 
quantize_row_q4_1_reference(src + b, y, k); @@ -17764,7 +18096,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_0; for (int b = 0; b < n; b += k) { - block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; + block_q5_0 * __restrict__ y = (block_q5_0 *)dst + b/QK5_0; quantize_row_q5_0_reference(src + b, y, k); @@ -17794,7 +18126,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_1; for (int b = 0; b < n; b += k) { - block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; + block_q5_1 * __restrict__ y = (block_q5_1 *)dst + b/QK5_1; quantize_row_q5_1_reference(src + b, y, k); @@ -17824,7 +18156,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK8_0; for (int b = 0; b < n; b += k) { - block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; + block_q8_0 * __restrict__ y = (block_q8_0 *)dst + b/QK8_0; quantize_row_q8_0_reference(src + b, y, k); @@ -17929,36 +18261,36 @@ struct gguf_str { }; static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = sizeof(uint8_t), - [GGUF_TYPE_INT8] = sizeof(int8_t), - [GGUF_TYPE_UINT16] = sizeof(uint16_t), - [GGUF_TYPE_INT16] = sizeof(int16_t), - [GGUF_TYPE_UINT32] = sizeof(uint32_t), - [GGUF_TYPE_INT32] = sizeof(int32_t), - [GGUF_TYPE_FLOAT32] = sizeof(float), - [GGUF_TYPE_BOOL] = sizeof(bool), - [GGUF_TYPE_STRING] = sizeof(struct gguf_str), - [GGUF_TYPE_UINT64] = sizeof(uint64_t), - [GGUF_TYPE_INT64] = sizeof(int64_t), - [GGUF_TYPE_FLOAT64] = sizeof(double), - [GGUF_TYPE_ARRAY] = 0, // undefined + // [GGUF_TYPE_UINT8] = sizeof(uint8_t), + // [GGUF_TYPE_INT8] = sizeof(int8_t), + // [GGUF_TYPE_UINT16] = sizeof(uint16_t), + // [GGUF_TYPE_INT16] = sizeof(int16_t), + // [GGUF_TYPE_UINT32] = sizeof(uint32_t), + // [GGUF_TYPE_INT32] = sizeof(int32_t), + // [GGUF_TYPE_FLOAT32] = sizeof(float), + // [GGUF_TYPE_BOOL] = sizeof(bool), + // [GGUF_TYPE_STRING] = sizeof(struct gguf_str), + // [GGUF_TYPE_UINT64] = sizeof(uint64_t), + // [GGUF_TYPE_INT64] = sizeof(int64_t), + // [GGUF_TYPE_FLOAT64] = sizeof(double), + // [GGUF_TYPE_ARRAY] = 0, // undefined }; static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = "u8", - [GGUF_TYPE_INT8] = "i8", - [GGUF_TYPE_UINT16] = "u16", - [GGUF_TYPE_INT16] = "i16", - [GGUF_TYPE_UINT32] = "u32", - [GGUF_TYPE_INT32] = "i32", - [GGUF_TYPE_FLOAT32] = "f32", - [GGUF_TYPE_BOOL] = "bool", - [GGUF_TYPE_STRING] = "str", - [GGUF_TYPE_ARRAY] = "arr", - [GGUF_TYPE_UINT64] = "u64", - [GGUF_TYPE_INT64] = "i64", - [GGUF_TYPE_FLOAT64] = "f64", + // [GGUF_TYPE_UINT8] = "u8", + // [GGUF_TYPE_INT8] = "i8", + // [GGUF_TYPE_UINT16] = "u16", + // [GGUF_TYPE_INT16] = "i16", + // [GGUF_TYPE_UINT32] = "u32", + // [GGUF_TYPE_INT32] = "i32", + // [GGUF_TYPE_FLOAT32] = "f32", + // [GGUF_TYPE_BOOL] = "bool", + // [GGUF_TYPE_STRING] = "str", + // [GGUF_TYPE_ARRAY] = "arr", + // [GGUF_TYPE_UINT64] = "u64", + // [GGUF_TYPE_INT64] = "i64", + // [GGUF_TYPE_FLOAT64] = "f64", }; static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); @@ -18040,14 +18372,14 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { bool ok = true; - ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = (char*)calloc(p->n + 1, 1); ok = ok && gguf_fread_el(file, p->data, p->n, 
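The GGUF type tables above hit the same language wall: C99 array designators like `[GGUF_TYPE_UINT8] = sizeof(uint8_t)` are rejected by C++, so both tables are commented out wholesale. The `static_assert`s still pass, but the arrays are now empty (all zeros), so any lookup through `GGUF_TYPE_SIZE` or `GGUF_TYPE_NAME` returns 0 or NULL. One C++-safe replacement is a switch over the enum, sketched here with a stand-in type:

```cpp
#include <cstddef>
#include <cstdint>

// Stand-in for gguf_type; the real enum is declared in ggml.h.
enum demo_gguf_type { DT_UINT8, DT_INT8, DT_FLOAT32, DT_ARRAY, DT_COUNT };

static size_t demo_gguf_type_size(demo_gguf_type t) {
    switch (t) {
        case DT_UINT8:   return sizeof(uint8_t);
        case DT_INT8:    return sizeof(int8_t);
        case DT_FLOAT32: return sizeof(float);
        case DT_ARRAY:   return 0; // undefined, as in the original table
        default:         return 0;
    }
}
```

A switch also survives enum reordering and lets the compiler warn about unhandled enumerators, which the positional array cannot.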
offset); return ok; } struct gguf_context * gguf_init_empty(void) { - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; @@ -18092,7 +18424,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p bool ok = true; - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = (gguf_context*)GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); // read the header { @@ -18124,7 +18456,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the kv pairs { - ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); + ctx->kv = (gguf_kv*)malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; @@ -18199,7 +18531,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the tensor infos { - ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + ctx->infos = (gguf_tensor_info*)malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -18319,10 +18651,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // create the tensors for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { const int64_t ne[GGML_MAX_DIMS] = { - ctx->infos[i].ne[0], - ctx->infos[i].ne[1], - ctx->infos[i].ne[2], - ctx->infos[i].ne[3], + (int64_t)ctx->infos[i].ne[0],// FIXME narrowing + (int64_t)ctx->infos[i].ne[1], + (int64_t)ctx->infos[i].ne[2], + (int64_t)ctx->infos[i].ne[3], }; struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); @@ -18603,7 +18935,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int n_kv = gguf_get_n_kv(ctx); - ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv = (gguf_kv*)realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); ctx->kv[n_kv].key.n = strlen(key); ctx->kv[n_kv].key.data = strdup(key); ctx->header.n_kv++; @@ -18739,7 +19071,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { case GGUF_TYPE_ARRAY: { if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { - const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + const char ** data = (const char **)malloc(src->kv[i].value.arr.n*sizeof(char *)); for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; } @@ -18760,7 +19092,7 @@ void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { const int idx = ctx->header.n_tensors; - ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + ctx->infos = (gguf_tensor_info*)realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); ctx->infos[idx].name.n = strlen(tensor->name); ctx->infos[idx].name.data = strdup(tensor->name); diff --git a/llama.cpp b/llama.cpp index c2ad0486994727..c51829c4589e70 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1494,6 +1494,7 @@ static bool llama_kv_cache_init( ggml_type wtype, uint32_t n_ctx, int n_gpu_layers) { + fprintf(stderr, "GPULAYERS '%d'\n", n_gpu_layers); const uint32_t n_embd = 
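From here on, the gguf loader gains explicit allocation casts — `(char*)calloc`, `(gguf_kv*)malloc`, `(gguf_tensor_info*)malloc`, `(gguf_kv*)realloc`, and so on — because C converts `void *` to any object pointer implicitly while C++ requires a cast. A minimal illustration of the rule:

```cpp
#include <cstdlib>

struct kv_pair { int key; int val; }; // stand-in for gguf_kv

int main() {
    // C would accept `kv = malloc(...)`; C++ requires the explicit cast.
    kv_pair * kv = (kv_pair *) malloc(4 * sizeof(kv_pair));
    if (kv == NULL) {
        return 1;
    }
    kv[0] = { 1, 2 };
    free(kv);
    return 0;
}
```

The same reasoning explains the `(int64_t)` casts on `ctx->infos[i].ne[...]`: the values are `uint64_t` on disk, and C++ flags the narrowing conversion inside the braced initializer that C silently allowed.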
hparams.n_embd_gqa(); const uint32_t n_layer = hparams.n_layer; @@ -1531,6 +1532,7 @@ static bool llama_kv_cache_init( (void) n_gpu_layers; #ifdef GGML_USE_CUBLAS + fprintf(stderr, "USE CUBLAS\n"); if (ggml_cublas_loaded()) { size_t vram_kv_cache = 0; @@ -1548,6 +1550,8 @@ static bool llama_kv_cache_init( LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); } } + #else + fprintf(stderr, "NO USE CUBLAS\n"); #endif return true; @@ -2065,6 +2069,7 @@ struct llama_model_loader { break; #ifdef GGML_USE_CUBLAS case GGML_BACKEND_GPU: + case GGML_BACKEND_GPU_SPLIT: // old code: //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); @@ -2741,9 +2746,11 @@ static void llm_load_tensors( model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights00 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output_norm); } if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights01 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output); } } @@ -2774,6 +2781,7 @@ static void llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights03 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + @@ -2807,9 +2815,11 @@ static void llm_load_tensors( model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights04 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output_norm); } if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights05 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output); } } @@ -2840,6 +2850,7 @@ static void llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights06 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + @@ -2878,10 +2889,13 @@ static void llm_load_tensors( model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights07 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output_norm); + fprintf(stderr, "vram_weights08 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output_norm_b); } if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights09 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output); } } @@ -2906,7 +2920,9 @@ static void llm_load_tensors( layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights10 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(layer.attn_norm_2); + fprintf(stderr, "vram_weights11 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(layer.attn_norm_2_b); } } @@ -2918,6 +2934,7 @@ static void llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), 
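The numbered `vram_weights00` … `vram_weights25` prints above and below trace how the VRAM estimate grows tensor by tensor across the different model architectures in `llm_load_tensors`. The same idea as a small reusable helper rather than numbered call sites — the names and byte counts here are illustrative only:

```cpp
#include <cstdint>
#include <cstdio>

static int64_t vram_weights = 0; // running total, as in llm_load_tensors

// Log the total before adding, so the log shows both the step and the sum.
static void vram_add(const char * tag, int64_t nbytes) {
    fprintf(stderr, "vram_weights %s '%lld' (+%lld)\n",
            tag, (long long) vram_weights, (long long) nbytes);
    vram_weights += nbytes;
}

int main() {
    vram_add("output_norm", 4096 * 4);        // e.g. ggml_nbytes(model.output_norm)
    vram_add("output", 4096LL * 32000LL * 2); // e.g. ggml_nbytes(model.output)
    return 0;
}
```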
{n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights12 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + @@ -2955,10 +2972,12 @@ static void llm_load_tensors( model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights13 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output_norm); vram_weights += ggml_nbytes(model.output_norm_b); } if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights14 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output); } } @@ -2994,6 +3013,7 @@ static void llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights15 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + @@ -3039,10 +3059,13 @@ static void llm_load_tensors( model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights16 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output_norm); + fprintf(stderr, "vram_weights17 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output_norm_b); } if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights18 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output); } } @@ -3105,10 +3128,13 @@ static void llm_load_tensors( model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights19 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output_norm); + fprintf(stderr, "vram_weights20 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output_norm_b); } if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights21 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output); } } @@ -3144,6 +3170,7 @@ static void llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights22 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + @@ -3182,9 +3209,11 @@ static void llm_load_tensors( model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); if (backend_norm == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights23 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output_norm); } if (backend_output == GGML_BACKEND_GPU_SPLIT) { + fprintf(stderr, "vram_weights24 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(model.output); } } @@ -3211,6 +3240,7 @@ static void llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { + fprintf(stderr, "vram_weights25 '%ld'\n", vram_weights); vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + @@ -5588,8 +5618,8 @@ static int llama_decode_internal( // plot the computation graph in dot format (for debugging 
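One portability nit in those prints: `%ld` only matches a 64-bit `vram_weights` on LP64 platforms; on LLP64 targets such as 64-bit Windows, `long` is 32 bits and the format mismatches. The `<cinttypes>` macros sidestep this — a sketch of the portable form, not what the patch actually does:

```cpp
#include <cinttypes>
#include <cstdio>

int main() {
    const int64_t vram_weights = 123456789;
    // "%" PRId64 expands to the correct conversion on every platform.
    fprintf(stderr, "vram_weights '%" PRId64 "'\n", vram_weights);
    return 0;
}
```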
purposes) //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} + //ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} // extract logits // TODO: do not compute and extract logits if only embeddings are needed
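The final hunk only re-indents the commented-out graph dump rather than enabling it. For reference, this is how that dump is typically gated when switched on — the call matches the one in the comment, and the surrounding `gf`/`n_past` names are assumed from the `llama_decode_internal` scope:

```cpp
#include "ggml.h"

// Dump every 100th decode step; render with: dot -Tpng llama.dot -o llama.png
static void maybe_dump_graph(const struct ggml_cgraph * gf, int n_past) {
    if (n_past % 100 == 0) {
        ggml_graph_dump_dot(gf, NULL, "llama.dot");
    }
}
```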