ggerganov · jmikedupont2 · Nov 21, 2023 · Nov 21, 2023 · Nov 21, 2023 · Nov 22, 2023
diff --git a/.gitignore b/.gitignore
@@ -99,3 +99,6 @@ tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
 tests/test-tokenizer-1-llama
 tests/test-tokenizer-1-bpe
+/#llama.cpp#
+# Editor auto-save files: the leading '#' must be escaped, otherwise git treats the line as a comment
+\#*
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,8 +1,34 @@
 cmake_minimum_required(VERSION 3.13)  # for add_link_options
 project("llama.cpp" C CXX)
 
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+if (NOT MSVC)
+    set(cuda_flags -Wno-pedantic)
+endif()
 
+set(LLAMA_CUBLAS ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(LLAMA_CUDA_F16 ON)
+set(LLAMA_ACCELERATE ON)
+set(LLAMA_K_QUANTS ON)
+
+#-DLLAMA_NATIVE=off
+set(LLAMA_AVX ON)
+set(LLAMA_AVX2 OFF)
+set(LLAMA_AVX512 OFF)
+set(LLAMA_FMA OFF)
+set(LLAMA_F16C OFF)
+set(CMAKE_CUDA_FLAGS "--verbose") #
+set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+set(CUDACXX /usr/local/cuda-12.3/bin/nvcc)
+set(CMAKE_CUDA_COMPILER /usr/local/cuda-12.3/bin/nvcc)
+set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-12.3)
+#GGML_USE_CUBLAS
+
+# Default to Debug only when no build type was chosen, so -DCMAKE_BUILD_TYPE=... is respected.
+if (NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)
+endif()
+
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
@@ -44,7 +70,7 @@ endif()
 
 # general
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
-option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
+option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      OFF)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
 
 # debug
@@ -77,9 +103,9 @@ endif()
 
 # 3rd party libs
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
-option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
+option(LLAMA_BLAS                            "llama: use BLAS"                                  ON)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
+option(LLAMA_CUBLAS                          "llama: use CUDA"                                  ON)
 #option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(LLAMA_CUDA_FORCE_MMQ                  "llama: use mmq kernels instead of cuBLAS"         OFF)
@@ -104,7 +130,7 @@ option(LLAMA_BUILD_SERVER               "llama: build server example"
 # Compile flags
 #
 
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
@@ -230,7 +256,12 @@ if (LLAMA_BLAS)
 
         message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
         add_compile_options(${BLAS_LINKER_FLAGS})
-        add_compile_definitions(GGML_USE_OPENBLAS)
+
+        # Pass --keep/--keep-dir to nvcc, pointing at a scratch directory in the build tree (from https://github.com/NVIDIA/cutlass)
+        file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/nvcc_tmp")
+        set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags})
+
+	#        add_compile_definitions(GGML_USE_OPENBLAS)
         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()
@@ -272,6 +303,7 @@ if (LLAMA_CUBLAS)
         endif()
         add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+
         if (DEFINED LLAMA_CUDA_DMMV_Y)
             add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
         endif()
@@ -312,7 +344,7 @@ if (LLAMA_MPI)
     if (MPI_C_FOUND)
         message(STATUS "MPI found")
         set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h)
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
         if (NOT MSVC)
@@ -390,14 +422,15 @@ endif()
 
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
-        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+    # -Wpedantic
+        set(warning_flags -Wall -Wextra  -Wcast-qual -Wno-unused-function)
         set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
-        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn -fpermissive)
         set(host_cxx_flags "")
 
         if (CMAKE_C_COMPILER_ID MATCHES "Clang")
             set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
+            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi -fpermissive)
 
             if (
                 (CMAKE_C_COMPILER_ID STREQUAL "Clang"      AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
@@ -407,37 +440,37 @@ if (LLAMA_ALL_WARNINGS)
             endif()
         elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
             set(c_flags ${c_flags} -Wdouble-promotion)
-            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
+            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds -fpermissive)
 
             if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
+                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation -fpermissive)
             endif()
             if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
+                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi -fpermissive)
             endif()
         endif()
     else()
         # todo : msvc
     endif()
 
-    set(c_flags   ${c_flags}   ${warning_flags})
-    set(cxx_flags ${cxx_flags} ${warning_flags})
+    set(c_flags   ${c_flags}  -save-temps --verbose  ${warning_flags})
+    set(cxx_flags ${cxx_flags} -fpermissive  -save-temps --verbose ${warning_flags})
     add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
                         "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
                         "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
 
 endif()
 
-if (NOT MSVC)
-    set(cuda_flags -Wno-pedantic)
-endif()
 set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
 
 list(JOIN host_cxx_flags " " cuda_host_flags)  # pass host compiler flags as a single argument
 if (NOT cuda_host_flags STREQUAL "")
     set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
 endif()
 
+# 
+set(cuda_flags --verbose -G  ${cuda_flags})
+
 add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
 
 if (WIN32)
@@ -485,8 +518,10 @@ if (NOT MSVC)
             add_link_options(-static-libgcc -static-libstdc++)
         endif()
     endif()
+    add_link_options("-Wl,-Map=$<TARGET_PROPERTY:NAME>.map")  # per-target map file; a plain variable is expanded once here for all targets
+
     if (LLAMA_GPROF)
-        add_compile_options(-pg)
+      add_compile_options(-pg)
     endif()
 endif()
 
@@ -645,13 +680,16 @@ if (GGML_USE_CPU_HBM)
 endif()
 
 add_library(ggml OBJECT
-            ggml.c
+            ggml.cpp
             ggml.h
-            ggml-alloc.c
+	    print.hpp
+	    ggml-internal.hpp
+	    llama-internal.hpp
+            ggml-alloc.cpp
             ggml-alloc.h
-            ggml-backend.c
+            ggml-backend.cpp
             ggml-backend.h
-            ggml-quants.c
+            ggml-quants.cpp
             ggml-quants.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
@@ -683,7 +721,7 @@ add_library(llama
             )
 
 target_include_directories(llama PUBLIC .)
-target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+target_compile_features(llama PUBLIC cxx_std_20) # don't bump
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}

diff --git a/Makefile b/Makefile
@@ -1,3 +1,4 @@
+
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
@@ -116,7 +117,7 @@ endif
 # keep standard at C11 and C++11
 MK_CPPFLAGS = -I. -Icommon
 MK_CFLAGS   = -std=c11   -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CXXFLAGS = -std=c++17 -fPIC -fpermissive
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -502,7 +503,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
@@ -537,17 +538,17 @@ $(info )
 # Build library
 #
 
-ggml.o: ggml.c ggml.h ggml-cuda.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+ggml.o: ggml.cpp ggml.h ggml-cuda.h
+	$(CXX)  $(CXXFLAGS)   -c $< -o $@
 
-ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h
+	$(CXX)  $(CXXFLAGS)   -c $< -o $@
 
-ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h
+	$(CXX)  $(CXXFLAGS)   -c $< -o $@
 
-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
-	$(CC) $(CFLAGS)    -c $< -o $@
+ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h
+	$(CXX) $(CXXFLAGS)    -c $< -o $@
 
 OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 
@@ -582,7 +583,7 @@ clean:
 # Examples
 #
 
-main: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp   ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
@@ -678,6 +679,9 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
 build-info.o: common/build-info.cpp
 	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
 
+#print.o: print.cpp # print.hpp
+#	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
+
 #
 # Tests
 #
@@ -734,5 +738,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-c.o: tests/test-c.c llama.h
-	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
+tests/test-c.o: tests/test-c.cpp llama.h
+	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
+tt:
+	clang++ -std=c++17 ggml.cpp
diff --git a/README.md b/README.md
@@ -696,7 +696,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
 
 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
 
-For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
 
 ### Instruction mode with Alpaca