Merge branch 'ggerganov:master' into master
sealad886 authored Jun 20, 2024
2 parents 7039c02 + abd894a commit 59e694d
Showing 15 changed files with 618 additions and 408 deletions.
109 changes: 71 additions & 38 deletions .gitignore
@@ -1,90 +1,123 @@
*.o
# Extensions

*.a
*.so
*.gguf
*.gguf.json
*.bat
*.bin
*.exe
*.dll
*.log
*.gcov
*.gcno
*.gcda
*.dot
*.bat
*.tmp
*.metallib
*.etag
*.exe
*.gcda
*.gcno
*.gcov
*.gguf
*.gguf.json
*.lastModified
.DS_Store
.build/
*.log
*.metallib
*.o
*.so
*.tmp

# IDE / OS

.cache/
.ccls-cache/
.direnv/
.DS_Store
.envrc
.idea/
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/
.idea/
nppBackup

ggml-metal-embed.metal

lcov-report/
# Coverage

gcovr-report/
lcov-report/

# Build Artifacts

tags
.build/
build*
!build-info.cmake
!build-info.cpp.in
!build-info.sh
!build.zig
cmake-build-*
/libllama.so
/llama-*
android-ndk-*
arm_neon.h
cmake-build-*
CMakeSettings.json
compile_commands.json
ggml-metal-embed.metal
llama-batched-swift
out/
tmp/

# CI

!.github/workflows/*.yml

# Models

models/*
models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*

/Pipfile
/libllama.so
/llama-*
llama-batched-swift
/common/build-info.cpp
arm_neon.h
compile_commands.json
CMakeSettings.json

__pycache__
dist
# Zig

zig-out/
zig-cache/

# Logs

ppl-*.txt
qnt-*.txt
perf-*.txt

# Examples

examples/jeopardy/results.txt
examples/server/*.css.hpp
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
examples/server/*.css.hpp
!build_64.sh
!examples/*.bat
!examples/*/*.kts
!examples/*/*/*.kts
!examples/sycl/*.bat
!examples/sycl/*.sh

# Python

__pycache__
.venv
/Pipfile
dist
poetry.lock
poetry.toml
nppBackup

# Test binaries
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-backend-ops
/tests/test-double-float
/tests/test-grad0
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf
/tests/test-rope
/tests/test-sampling
/tests/test-tokenizer-0
/tests/test-tokenizer-1-spm
/tests/test-tokenizer-1-bpe
/tests/test-rope
/tests/test-backend-ops
/tests/test-tokenizer-1-spm

# Scripts
!/scripts/install-oneapi.bat
7 changes: 4 additions & 3 deletions CMakeLists.txt
@@ -665,6 +665,7 @@ if (LLAMA_SYCL)
#todo: AOT

find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)

message(STATUS "SYCL found")

@@ -679,11 +680,9 @@ if (LLAMA_SYCL)
endif()

add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
endif()
@@ -693,8 +692,10 @@ if (LLAMA_SYCL)
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

if (WIN32)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
else()
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
31 changes: 23 additions & 8 deletions CMakePresets.json
@@ -11,9 +11,21 @@
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},

{
"name": "sycl-base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build-${presetName}",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_CXX_COMPILER": "icx",
"LLAMA_SYCL": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },

{
@@ -35,15 +47,18 @@
},

{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },

{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
]
}
30 changes: 19 additions & 11 deletions README-sycl.md
@@ -410,15 +410,9 @@ Output (example):

4. Install build tools

a. Download & install cmake for Windows: https://cmake.org/download/
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed via the Visual Studio Installer)
b. Recent versions of Visual Studio install Ninja by default. (If not, please install it manually: https://ninja-build.org/)

b. Download & install mingw-w64 make for Windows provided by w64devkit

- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).

- Extract `w64devkit` on your pc.

- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).

### II. Build llama.cpp

@@ -428,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
# Option 2: Or FP16
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
cmake --build build --config Release -j
```
@@ -441,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
.\examples\sycl\win-build-sycl.bat
```

Or, use CMake presets to build:
```sh
cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli

cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli

cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
```

Or, you can use Visual Studio to open the llama.cpp folder as a CMake project. Choose one of the SYCL CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before compiling the project.

*Notes:*

- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
- For a minimal experimental setup, the user can build only the inference executable with `cmake --build build --config Release -j --target llama-cli`.

### III. Run the inference

10 changes: 8 additions & 2 deletions common/common.cpp
@@ -6,7 +6,6 @@
#include "llama.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <codecvt>
@@ -2657,7 +2656,14 @@ static bool llama_download_file(const std::string & url, const std::string & pat
}

// Set the output file
std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);

struct FILE_deleter {
void operator()(FILE * f) const {
fclose(f);
}
};

std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
if (!outfile) {
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
return false;
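The `common/common.cpp` hunk above replaces the `decltype(&fclose)` deleter with a small `FILE_deleter` function object, so the `unique_ptr` no longer has to carry a stored function pointer. Below is a minimal, self-contained sketch of the same RAII pattern; the file name and contents are hypothetical and the snippet is not part of this diff:

```cpp
// Minimal sketch of the FILE-handle RAII pattern used above (hypothetical file name).
#include <cstdio>
#include <memory>

struct FILE_deleter {
    void operator()(FILE * f) const {
        fclose(f); // called automatically when the unique_ptr goes out of scope
    }
};

int main() {
    std::unique_ptr<FILE, FILE_deleter> outfile(fopen("example.tmp", "wb"));
    if (!outfile) {
        return 1; // fopen failed; nothing to clean up
    }
    const char msg[] = "hello";
    fwrite(msg, 1, sizeof(msg) - 1, outfile.get());
    return 0; // outfile is closed here by FILE_deleter
}
```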
2 changes: 1 addition & 1 deletion examples/server/server.cpp
@@ -1594,7 +1594,7 @@ struct server_context {
} else {
std::string prompt;
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
json_value(task.data, "prompt", std::string());
prompt = json_value(task.data, "prompt", std::string());
}

slot = get_available_slot(prompt);
6 changes: 3 additions & 3 deletions examples/sycl/win-build-sycl.bat
@@ -13,16 +13,16 @@ if %errorlevel% neq 0 goto ERROR

:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

:: for FP32
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
cmake -G "Ninja" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
if %errorlevel% neq 0 goto ERROR
:: build example/main only
:: make main

:: build all binary
make -j
cmake --build . -j
if %errorlevel% neq 0 goto ERROR

cd ..
2 changes: 1 addition & 1 deletion ggml-cuda.cu
@@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
}

const int cc = ggml_cuda_info().devices[id].cc;
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
}
return row_rounding;
}
4 changes: 2 additions & 2 deletions ggml-cuda/common.cuh
@@ -652,8 +652,8 @@ static int get_mmq_x_max_host(const int cc) {
}

// Round rows to this value for --split-mode row:
static int get_mmq_y_host(const int cc, const int mmq_x) {
return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
static int get_mmq_y_host(const int cc) {
return cc >= CC_VOLTA ? 128 : 64;
}

//////////////////////