Merge pull request #128 from ggerganov/master

b2928
Nexesenex · May 18, 2024 · 57755f6 · 57755f6
2 parents 0149e19 + 059031b
commit 57755f6
Show file tree

Hide file tree

Showing 30 changed files with 8,994 additions and 4,322 deletions.
diff --git a/.github/labeler.yml b/.github/labeler.yml
@@ -0,0 +1,73 @@
+# https://github.com/actions/labeler
+
+SYCL: # each top-level key is the name of a label to apply
+    - changed-files: # rules below are evaluated against the files changed by the pull request
+        - any-glob-to-any-file: # per the labeler docs: matches when any listed glob matches any changed file
+            - ggml-sycl.h
+            - ggml-sycl.cpp
+            - README-sycl.md
+Nvidia GPU:
+    - changed-files:
+        - any-glob-to-any-file:
+            - ggml-cuda/** # the same glob is listed under the "ggml" label further down
+Vulkan:
+    - changed-files:
+        - any-glob-to-any-file:
+            - ggml_vk_generate_shaders.py
+            - ggml-vulkan*
+documentation:
+    - changed-files:
+        - any-glob-to-any-file:
+            - docs/**
+            - media/**
+testing:
+    - changed-files:
+        - any-glob-to-any-file:
+            - tests/**
+build:
+    - changed-files:
+        - any-glob-to-any-file:
+            - cmake/**
+            - CMakeLists.txt
+            - CMakePresets.json
+            - codecov.yml
+examples:
+    - changed-files:
+        - any-glob-to-any-file: examples/** # single glob written as a scalar instead of a list, unlike the other entries
+devops:
+    - changed-files:
+        - any-glob-to-any-file:
+            - .devops/**
+            - .github/**
+            - ci/**
+python:
+    - changed-files:
+        - any-glob-to-any-file:
+            - "**/*.py" # quoted because a plain YAML scalar cannot begin with "*"
+            - requirements/**
+            - gguf-py/**
+            - .flake8
+script:
+    - changed-files:
+        - any-glob-to-any-file:
+            - scripts/**
+android:
+    - changed-files:
+        - any-glob-to-any-file:
+            - examples/llama.android/** # also covered by the broader examples/** glob of the "examples" label
+server:
+    - changed-files:
+        - any-glob-to-any-file:
+            - examples/server/** # also covered by the broader examples/** glob of the "examples" label
+ggml:
+    - changed-files:
+        - any-glob-to-any-file:
+            - ggml-*.c
+            - ggml-*.h
+            - ggml-cuda/** # duplicates the "Nvidia GPU" glob, so CUDA changes match both labels
+nix:
+    - changed-files:
+        - any-glob-to-any-file:
+            - "**/*.nix" # quoted because a plain YAML scalar cannot begin with "*"
+            - .github/workflows/nix-*.yml
+            - .devops/nix/nixpkgs-instances.nix
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -271,40 +271,40 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
           name: llama-bin-ubuntu-x64.zip
 
-#  ubuntu-latest-cmake-sanitizer:
-#    runs-on: ubuntu-latest
-#
-#    continue-on-error: true
-#
-#    strategy:
-#      matrix:
-#        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-#        build_type: [Debug, Release]
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v4
-#
-#      - name: Dependencies
-#        id: depends
-#        run: |
-#          sudo apt-get update
-#          sudo apt-get install build-essential
-#
-#      - name: Build
-#        id: cmake_build
-#        run: |
-#          mkdir build
-#          cd build
-#          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-#          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-#
-#      - name: Test
-#        id: cmake_test
-#        run: |
-#          cd build
-#          ctest -L main --verbose --timeout 900
+  ubuntu-latest-cmake-sanitizer: # builds with one sanitizer enabled per matrix entry, then runs the "main" ctest label
+    runs-on: ubuntu-latest
+
+    continue-on-error: true # per GitHub Actions semantics, a failure of this job does not fail the workflow run
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, THREAD, UNDEFINED] # substituted into -DLLAMA_SANITIZE_<name>=ON in the Build step
+        build_type: [Debug, Release] # passed as CMAKE_BUILD_TYPE and as --config in the Build step
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
 
   ubuntu-latest-cmake-mpi:
     runs-on: ubuntu-latest

diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
@@ -0,0 +1,12 @@
+name: "Pull Request Labeler"
+on:
+- pull_request_target # NOTE(review): per GitHub docs this event runs in the base repository's context - confirm that is intended
+
+jobs:
+  labeler:
+    permissions:
+      contents: read
+      pull-requests: write # presumably required so the action can add labels to the pull request - confirm
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/labeler@v5 # no "with:" inputs are given, so the action runs with its defaults
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
@@ -32,13 +32,14 @@ jobs:
 
     strategy:
       matrix:
-        # TODO: temporary disabled due to linux kernel issues
-        #sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        sanitizer: [UNDEFINED]
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
         build_type: [Debug]
         include:
           - build_type: Release
             sanitizer: ""
+          - build_type: Debug
+            sanitizer: THREAD
+            disabled_on_pr: true
       fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
 
     steps:

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.14)  # for add_link_options and implicit target directories.
+cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
 include(CheckIncludeFileCXX)
 
@@ -581,7 +581,7 @@ if (LLAMA_HIPBLAS)
     else()
         # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
         if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
-            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_ARGETS})
+            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
         endif()
         cmake_minimum_required(VERSION 3.21)
         enable_language(HIP)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
@@ -573,6 +573,10 @@ def _set_vocab_sentencepiece(self):
 
         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
@@ -588,21 +592,23 @@ def _set_vocab_sentencepiece(self):
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
 
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
 
         added_tokens_file = self.dir_model / 'added_tokens.json'
         if added_tokens_file.is_file():
             with open(added_tokens_file, "r", encoding="utf-8") as f:
                 added_tokens_json = json.load(f)
-
                 for key in added_tokens_json:
-                    key = key.encode("utf-8")
-                    if key not in tokens:
-                        tokens.append(key)
-                        scores.append(-1000.0)
-                        toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
@@ -612,8 +618,6 @@ def _set_vocab_sentencepiece(self):
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)
 
-        assert len(tokens) == vocab_size
-
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)

diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/app/src/main/cpp/CMakeLists.txt
@@ -12,15 +12,17 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")
 
-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG        ci-android
+#)
+#
+## Also provides "common"
+#FetchContent_MakeAvailable(llama)
 
-# Also provides "common"
-FetchContent_MakeAvailable(llama)
+add_subdirectory(../../../../../../ build-llama) # source dir is outside this tree, so CMake requires an explicit binary dir
 
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
@@ -1425,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         // Use all tasks
         tasks.resize(n_task);
         printf("%s: reading tasks", __func__);
-        int n_dot = n_task/100;
+        int n_dot = std::max((int) n_task/100, 1);
         int i = 0;
         for (auto& task : tasks) {
             ++i;
@@ -1675,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
     llama_batch_free(batch);
 
-    if (n_done < 100) return;
+    if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
 
     float p = 1.f*n_correct/n_done;
     float sigma = sqrt(p*(1-p)/(n_done-1));

diff --git a/examples/server/README.md b/examples/server/README.md
@@ -18,8 +18,8 @@ The project is under active development, and we are [looking for feedback and co
 **Command line options:**
 
 - `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
-- `-t N`, `--threads N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
-- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
+- `-t N`, `--threads N`: Set the number of threads used by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores).
+- `-tb N`, `--threads-batch N`: Set the number of threads used by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
 - `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused

diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
@@ -315,6 +315,20 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #endif
     return c;
 }
+
+#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+// __shfl_xor() for half2 was added in ROCm 5.6
+static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) { // fallback overload taking and returning half2
+    typedef union half2_b32 { // lets the same storage be accessed as a half2 or as an int
+        half2 val;
+        int   b32; // assumes half2 occupies 32 bits, as the member name suggests - TODO confirm
+    } half2_b32_t;
+    half2_b32_t tmp;
+    tmp.val = var;
+    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width); // argument is an int, so this calls the int overload rather than recursing
+    return tmp.val; // return the shuffled bits as a half2
+}
+#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
 #endif // defined(GGML_USE_HIPBLAS)
 
 #define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
@@ -463,6 +477,17 @@ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
 
+static __device__ __forceinline__ float get_alibi_slope( // returns base^exph, where base and exph are chosen from h (presumably the head index)
+    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
+) {
+    if (max_bias <= 0.0f) { // non-positive max_bias: return 1.0f without evaluating powf
+        return 1.0f;
+    }
+    const float base = h < n_head_log2 ? m0 : m1; // h below n_head_log2 uses m0, every other h uses m1
+    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; // h + 1 in the first range; odd values 1, 3, 5, ... in the second
+
+    return powf(base, exph);
+}
 
 //////////////////////