
Commit

Merge branch 'ggerganov:master' into master
sealad886 authored May 19, 2024
2 parents df4906d + 1ea2a00 commit 04c547e
Showing 16 changed files with 9,048 additions and 5,330 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/labeler.yml
@@ -9,4 +9,9 @@ jobs:
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
repository: "ggerganov/llama.cpp"
- uses: actions/labeler@v5
with:
configuration-path: '.github/labeler.yml'
1 change: 1 addition & 0 deletions convert-hf-to-gguf-update.py
@@ -72,6 +72,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
3 changes: 3 additions & 0 deletions convert-hf-to-gguf.py
@@ -446,6 +446,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
# ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2"
if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
# ref: https://huggingface.co/stabilityai/stablelm-2-1_6b
res = "stablelm2"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
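
For context, the `chkhsh` values matched above come from convert-hf-to-gguf-update.py: it encodes a fixed probe string with each tokenizer listed in that script and hashes the resulting token IDs. Below is a minimal sketch of that idea, assuming the `transformers` tokenizer API and a placeholder probe string (the real script defines its own):

from hashlib import sha256
from transformers import AutoTokenizer  # assumption: HF tokenizers, as the update script uses

PROBE_TEXT = "..."  # placeholder; the update script ships its own fixed test string

def tokenizer_checksum(model_repo: str) -> str:
    # Two tokenizers that split PROBE_TEXT into the same token IDs get the same hash,
    # which is how convert-hf-to-gguf.py maps a tokenizer to a pre-tokenizer name.
    tokenizer = AutoTokenizer.from_pretrained(model_repo)
    token_ids = tokenizer.encode(PROBE_TEXT)
    return sha256(str(token_ids).encode()).hexdigest()

# e.g. tokenizer_checksum("stabilityai/stablelm-2-zephyr-1_6b") would be expected to
# reproduce the hash mapped to res = "stablelm2" above, given the same probe string.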
2 changes: 1 addition & 1 deletion examples/quantize/quantize.cpp
@@ -284,7 +284,7 @@ int main(int argc, char ** argv) {
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--keep-split")) {
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
params.keep_split = true;
} else {
usage(argv[0]);
2 changes: 1 addition & 1 deletion examples/server/README.md
@@ -48,7 +48,7 @@ The project is under active development, and we are [looking for feedback and co
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`
- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests, but the results will **not** be deterministic due to differences in rounding error.
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
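
As a usage illustration for the `--api-key`, seed, and sampling options documented above, a client call might look like the sketch below (assumptions: a server started locally as `./server -m model.gguf --api-key secret-key -np 4`, the default host/port, and the `/completion` endpoint; adapt to your setup):

import requests  # assumption: any HTTP client works; requests is used here for brevity

payload = {
    "prompt": "The meaning of life is",
    "n_predict": 32,
    "seed": 42,           # fixed seed, as in the determinism tests further below
    "temperature": 0.8,
}
# With --api-key set, every request must carry the key as a Bearer token.
headers = {"Authorization": "Bearer secret-key"}

response = requests.post("http://127.0.0.1:8080/completion",
                         json=payload, headers=headers, timeout=600)
response.raise_for_status()
print(response.json()["content"])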
3 changes: 1 addition & 2 deletions examples/server/server.cpp
@@ -102,7 +102,6 @@ struct slot_params {
bool stream = true;
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt

uint32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
@@ -1264,7 +1263,7 @@ struct server_context {
{"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict},
{"model", params.model_alias},
{"seed", slot.params.seed},
{"seed", slot.sparams.seed},
{"temperature", slot.sparams.temp},
{"dynatemp_range", slot.sparams.dynatemp_range},
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
52 changes: 44 additions & 8 deletions examples/server/tests/features/results.feature
@@ -70,12 +70,48 @@ Feature: Results
Then all predictions are equal
Examples:
| n_parallel | temp |
| 1 | 0.0 |
| 2 | 0.0 |
| 4 | 0.0 |
| 1 | 1.0 |
# FIXME: These tests fail on master. The problem seems to be the unified KV cache.
| 1 | 0.0 |
| 2 | 0.0 |
| 4 | 0.0 |
| 1 | 1.0 |
# FIXME: These tests fail on master.
# Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
# | 2 | 1.0 |
# | 4 | 1.0 |
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
# | 2 | 1.0 |
# | 4 | 1.0 |

Scenario Outline: consistent token probs with same seed and prompt
Given <n_slots> slots
And <n_kv> KV cache size
And 1.0 temperature
And <n_predict> max tokens to predict
Then the server is starting
Then the server is healthy

Given 1 prompts "The meaning of life is" with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle

Given <n_parallel> prompts "The meaning of life is" with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle

Then all token probabilities are equal
Examples:
| n_slots | n_kv | n_predict | n_parallel |
| 4 | 1024 | 1 | 1 |
| 4 | 1024 | 1 | 4 |
# FIXME: These tests fail on master.
# Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
# | 4 | 1024 | 100 | 1 |
# This test still fails even with the above patches; the first token probabilities are already different.
# | 4 | 1024 | 100 | 4 |
37 changes: 36 additions & 1 deletion examples/server/tests/features/steps/steps.py
@@ -23,6 +23,7 @@
def step_server_config(context, server_fqdn, server_port):
context.server_fqdn = server_fqdn
context.server_port = int(server_port)
context.n_threads = None
context.n_gpu_layer = None
if 'PORT' in os.environ:
context.server_port = int(os.environ['PORT'])
@@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
context.n_gpu_layer = ngl


@step('{n_threads:d} threads')
def step_n_threads(context, n_threads):
context.n_threads = n_threads


@step('{draft:d} as draft')
def step_draft(context, draft):
context.draft = draft
@@ -274,13 +280,22 @@ async def step_predictions_equal(context):

@step('all predictions are different')
@async_run_until_complete
async def step_predictions_equal(context):
async def step_predictions_different(context):
n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions"
assert_all_predictions_different(context.tasks_result)
context.tasks_result = []


@step('all token probabilities are equal')
@async_run_until_complete
async def step_token_probabilities_equal(context):
n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions"
assert_all_token_probabilities_equal(context.tasks_result)
context.tasks_result = []


@step('the completion is truncated')
def step_assert_completion_truncated(context):
step_assert_completion_truncated(context, '')
@@ -869,6 +884,7 @@ async def request_completion(prompt,
"id_slot": id_slot,
"seed": seed if seed is not None else 42,
"temperature": temperature if temperature is not None else "0.8f",
"n_probs": 2,
},
headers=headers,
timeout=3600) as response:
@@ -1123,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
assert content_i != content_j, "contents not different"


def assert_all_token_probabilities_equal(completion_responses):
n_predict = len(completion_responses[0]['completion_probabilities'])
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
for pos in range(n_predict):
for i, response_i in enumerate(completion_responses):
probs_i = response_i['completion_probabilities'][pos]['probs']
print(f"pos {pos}, probs {i}: {probs_i}")
for pos in range(n_predict):
for i, response_i in enumerate(completion_responses):
probs_i = response_i['completion_probabilities'][pos]['probs']
for j, response_j in enumerate(completion_responses):
if i == j:
continue
probs_j = response_j['completion_probabilities'][pos]['probs']
assert probs_i == probs_j, "contents not equal"
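
# Note: a hedged sketch of the response shape the check above assumes, based on the
# server's /completion output with "n_probs": 2 (exact field names may vary by version):
#
#   {
#     "content": " to be happy.",
#     "completion_probabilities": [
#       {"content": " to", "probs": [{"tok_str": " to", "prob": 0.91},
#                                    {"tok_str": " a",  "prob": 0.04}]},
#       ...
#     ]
#   }
#
# The loops compare each position's "probs" list across all responses, so any drift in
# the top candidates (or their probabilities) between identically seeded requests fails.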


async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks)
if context.debug:
@@ -1261,6 +1294,8 @@ def start_server_background(context):
server_args.extend(['--batch-size', context.n_batch])
if context.n_ubatch:
server_args.extend(['--ubatch-size', context.n_ubatch])
if context.n_threads:
server_args.extend(['--threads', context.n_threads])
if context.n_gpu_layer:
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
if context.draft is not None:
2 changes: 2 additions & 0 deletions ggml-cuda.cu
@@ -539,6 +539,8 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
void * dev_ptr;
cudaError_t err = cudaMalloc(&dev_ptr, size);
if (err != cudaSuccess) {
// clear the error
cudaGetLastError();
GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
return nullptr;
}
2 changes: 1 addition & 1 deletion ggml-quants.c
@@ -1149,7 +1149,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
sumlx += w*x[i]*l;
suml2 += w*l*l;
}
float scale = sumlx/suml2;
float scale = suml2 ? sumlx/suml2 : 0.0f;
if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
float best = scale * sumlx;
for (int is = -9; is <= 9; ++is) {