Merge branch 'master' into xsn/slot_state_machine

ggerganov · Sep 3, 2024 · ba0065f
2 parents e8e3e72 + b69a480
commit ba0065f
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 41 deletions.
diff --git a/README.md b/README.md
@@ -10,32 +10,14 @@
 
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 
-> [!IMPORTANT]
-[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
-
 ## Recent API changes
 
-- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
-- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
-- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
-- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
-- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
-- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
-- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
-- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
+- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
 
 ## Hot topics
 
-- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
-- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
-- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
-- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
-- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
-- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
-- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
-- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
-- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
-- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
+- *add hot topics here*
 
 ----
 

diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
@@ -52,8 +52,8 @@ Feature: Parallel
     Then all prompts are predicted with <n_predict> tokens
     Examples:
       | streaming | n_predict |
-      | disabled  | 200       |
-      | enabled   | 200       |
+      | disabled  | 128       |
+      | enabled   | 64        |
 
   Scenario Outline: Multi users OAI completions compatibility no v1
     Given a system prompt You are a writer.

diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
@@ -23,6 +23,8 @@
 
 # pyright: reportRedeclaration=false
 
+DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)
+
 @step("a server listening on {server_fqdn}:{server_port}")
 def step_server_config(context, server_fqdn: str, server_port: str):
     context.server_fqdn = server_fqdn
@@ -699,7 +701,7 @@ def step_tokenize_set_add_special(context):
 @async_run_until_complete
 async def step_tokenize(context):
     context.tokenized_text = context_text(context)
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         tokenize_args = {
             "content": context.tokenized_text,
         }
@@ -716,7 +718,7 @@ async def step_tokenize(context):
 @async_run_until_complete
 async def step_detokenize(context):
     assert len(context.tokens) > 0
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with session.post(f'{context.base_url}/detokenize',
                                 json={
                                     "tokens": context.tokens,
@@ -745,7 +747,7 @@ def step_strings_for_tokenization(context):
 @step('an OPTIONS request is sent from {origin}')
 @async_run_until_complete
 async def step_options_request(context, origin):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin}
         async with session.options(f'{context.base_url}/v1/chat/completions',
                                     headers=headers) as response:
@@ -761,7 +763,7 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
 @step('prometheus metrics are exposed')
 @async_run_until_complete
 async def step_prometheus_metrics_exported(context):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with await session.get(f'{context.base_url}/metrics') as metrics_response:
             assert metrics_response.status == 200
             assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
@@ -834,7 +836,7 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
 @step('the slot {slot_id:d} is saved with filename "{filename}"')
 @async_run_until_complete
 async def step_save_slot(context, slot_id, filename):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
                                 json={"filename": filename},
                                 headers={"Content-Type": "application/json"}) as response:
@@ -844,7 +846,7 @@ async def step_save_slot(context, slot_id, filename):
 @step('the slot {slot_id:d} is restored with filename "{filename}"')
 @async_run_until_complete
 async def step_restore_slot(context, slot_id, filename):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
                                 json={"filename": filename},
                                 headers={"Content-Type": "application/json"}) as response:
@@ -854,7 +856,7 @@ async def step_restore_slot(context, slot_id, filename):
 @step('the slot {slot_id:d} is erased')
 @async_run_until_complete
 async def step_erase_slot(context, slot_id):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
                                 headers={"Content-Type": "application/json"}) as response:
             context.response = response
@@ -863,7 +865,7 @@ async def step_erase_slot(context, slot_id):
 @step('switch {on_or_off} lora adapter {lora_id:d}')
 @async_run_until_complete
 async def toggle_lora_adapter(context, on_or_off: str, lora_id: int):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with session.post(f'{context.base_url}/lora-adapters',
                                 json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}],
                                 headers={"Content-Type": "application/json"}) as response:
@@ -899,7 +901,7 @@ async def request_completion(prompt,
             print(f"Set user_api_key: {user_api_key}")
         headers['Authorization'] = f'Bearer {user_api_key}'
 
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with session.post(f'{base_url}/completion',
                                 json={
                                     "input_prefix": prompt_prefix,
@@ -912,8 +914,7 @@ async def request_completion(prompt,
                                     "temperature": temperature if temperature is not None else 0.8,
                                     "n_probs": 2,
                                 },
-                                headers=headers,
-                                timeout=3600) as response:
+                                headers=headers) as response:
             if expect_api_error is None or not expect_api_error:
                 assert response.status == 200
                 assert response.headers['Access-Control-Allow-Origin'] == origin
@@ -971,7 +972,7 @@ async def oai_chat_completions(user_prompt,
     if async_client:
         origin = 'llama.cpp'
         headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
             async with session.post(f'{base_url}{base_path}',
                                     json=payload,
                                     headers=headers) as response:
@@ -1058,7 +1059,7 @@ async def oai_chat_completions(user_prompt,
 
 
 async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with session.post(f'{base_url}/embedding',
                                 json={
                                     "content": content,
@@ -1078,14 +1079,13 @@ async def request_oai_embeddings(input, seed,
         headers=[]
         if user_api_key is not None:
             headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
             async with session.post(f'{base_url}/v1/embeddings',
                                     json={
                                         "input": input,
                                         "model": model,
                                     },
-                                    headers=headers,
-                                    timeout=3600) as response:
+                                    headers=headers) as response:
                 assert response.status == 200, f"received status code not expected: {response.status}"
                 assert response.headers['Access-Control-Allow-Origin'] == origin
                 assert response.headers['Content-Type'] == "application/json; charset=utf-8"
@@ -1204,7 +1204,7 @@ async def wait_for_slots_status(context,
     if 'GITHUB_ACTIONS' in os.environ:
         timeout *= 2
 
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         while True:
             async with await session.get(f'{base_url}/slots', params=params) as slots_response:
                 status_code = slots_response.status
@@ -1247,7 +1247,7 @@ def assert_embeddings(embeddings):
 
 
 async def request_slots_status(context, expected_slots):
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with await session.get(f'{context.base_url}/slots') as slots_response:
             assert slots_response.status == 200
             slots = await slots_response.json()