server : refactor (ggerganov#5882)
* server : refactoring (wip)

* server : remove llava/clip objects from build

* server : fix empty prompt handling + all slots idle logic

* server : normalize id vars

* server : code style

* server : simplify model chat template validation

* server : code style

* server : minor

* llama : llama_chat_apply_template support null buf (see the sketch after this change list)

* server : do not process embedding requests when disabled

* server : reorganize structs and enums + naming fixes

* server : merge oai.hpp in utils.hpp

* server : refactor system prompt update at start

* server : disable cached prompts with self-extend

* server : do not process more than n_batch tokens per iter (see the sketch after the commit metadata below)

* server: tests: embeddings use a real embeddings model (ggerganov#5908)

* server, tests : bump batch to fit 1 embedding prompt

* server: tests: embeddings, fix randomly failing tests with build type Debug (ggerganov#5911)

* server: tests: embeddings, use different KV Cache size

* server: tests: embeddings, fixed prompt do not exceed n_batch, increase embedding timeout, reduce number of concurrent embeddings

* server: tests: embeddings, no need to wait for server idle as it can time out

* server: refactor: clean up http code (ggerganov#5912)

* server : avoid n_available var

ggml-ci

* server: refactor: better http codes

* server : simplify json parsing + add comment about t_last

* server : rename server structs

* server : allow to override FQDN in tests

ggml-ci

* server : add comments
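
One of the changes above lets llama_chat_apply_template accept a null output buffer, in which case the call only reports the required length. Below is a minimal sketch of the resulting two-pass pattern, assuming the llama.h API of this era; the apply_chat_template helper and its error handling are illustrative, not code from this commit:

    #include <string>
    #include <vector>

    #include "llama.h"

    // Hypothetical helper: format a chat with the model's built-in template
    // (tmpl == nullptr selects the template stored in the model).
    static std::string apply_chat_template(const llama_model * model, const std::vector<llama_chat_message> & chat) {
        // First pass: with this change, a null buf simply returns the required length.
        const int32_t n = llama_chat_apply_template(model, nullptr, chat.data(), chat.size(), /*add_ass=*/true, nullptr, 0);
        if (n < 0) {
            return ""; // the model's template cannot be applied
        }
        std::string out(n, '\0');
        // Second pass: same call, now writing into a buffer of the right size.
        llama_chat_apply_template(model, nullptr, chat.data(), chat.size(), /*add_ass=*/true, out.data(), (int32_t) out.size());
        return out;
    }

The same size-only call doubles as a cheap validity check, which is presumably what the "simplify model chat template validation" item relies on: a negative return means the template is not supported.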

---------

Co-authored-by: Pierrick Hymbert <[email protected]>
2 people authored and hodlen committed Apr 1, 2024
1 parent 58ea388 commit 00b2647
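
On "do not process more than n_batch tokens per iter": prompt processing across all slots is capped at n_batch tokens per pass of the server loop, and a slot whose prompt does not fit resumes from where it stopped on the next pass. A minimal, self-contained sketch of that pattern follows; the names (Slot, fill_batch) are illustrative, not the server's actual types:

    #include <cstddef>
    #include <vector>

    struct Slot {
        std::vector<int> prompt_tokens; // token ids for this request's prompt
        size_t           n_past = 0;    // how many of them have been processed so far
    };

    // Fill `out` with at most n_batch tokens taken from the pending prompts.
    // A slot whose prompt does not fit keeps its n_past and continues on the next call.
    void fill_batch(std::vector<Slot> & slots, std::vector<int> & out, size_t n_batch) {
        out.clear();
        for (auto & slot : slots) {
            while (slot.n_past < slot.prompt_tokens.size() && out.size() < n_batch) {
                out.push_back(slot.prompt_tokens[slot.n_past]);
                slot.n_past++;
            }
        }
    }

In the real server the batch also carries positions, sequence ids, and logits flags; the point here is only the per-iteration n_batch cap.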
Showing 14 changed files with 2,265 additions and 2,711 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/server.yml
@@ -58,7 +58,8 @@ jobs:
           cmake \
           python3-pip \
           wget \
-          psmisc
+          psmisc \
+          language-pack-en
       - name: Build
         id: cmake_build
5 changes: 2 additions & 3 deletions Makefile
@@ -724,10 +724,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
2 changes: 1 addition & 1 deletion examples/server-embd.py
@@ -13,7 +13,7 @@ async def main():
     model_url = "http://127.0.0.1:6900"
     responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
         url= f"{model_url}/embedding",
-        json= {"content": str(i)*1024}
+        json= {"content": str(0)*1024}
     ) for i in range(n)])
 
     for response in responses:
4 changes: 2 additions & 2 deletions examples/server/CMakeLists.txt
@@ -1,12 +1,12 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
2 changes: 1 addition & 1 deletion examples/server/README.md
@@ -436,7 +436,7 @@ Notice that each `probs` is an array of length `n_probs`.
   "next_token": {
     "has_next_token": true,
     "n_remain": -1,
-    "num_tokens_predicted": 0,
+    "n_decoded": 0,
     "stopped_eos": false,
     "stopped_limit": false,
     "stopped_word": false,
225 changes: 0 additions & 225 deletions examples/server/oai.hpp

This file was deleted.

