llama-cli, llava-cli, minicpmv-cli: build process
mtasic85 committed Nov 26, 2024
1 parent b8c617c commit 2e4609e
Showing 8 changed files with 294 additions and 331 deletions.
12 changes: 6 additions & 6 deletions Makefile_5.patch
@@ -3,37 +3,37 @@
@@ -970,7 +970,11 @@
$(DIR_COMMON)/build-info.o \
$(DIR_COMMON)/json-schema-to-grammar.o

-OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
+OBJ_LLAVA = \
+ examples/llava/llava.o \
+ examples/llava/clip.o
+
+OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) $(OBJ_LLAVA)

LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT)
LIB_GGML_S = $(LIB_PRE)ggml.a
@@ -1153,6 +1157,24 @@
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))

+#
+# llama-cpp-cffi static library
+#
+llama-cli-static: examples/main/main.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ ar rcs llama_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
+ ar rcs libllama_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
+
+llava-cli-static: examples/llava/llava-cli.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ ar rcs llava_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
+ ar rcs libllava_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
+
+minicpmv-cli-static: examples/llava/minicpmv-cli.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ ar rcs llava_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
+ ar rcs libminicpmv_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
+
llama-cli: examples/main/main.cpp \
$(OBJ_ALL)
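The new llama-cli-static, llava-cli-static and minicpmv-cli-static targets compile each CLI entry point and archive it together with the GGML, llama, common and (newly added) OBJ_LLAVA objects, so the result can be linked into a Python extension instead of being run as a standalone binary. Below is a minimal sketch of how a cffi build script could consume the llama_cli.a archive; the module name, declarations and linker flags are assumptions for illustration, not part of this commit.

from cffi import FFI

ffibuilder = FFI()

# Callback and entry-point declarations; they mirror the ffi.cast() strings and
# the _llama_cli_main call in llama/llama_cli.py.
ffibuilder.cdef('''
typedef void (*_llama_yield_token_t)(const char * token);
typedef int (*_llama_should_stop_t)(void);
int _llama_cli_main(int argc,
                    char * argv[],
                    _llama_yield_token_t _llama_yield_token,
                    _llama_should_stop_t _llama_should_stop);
''')

ffibuilder.set_source(
    'llama._llama_cli_cpu',            # assumed module name, matching `from ._llama_cli_cpu import lib, ffi`
    '''
    typedef void (*_llama_yield_token_t)(const char * token);
    typedef int (*_llama_should_stop_t)(void);
    int _llama_cli_main(int argc,
                        char * argv[],
                        _llama_yield_token_t _llama_yield_token,
                        _llama_should_stop_t _llama_should_stop);
    ''',
    extra_objects=['llama_cli.a'],     # archive produced by `make llama-cli-static`
    libraries=['stdc++'],              # the archived objects are C++; the real build likely needs more flags
)

if __name__ == '__main__':
    ffibuilder.compile(verbose=True)

This sketch assumes _llama_cli_main is exported with C linkage by the patched llama.cpp sources.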
5 changes: 5 additions & 0 deletions examples/demo_models.py
@@ -28,6 +28,11 @@
hf_repo='bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF',
hf_file='SmolLM-1.7B-Instruct-v0.2-Q4_K_M.gguf',
),
'HuggingFaceTB/SmolLM2-1.7B-Instruct': Model(
creator_hf_repo='HuggingFaceTB/SmolLM2-1.7B-Instruct',
hf_repo='bartowski/SmolLM2-1.7B-Instruct-GGUF',
hf_file='SmolLM2-1.7B-Instruct-Q4_K_M.gguf',
),
'microsoft/phi-2': Model(
creator_hf_repo='microsoft/phi-2',
hf_repo='andrijdavid/phi-2-GGUF',
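Each Model entry pairs the creator repository (used for its config, e.g. get_config(model.creator_hf_repo) in the demos) with a quantized GGUF repository and file name. How the GGUF file is actually fetched is not shown in this diff; as an illustration only, the new SmolLM2 entry could be resolved with huggingface_hub like this:

from huggingface_hub import hf_hub_download

gguf_path = hf_hub_download(
    repo_id='bartowski/SmolLM2-1.7B-Instruct-GGUF',   # hf_repo of the new entry
    filename='SmolLM2-1.7B-Instruct-Q4_K_M.gguf',     # hf_file of the new entry
)
print(gguf_path)  # local path that would be handed to the llama.cpp runtime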
2 changes: 1 addition & 1 deletion examples/demo_smollm_chat.py
@@ -26,7 +26,7 @@ def demo(model: Model):

if __name__ == '__main__':
models_ids: list[str] = [
'HuggingFaceTB/SmolLM-1.7B-Instruct-v0.2',
'HuggingFaceTB/SmolLM2-1.7B-Instruct',
]

for model_id in models_ids:
8 changes: 4 additions & 4 deletions examples/demo_smollm_tool.py
@@ -7,7 +7,7 @@
def demo(model: Model):
print(model)
config = get_config(model.creator_hf_repo)

options = Options(
ctx_size=config.max_position_embeddings,
predict=-2,
@@ -27,9 +27,9 @@ def demo(model: Model):

if __name__ == '__main__':
models_ids: list[str] = [
'HuggingFaceTB/SmolLM-1.7B-Instruct-v0.2',
'HuggingFaceTB/SmolLM2-1.7B-Instruct',
]

for model_id in models_ids:
model: Model = models[model_id]
demo(model)
demo(model)
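The demos now point at SmolLM2, resolve the context size from the creator repository's config, and stream the result. A condensed usage sketch follows; the import paths, the extra Options fields and the exact return type of llama_generate are assumptions based on what the demo diffs show, not verbatim project code.

from llama import get_config, llama_generate, Options, Model   # assumed import surface
from demo_models import models                                  # the table extended above

model: Model = models['HuggingFaceTB/SmolLM2-1.7B-Instruct']
config = get_config(model.creator_hf_repo)

options = Options(
    ctx_size=config.max_position_embeddings,   # context size taken from the HF config
    predict=-2,                                 # llama.cpp convention: generate until the context is full
    # ... remaining fields (model, prompt, ...) as in examples/demo_smollm_tool.py
)

for chunk in llama_generate(options):           # assumed: llama_generate yields text chunks
    print(chunk, end='', flush=True)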
16 changes: 6 additions & 10 deletions llama/__init__.py
@@ -1,11 +1,7 @@
import os
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
# import os
# os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'

from .formatter import *
from .model import *
from .options import *

try:
from .llama_cli import *
except ImportError:
pass
from .formatter import * # noqa
from .model import * # noqa
from .options import * # noqa
from .llama_cli import * # noqa
48 changes: 26 additions & 22 deletions llama/llama_cli.py
@@ -1,7 +1,6 @@
__all__ = ['llama_generate']

import os
import json
import ctypes
from queue import Queue
from copy import deepcopy
@@ -21,22 +21,27 @@
LLAMA_CPP_BACKEND = os.getenv('LLAMA_CPP_BACKEND', None)


if LLAMA_CPP_BACKEND:
if LLAMA_CPP_BACKEND in ('cuda', 'CUDA'):
from ._llama_cli_cuda_12_6 import lib, ffi
elif LLAMA_CPP_BACKEND in ('vulkan', 'VULKAN'):
from ._llama_cli_vulkan_1_x import lib, ffi
elif LLAMA_CPP_BACKEND in ('cpu', 'CPU'):
from ._llama_cli_cpu import lib, ffi
try:
if LLAMA_CPP_BACKEND:
if LLAMA_CPP_BACKEND in ('cuda', 'CUDA'):
from ._llama_cli_cuda_12_6_3 import lib, ffi
elif LLAMA_CPP_BACKEND in ('vulkan', 'VULKAN'):
from ._llama_cli_vulkan_1_x import lib, ffi
elif LLAMA_CPP_BACKEND in ('cpu', 'CPU'):
from ._llama_cli_cpu import lib, ffi
else:
raise ValueError(f'{LLAMA_CPP_BACKEND = }')
else:
raise ValueError(f'{LLAMA_CPP_BACKEND = }')
else:
if is_cuda_available():
from ._llama_cli_cuda_12_6 import lib, ffi
elif is_vulkan_available():
from ._llama_cli_vulkan_1_x import lib, ffi
else:
from ._llama_cli_cpu import lib, ffi
if is_cuda_available():
from ._llama_cli_cuda_12_6_3 import lib, ffi
elif is_vulkan_available():
from ._llama_cli_vulkan_1_x import lib, ffi
else:
from ._llama_cli_cpu import lib, ffi
except ImportError:
from ._llama_cli_cpu import lib, ffi
except ModuleNotFoundError:
from ._llama_cli_cpu import lib, ffi


_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
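The hunk above wraps backend selection in try/except: an explicit LLAMA_CPP_BACKEND value ('cuda', 'vulkan' or 'cpu') is honored first, otherwise is_cuda_available()/is_vulkan_available() are probed, and if the chosen extension module cannot be imported the CPU build is used instead (an unrecognized value still raises ValueError, which is not caught). A short usage sketch, assuming the package is installed as llama:

import os
os.environ['LLAMA_CPP_BACKEND'] = 'cuda'   # must be set before the first import of llama

from llama import llama_generate           # loads _llama_cli_cuda_12_6_3 if importable,
                                           # otherwise falls back to _llama_cli_cpu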
@@ -55,7 +55,7 @@ def _llama_yield_token_func(chunk_bytes: bytes, queue: Queue, metadata: dict):
return

metadata['prev_chunk_bytes'] = b''

if not stop_on_special_token:
queue.put(chunk)
return
@@ -64,10 +64,10 @@ def _llama_yield_token_func(chunk_bytes: bytes, queue: Queue, metadata: dict):
buffer = metadata['buffer']
buffer += chunk
metadata['buffer'] = buffer

subtoken_found = False
token_found = False

for token in special_tokens:
for i in range(len(token)):
subtoken = token[:i + 1]
@@ -85,13 +85,13 @@ def _llama_yield_token_func(chunk_bytes: bytes, queue: Queue, metadata: dict):
metadata['buffer'] = buffer
metadata['should_stop'] = True
token_found = True

if subtoken_found:
return

if token_found:
return

buffer = metadata['buffer']
queue.put(buffer)
metadata['buffer'] = ''
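The hunks above mostly trim trailing whitespace, but they sit inside the stop-token handling of _llama_yield_token_func: chunks are accumulated in metadata['buffer'] and held back while they could still grow into a special token, and generation is flagged to stop once a full special token is seen. A simplified, standalone sketch of that buffering idea (not the module's exact code):

def stream_until_stop(chunks, stop_tokens):
    """Yield text from `chunks`, holding back anything that could still grow
    into one of `stop_tokens`; stop as soon as a full stop token appears."""
    buffer = ''
    for chunk in chunks:
        buffer += chunk
        for token in stop_tokens:
            idx = buffer.find(token)
            if idx != -1:
                if idx:
                    yield buffer[:idx]        # emit the text before the stop token
                return                         # full stop token seen: stop generation
        # Longest suffix of the buffer that is a proper prefix of some stop token.
        hold = 0
        for token in stop_tokens:
            for i in range(1, len(token)):
                if buffer.endswith(token[:i]):
                    hold = max(hold, i)
        if len(buffer) > hold:
            yield buffer[:len(buffer) - hold]
            buffer = buffer[len(buffer) - hold:]
    if buffer:
        yield buffer                           # stream ended on a dangling partial match

# e.g. list(stream_until_stop(['Hello <', '|end', '|> world'], ['<|end|>'])) == ['Hello ']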
@@ -111,7 +111,7 @@ def _llama_cli_main(argc, argv, queue: Queue, metadata: dict):
cffi__llama_yield_token_callback = ffi.cast('void (*_llama_yield_token_t)(const char * token)', _llama_yield_token_address)
cffi__llama_should_stop_callback = ffi.cast('int (*_llama_should_stop_t)(void)', _llama_should_stop_address)

r = lib._llama_cli_main(argc, argv, cffi__llama_yield_token_callback, cffi__llama_should_stop_callback, 1)
r = lib._llama_cli_main(argc, argv, cffi__llama_yield_token_callback, cffi__llama_should_stop_callback)
# assert r == 0
queue.put(None)
