From 2e4609eb489d43217a5d7e77045827ccb81e7806 Mon Sep 17 00:00:00 2001
From: Marko Tasic <mtasic85@gmail.com>
Date: Tue, 26 Nov 2024 16:45:21 +0100
Subject: [PATCH] llama-cli, llava-cli, minicpmv-cli: build process

---
 Makefile_5.patch             |  12 +-
 examples/demo_models.py      |   5 +
 examples/demo_smollm_chat.py |   2 +-
 examples/demo_smollm_tool.py |   8 +-
 llama/__init__.py            |  16 +-
 llama/llama_cli.py           |  48 +++--
 poetry.lock                  | 138 ++++++------
 scripts/build.py             | 396 +++++++++++++++--------------------
 8 files changed, 294 insertions(+), 331 deletions(-)

diff --git a/Makefile_5.patch b/Makefile_5.patch
index caa922d..3400b3f 100644
--- a/Makefile_5.patch
+++ b/Makefile_5.patch
@@ -3,37 +3,37 @@
 @@ -970,7 +970,11 @@
  	$(DIR_COMMON)/build-info.o \
  	$(DIR_COMMON)/json-schema-to-grammar.o
- 
+
 -OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
 +OBJ_LLAVA = \
 +	examples/llava/llava.o \
 +	examples/llava/clip.o
 +
 +OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) $(OBJ_LLAVA)
- 
+
  LIB_GGML   = $(LIB_PRE)ggml$(DSO_EXT)
  LIB_GGML_S = $(LIB_PRE)ggml.a
 @@ -1153,6 +1157,24 @@
  # Helper function that replaces .c, .cpp, and .cu file endings with .o:
  GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
- 
+
 +#
 +# llama-cpp-cffi static library
 +#
 +llama-cli-static: examples/main/main.cpp \
 +	$(OBJ_ALL)
 +	$(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-+	ar rcs llama_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
++	ar rcs libllama_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
 +
 +llava-cli-static: examples/llava/llava-cli.cpp \
 +	$(OBJ_ALL)
 +	$(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-+	ar rcs llava_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
++	ar rcs libllava_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
 +
 +minicpmv-cli-static: examples/llava/minicpmv-cli.cpp \
 +	$(OBJ_ALL)
 +	$(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-+	ar rcs llava_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
++	ar rcs libminicpmv_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
 +
  llama-cli: examples/main/main.cpp \
  	$(OBJ_ALL)
diff --git a/examples/demo_models.py b/examples/demo_models.py
index 880e677..df46074 100644
--- a/examples/demo_models.py
+++ b/examples/demo_models.py
@@ -28,6 +28,11 @@
         hf_repo='bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF',
         hf_file='SmolLM-1.7B-Instruct-v0.2-Q4_K_M.gguf',
     ),
+    'HuggingFaceTB/SmolLM2-1.7B-Instruct': Model(
+        creator_hf_repo='HuggingFaceTB/SmolLM2-1.7B-Instruct',
+        hf_repo='bartowski/SmolLM2-1.7B-Instruct-GGUF',
+        hf_file='SmolLM2-1.7B-Instruct-Q4_K_M.gguf',
+    ),
     'microsoft/phi-2': Model(
         creator_hf_repo='microsoft/phi-2',
         hf_repo='andrijdavid/phi-2-GGUF',
diff --git a/examples/demo_smollm_chat.py b/examples/demo_smollm_chat.py
index 8e5ef14..a95381b 100644
--- a/examples/demo_smollm_chat.py
+++ b/examples/demo_smollm_chat.py
@@ -26,7 +26,7 @@ def demo(model: Model):
 
 if __name__ == '__main__':
     models_ids: list[str] = [
-        'HuggingFaceTB/SmolLM-1.7B-Instruct-v0.2',
+        'HuggingFaceTB/SmolLM2-1.7B-Instruct',
     ]
 
     for model_id in models_ids:
diff --git a/examples/demo_smollm_tool.py b/examples/demo_smollm_tool.py
index cb50d6d..3547f3b 100644
--- a/examples/demo_smollm_tool.py
+++ b/examples/demo_smollm_tool.py
@@ -7,7 +7,7 @@
 def demo(model: Model):
     print(model)
     config = get_config(model.creator_hf_repo)
-    
+
     options = Options(
         ctx_size=config.max_position_embeddings,
         predict=-2,
@@ -27,9 +27,9 @@ def demo(model: Model):
 
 if __name__ == '__main__':
     models_ids: list[str] = [
-        'HuggingFaceTB/SmolLM-1.7B-Instruct-v0.2',
+        'HuggingFaceTB/SmolLM2-1.7B-Instruct',
     ]
-    
+
     for model_id in models_ids:
         model: Model = models[model_id]
-        demo(model)
\ No newline at end of file
+        demo(model)
diff --git a/llama/__init__.py b/llama/__init__.py
index d948efe..24cabe9 100644
--- a/llama/__init__.py
+++ b/llama/__init__.py
@@ -1,11 +1,7 @@
-import os
-os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
+# import os
+# os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
 
-from .formatter import *
-from .model import *
-from .options import *
-
-try:
-    from .llama_cli import *
-except ImportError:
-    pass
+from .formatter import * # noqa
+from .model import * # noqa
+from .options import * # noqa
+from .llama_cli import * # noqa
diff --git a/llama/llama_cli.py b/llama/llama_cli.py
index e70e535..f3f1510 100644
--- a/llama/llama_cli.py
+++ b/llama/llama_cli.py
@@ -1,7 +1,6 @@
 __all__ = ['llama_generate']
 
 import os
-import json
 import ctypes
 from queue import Queue
 from copy import deepcopy
@@ -21,22 +20,27 @@
 LLAMA_CPP_BACKEND = os.getenv('LLAMA_CPP_BACKEND', None)
 
 
-if LLAMA_CPP_BACKEND:
-    if LLAMA_CPP_BACKEND in ('cuda', 'CUDA'):
-        from ._llama_cli_cuda_12_6 import lib, ffi
-    elif LLAMA_CPP_BACKEND in ('vulkan', 'VULKAN'):
-        from ._llama_cli_vulkan_1_x import lib, ffi
-    elif LLAMA_CPP_BACKEND in ('cpu', 'CPU'):
-        from ._llama_cli_cpu import lib, ffi
+try:
+    if LLAMA_CPP_BACKEND:
+        if LLAMA_CPP_BACKEND in ('cuda', 'CUDA'):
+            from ._llama_cli_cuda_12_6_3 import lib, ffi
+        elif LLAMA_CPP_BACKEND in ('vulkan', 'VULKAN'):
+            from ._llama_cli_vulkan_1_x import lib, ffi
+        elif LLAMA_CPP_BACKEND in ('cpu', 'CPU'):
+            from ._llama_cli_cpu import lib, ffi
+        else:
+            raise ValueError(f'{LLAMA_CPP_BACKEND = }')
     else:
-        raise ValueError(f'{LLAMA_CPP_BACKEND = }')
-else:
-    if is_cuda_available():
-        from ._llama_cli_cuda_12_6 import lib, ffi
-    elif is_vulkan_available():
-        from ._llama_cli_vulkan_1_x import lib, ffi
-    else:
-        from ._llama_cli_cpu import lib, ffi
+        if is_cuda_available():
+            from ._llama_cli_cuda_12_6_3 import lib, ffi
+        elif is_vulkan_available():
+            from ._llama_cli_vulkan_1_x import lib, ffi
+        else:
+            from ._llama_cli_cpu import lib, ffi
+except ImportError:
+    from ._llama_cli_cpu import lib, ffi
+except ModuleNotFoundError:
+    from ._llama_cli_cpu import lib, ffi
 
 
 _LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
@@ -55,7 +59,7 @@ def _llama_yield_token_func(chunk_bytes: bytes, queue: Queue, metadata: dict):
         return
 
     metadata['prev_chunk_bytes'] = b''
-    
+
     if not stop_on_special_token:
         queue.put(chunk)
         return
@@ -64,10 +68,10 @@ def _llama_yield_token_func(chunk_bytes: bytes, queue: Queue, metadata: dict):
     buffer = metadata['buffer']
     buffer += chunk
     metadata['buffer'] = buffer
-    
+
     subtoken_found = False
     token_found = False
-    
+
     for token in special_tokens:
         for i in range(len(token)):
             subtoken = token[:i + 1]
@@ -85,13 +89,13 @@ def _llama_yield_token_func(chunk_bytes: bytes, queue: Queue, metadata: dict):
                     metadata['buffer'] = buffer
                     metadata['should_stop'] = True
                     token_found = True
-    
+
     if subtoken_found:
         return
 
     if token_found:
         return
-    
+
     buffer = metadata['buffer']
     queue.put(buffer)
     metadata['buffer'] = ''
@@ -111,7 +115,7 @@ def _llama_cli_main(argc, argv, queue: Queue, metadata: dict):
     cffi__llama_yield_token_callback = ffi.cast('void (*_llama_yield_token_t)(const char * token)', _llama_yield_token_address)
     cffi__llama_should_stop_callback = ffi.cast('int (*_llama_should_stop_t)(void)', _llama_should_stop_address)
 
-    r = lib._llama_cli_main(argc, argv, cffi__llama_yield_token_callback, cffi__llama_should_stop_callback, 1)
+    r = lib._llama_cli_main(argc, argv, cffi__llama_yield_token_callback, cffi__llama_should_stop_callback)
     # assert r == 0
     queue.put(None)
 
diff --git a/poetry.lock b/poetry.lock
index ae6cd43..01d312e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1218,13 +1218,13 @@ pyyaml = ">=5.1"
 
 [[package]]
 name = "mkdocs-material"
-version = "9.5.45"
+version = "9.5.46"
 description = "Documentation that simply works"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "mkdocs_material-9.5.45-py3-none-any.whl", hash = "sha256:a9be237cfd0be14be75f40f1726d83aa3a81ce44808dc3594d47a7a592f44547"},
-    {file = "mkdocs_material-9.5.45.tar.gz", hash = "sha256:286489cf0beca4a129d91d59d6417419c63bceed1ce5cd0ec1fc7e1ebffb8189"},
+    {file = "mkdocs_material-9.5.46-py3-none-any.whl", hash = "sha256:98f0a2039c62e551a68aad0791a8d41324ff90c03a6e6cea381a384b84908b83"},
+    {file = "mkdocs_material-9.5.46.tar.gz", hash = "sha256:ae2043f4238e572f9a40e0b577f50400d6fc31e2fef8ea141800aebf3bd273d7"},
 ]
 
 [package.dependencies]
@@ -1450,13 +1450,13 @@ files = [
 
 [[package]]
 name = "openai"
-version = "1.55.0"
+version = "1.55.1"
 description = "The official Python library for the openai API"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "openai-1.55.0-py3-none-any.whl", hash = "sha256:446e08918f8dd70d8723274be860404c8c7cc46b91b93bbc0ef051f57eb503c1"},
-    {file = "openai-1.55.0.tar.gz", hash = "sha256:6c0975ac8540fe639d12b4ff5a8e0bf1424c844c4a4251148f59f06c4b2bd5db"},
+    {file = "openai-1.55.1-py3-none-any.whl", hash = "sha256:d10d96a4f9dc5f05d38dea389119ec8dcd24bc9698293c8357253c601b4a77a5"},
+    {file = "openai-1.55.1.tar.gz", hash = "sha256:471324321e7739214f16a544e801947a046d3c5d516fae8719a317234e4968d3"},
 ]
 
 [package.dependencies]
@@ -1684,62 +1684,82 @@ test = ["pytest", "pytest-xdist", "setuptools"]
 
 [[package]]
 name = "pycares"
-version = "4.4.0"
+version = "4.5.0"
 description = "Python interface for c-ares"
 optional = true
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 files = [
-    {file = "pycares-4.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:24da119850841d16996713d9c3374ca28a21deee056d609fbbed29065d17e1f6"},
-    {file = "pycares-4.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8f64cb58729689d4d0e78f0bfb4c25ce2f851d0274c0273ac751795c04b8798a"},
-    {file = "pycares-4.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d33e2a1120887e89075f7f814ec144f66a6ce06a54f5722ccefc62fbeda83cff"},
-    {file = "pycares-4.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c680fef1b502ee680f8f0b95a41af4ec2c234e50e16c0af5bbda31999d3584bd"},
-    {file = "pycares-4.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fff16b09042ba077f7b8aa5868d1d22456f0002574d0ba43462b10a009331677"},
-    {file = "pycares-4.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:229a1675eb33bc9afb1fc463e73ee334950ccc485bc83a43f6ae5839fb4d5fa3"},
-    {file = "pycares-4.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3aebc73e5ad70464f998f77f2da2063aa617cbd8d3e8174dd7c5b4518f967153"},
-    {file = "pycares-4.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6ef64649eba56448f65e26546d85c860709844d2fc22ef14d324fe0b27f761a9"},
-    {file = "pycares-4.4.0-cp310-cp310-win32.whl", hash = "sha256:4afc2644423f4eef97857a9fd61be9758ce5e336b4b0bd3d591238bb4b8b03e0"},
-    {file = "pycares-4.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:5ed4e04af4012f875b78219d34434a6d08a67175150ac1b79eb70ab585d4ba8c"},
-    {file = "pycares-4.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:bce8db2fc6f3174bd39b81405210b9b88d7b607d33e56a970c34a0c190da0490"},
-    {file = "pycares-4.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9a0303428d013ccf5c51de59c83f9127aba6200adb7fd4be57eddb432a1edd2a"},
-    {file = "pycares-4.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afb91792f1556f97be7f7acb57dc7756d89c5a87bd8b90363a77dbf9ea653817"},
-    {file = "pycares-4.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b61579cecf1f4d616e5ea31a6e423a16680ab0d3a24a2ffe7bb1d4ee162477ff"},
-    {file = "pycares-4.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7af06968cbf6851566e806bf3e72825b0e6671832a2cbe840be1d2d65350710"},
-    {file = "pycares-4.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ceb12974367b0a68a05d52f4162b29f575d241bd53de155efe632bf2c943c7f6"},
-    {file = "pycares-4.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2eeec144bcf6a7b6f2d74d6e70cbba7886a84dd373c886f06cb137a07de4954c"},
-    {file = "pycares-4.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e3a6f7cfdfd11eb5493d6d632e582408c8f3b429f295f8799c584c108b28db6f"},
-    {file = "pycares-4.4.0-cp311-cp311-win32.whl", hash = "sha256:34736a2ffaa9c08ca9c707011a2d7b69074bbf82d645d8138bba771479b2362f"},
-    {file = "pycares-4.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:eb66c30eb11e877976b7ead13632082a8621df648c408b8e15cdb91a452dd502"},
-    {file = "pycares-4.4.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:fd644505a8cfd7f6584d33a9066d4e3d47700f050ef1490230c962de5dfb28c6"},
-    {file = "pycares-4.4.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:52084961262232ec04bd75f5043aed7e5d8d9695e542ff691dfef0110209f2d4"},
-    {file = "pycares-4.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0c5368206057884cde18602580083aeaad9b860e2eac14fd253543158ce1e93"},
-    {file = "pycares-4.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:112a4979c695b1c86f6782163d7dec58d57a3b9510536dcf4826550f9053dd9a"},
-    {file = "pycares-4.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8d186dafccdaa3409194c0f94db93c1a5d191145a275f19da6591f9499b8e7b8"},
-    {file = "pycares-4.4.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:64965dc19c578a683ea73487a215a8897276224e004d50eeb21f0bc7a0b63c88"},
-    {file = "pycares-4.4.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:ed2a38e34bec6f2586435f6ff0bc5fe11d14bebd7ed492cf739a424e81681540"},
-    {file = "pycares-4.4.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:94d6962db81541eb0396d2f0dfcbb18cdb8c8b251d165efc2d974ae652c547d4"},
-    {file = "pycares-4.4.0-cp312-cp312-win32.whl", hash = "sha256:1168a48a834813aa80f412be2df4abaf630528a58d15c704857448b20b1675c0"},
-    {file = "pycares-4.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:db24c4e7fea4a052c6e869cbf387dd85d53b9736cfe1ef5d8d568d1ca925e977"},
-    {file = "pycares-4.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:21a5a0468861ec7df7befa69050f952da13db5427ae41ffe4713bc96291d1d95"},
-    {file = "pycares-4.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:22c00bf659a9fa44d7b405cf1cd69b68b9d37537899898d8cbe5dffa4016b273"},
-    {file = "pycares-4.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23aa3993a352491a47fcf17867f61472f32f874df4adcbb486294bd9fbe8abee"},
-    {file = "pycares-4.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:813d661cbe2e37d87da2d16b7110a6860e93ddb11735c6919c8a3545c7b9c8d8"},
-    {file = "pycares-4.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:77cf5a2fd5583c670de41a7f4a7b46e5cbabe7180d8029f728571f4d2e864084"},
-    {file = "pycares-4.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:3eaa6681c0a3e3f3868c77aca14b7760fed35fdfda2fe587e15c701950e7bc69"},
-    {file = "pycares-4.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ad58e284a658a8a6a84af2e0b62f2f961f303cedfe551854d7bd40c3cbb61912"},
-    {file = "pycares-4.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bfb89ca9e3d0a9b5332deeb666b2ede9d3469107742158f4aeda5ce032d003f4"},
-    {file = "pycares-4.4.0-cp38-cp38-win32.whl", hash = "sha256:f36bdc1562142e3695555d2f4ac0cb69af165eddcefa98efc1c79495b533481f"},
-    {file = "pycares-4.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:902461a92b6a80fd5041a2ec5235680c7cc35e43615639ec2a40e63fca2dfb51"},
-    {file = "pycares-4.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7bddc6adba8f699728f7fc1c9ce8cef359817ad78e2ed52b9502cb5f8dc7f741"},
-    {file = "pycares-4.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cb49d5805cd347c404f928c5ae7c35e86ba0c58ffa701dbe905365e77ce7d641"},
-    {file = "pycares-4.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56cf3349fa3a2e67ed387a7974c11d233734636fe19facfcda261b411af14d80"},
-    {file = "pycares-4.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bf2eaa83a5987e48fa63302f0fe7ce3275cfda87b34d40fef9ce703fb3ac002"},
-    {file = "pycares-4.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82bba2ab77eb5addbf9758d514d9bdef3c1bfe7d1649a47bd9a0d55a23ef478b"},
-    {file = "pycares-4.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c6a8bde63106f162fca736e842a916853cad3c8d9d137e11c9ffa37efa818b02"},
-    {file = "pycares-4.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f5f646eec041db6ffdbcaf3e0756fb92018f7af3266138c756bb09d2b5baadec"},
-    {file = "pycares-4.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9dc04c54c6ea615210c1b9e803d0e2d2255f87a3d5d119b6482c8f0dfa15b26b"},
-    {file = "pycares-4.4.0-cp39-cp39-win32.whl", hash = "sha256:97892cced5794d721fb4ff8765764aa4ea48fe8b2c3820677505b96b83d4ef47"},
-    {file = "pycares-4.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:917f08f0b5d9324e9a34211e68d27447c552b50ab967044776bbab7e42a553a2"},
-    {file = "pycares-4.4.0.tar.gz", hash = "sha256:f47579d508f2f56eddd16ce72045782ad3b1b3b678098699e2b6a1b30733e1c2"},
+    {file = "pycares-4.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13a82fad8239d6fbcf916099bee17d8b5666d0ddb77dace431e0f7961c9427ab"},
+    {file = "pycares-4.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fefc7bebbe39b2e3b4b9615471233a8f7356b96129a7db9030313a3ae4ecc42d"},
+    {file = "pycares-4.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e322e8ce810026f6e0c7c2a254b9ed02191ab8d42fa2ce6808ede1bdccab8e65"},
+    {file = "pycares-4.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:723ba0803b016294430e40e544503fed9164949b694342c2552ab189e2b688ef"},
+    {file = "pycares-4.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e48b20b59cdc929cc712a8b22e89c273256e482b49bb8999af98d2c6fc4563c2"},
+    {file = "pycares-4.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de6e55bd9af595b112ac6080ac0a0d52b5853d0d8e6d01ac65ff09e51e62490a"},
+    {file = "pycares-4.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a6f4b9063e3dd70460400367917698f209c10aabb68bf70b09e364895444487d"},
+    {file = "pycares-4.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:95522d4840d702fd766439a7c7cd747935aa54cf0b8675e9fadd8414dd9dd0df"},
+    {file = "pycares-4.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4709ce4fd9dbee24b1397f71a2adb3267323bb5ad5e7fde3f87873d172dd156"},
+    {file = "pycares-4.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8addbf3408af1010f50fd67ef634a6cb239ccb9c534c32a40713f3b8d306a98e"},
+    {file = "pycares-4.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:d0428ef42fcf575e197047e6a47892404faa34231902a453b3dfed66af4178b3"},
+    {file = "pycares-4.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:aed5c2732f3a6bdbbfab202267d37044ca1162f690b9d34b7ece97ba43f27453"},
+    {file = "pycares-4.5.0-cp310-cp310-win32.whl", hash = "sha256:b1859ea770a7abec40a6d02b5ab03c2396c4900c01f4e50ddb6c0dca4c2a6a7c"},
+    {file = "pycares-4.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:9f87d8da20a3a80ab05fe80c14a62bf078bd726ca6af609edbeb376fb97d50ab"},
+    {file = "pycares-4.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ca7a1dba7b88290710db45012e0903c21c839fa0a2b9ddc100bba8e66bfb251"},
+    {file = "pycares-4.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:160e92588cdf1a0fa3a7015f47990b508d50efd9109ea4d719dee31c058f0648"},
+    {file = "pycares-4.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f38e45d23660ed1dafdb956fd263ae4735530ef1578aa2bf2caabb94cee4523"},
+    {file = "pycares-4.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f742acc6d29a99ffc14e3f154b3848ea05c5533b71065e0f0a0fd99c527491b2"},
+    {file = "pycares-4.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ceaf71bcd7b6447705e689b8fee8836c20c6148511a90122981f524a84bfcca9"},
+    {file = "pycares-4.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdc3c0be7b5b83e78e28818fecd0405bd401110dd6e2e66f7f10713c1188362c"},
+    {file = "pycares-4.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd458ee69800195247aa19b5675c5914cbc091c5a220e4f0e96777a31bb555c1"},
+    {file = "pycares-4.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a6649d713df73266708642fc3d04f110c0a66bee510fbce4cc5fed79df42083"},
+    {file = "pycares-4.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ac57d7bda925c10b997434e7ce30a2c3689c2e96bab9fd0a1165d5577378eecd"},
+    {file = "pycares-4.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ba17d8e5eeec4b2e0eb1a6a840bae9e62cd1c1c9cbc8dc9db9d1b9fdf33d0b54"},
+    {file = "pycares-4.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:9e9b7d1a8de703283e4735c0e532ba4bc600e88de872dcd1a9a4950cf74d9f4f"},
+    {file = "pycares-4.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4c6922ecbe458c13a4a2c1177bbce38abc44b5f086bc82115a92eab34418915f"},
+    {file = "pycares-4.5.0-cp311-cp311-win32.whl", hash = "sha256:1004b8a17614e33410b4b1bb68360977667f1cc9ab2dbcfb27240d6703e4cb6a"},
+    {file = "pycares-4.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:2c9c1055c622258a0f315560b2880a372363484b87cbef48af092624804caa72"},
+    {file = "pycares-4.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:506efbe5017807747ccd1bdcb3c2f6e64635bc01fee01a50c0b97d649018c162"},
+    {file = "pycares-4.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c469ec9fbe0526f45a98f67c1ea55be03abf30809c4f9c9be4bc93fb6806304d"},
+    {file = "pycares-4.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:597c0950ede240c3a779f023fcf2442207fc11e570d3ca4ccdbb0db5bbaf2588"},
+    {file = "pycares-4.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9aa0da03c4df6ed0f87dd52a293bd0508734515041cc5be0f85d9edc1814914f"},
+    {file = "pycares-4.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aea1ebf52767c777d10a1b3d03844b9b05cc892714b3ee177d5d9fbff74fb9fa"},
+    {file = "pycares-4.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb20d84269ddffb177b6048e3bc03d0b9ffe17592093d900d5544805958d86b3"},
+    {file = "pycares-4.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3125df81b657971ee5c0333f8f560ba0151db1eb7cf04aea7d783bb433b306c1"},
+    {file = "pycares-4.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:525c77ea44546c12f379641aee163585d403cf50e29b04a06059d6aac894e956"},
+    {file = "pycares-4.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:1fd87cb26b317a9988abfcfa4e4dbc55d5f20177e5979ad4d854468a9246c187"},
+    {file = "pycares-4.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a90aecd41188884e57ae32507a2c6b010c60b791a253083761bbb37a488ecaed"},
+    {file = "pycares-4.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0d3de65cab653979dcc491e03f596566c9d40346c9deb088e0f9fe70600d8737"},
+    {file = "pycares-4.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:27a77b43604b3ba24e4fc49fd3ea59f50f7d89c7255f1f1ea46928b26cccacfa"},
+    {file = "pycares-4.5.0-cp312-cp312-win32.whl", hash = "sha256:6028cb8766f0fea1d2caa69fac23621fbe2cff9ce6968374e165737258703a33"},
+    {file = "pycares-4.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:2ce10672c4cfd1c5fb6718e8b25f0336ca11c89aab88aa6df53dafc4e41df740"},
+    {file = "pycares-4.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:011cd670da7caf55664c944abb71ec39af82b837f8d48da7cf0eec80f5682c4c"},
+    {file = "pycares-4.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b5c67930497fb2b1dbcaa85f8c4188fc2cb62e41d787deeed2d33cfe9dd6bf52"},
+    {file = "pycares-4.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d435a3b8468c656a7e7180dd7c4794510f6c612c33ad61a0fff6e440621f8b5"},
+    {file = "pycares-4.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8371f5ee1efb33d6276e275d152c9c5605e5f2e58a9e168519ec1f9e13dd95ae"},
+    {file = "pycares-4.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c76a9096fd5dc49c61c5235ea7032e8b43f4382800d64ca1e0e0cda700c082aa"},
+    {file = "pycares-4.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b604af76b57469ff68b44e9e4c857eaee43bc5035f4f183f07f4f7149191fe1b"},
+    {file = "pycares-4.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c589bd4f9160bfdb2f8080cf564bb120a4312cf091db07fe417f8e58a896a63c"},
+    {file = "pycares-4.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:361262805bb09742c364ec0117842043c950339e38561009bcabbb6ac89458ef"},
+    {file = "pycares-4.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:6d2afb3c0776467055bf33db843ef483d25639be0f32e3a13ef5d4dc64098bf5"},
+    {file = "pycares-4.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bc7a1d8ed7c7a4de17706a3c89b305b02eb64c778897e6727c043e5b9dd0d853"},
+    {file = "pycares-4.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:5703ec878b5c1efacdbf24ceaedfa606112fc67af5564f4db99c2c210f3ffadc"},
+    {file = "pycares-4.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d87758e09dbf52c27ed7cf7bc7eaf8b3226217d10c52b03d61a14d59f40fcae1"},
+    {file = "pycares-4.5.0-cp313-cp313-win32.whl", hash = "sha256:3316d490b4ce1a69f034881ac1ea7608f5f24ea5293db24ab574ac70b7d7e407"},
+    {file = "pycares-4.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:018e700fb0d1a2db5ec96e404ffa85ed97cc96e96d6af0bb9548111e37cf36a3"},
+    {file = "pycares-4.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:78c9890d93108c70708babee8a783e6021233f1f0a763d3634add6fd429aae58"},
+    {file = "pycares-4.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba69f8123995aa3df99f6ebc726fc6a4b08e467a957b215c0a82749b901d5eed"},
+    {file = "pycares-4.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32d33c4ffae31d1b544adebe0b9aee2be1fb18aedd3f4f91e41c495ccbafd6d8"},
+    {file = "pycares-4.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:17a060cfc469828abf7f5945964d505bd8c0a756942fee159538f7885169752e"},
+    {file = "pycares-4.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c1d0d5e69fa29e41b590a9dd5842454e8f34e2b928c92540aaf87e0161de8120"},
+    {file = "pycares-4.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f096699c46f5dde2c7a8d91501a36d2d58500f4d63682e2ec14a0fed7cca6402"},
+    {file = "pycares-4.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:429fe2065581a64a5f024f507b5f679bf37ea0ed39c3ba6289dba907e1c8a8f4"},
+    {file = "pycares-4.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9ea2f6d48e64b413b97b41b47392087b452af9bf9f9d4d6d05305a159f45909f"},
+    {file = "pycares-4.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:96d3aecd747a3fcd1e12c1ea1481b0813b4e0e80d40f314db7a86dda5bb1bd94"},
+    {file = "pycares-4.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:32919f6eda7f5ea4df3e64149fc5792b0d455277d23d6d0fc365142062f35d80"},
+    {file = "pycares-4.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:37add862461f9a3fc7ee4dd8b68465812b39456e21cebd5a33c414131ac05060"},
+    {file = "pycares-4.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ed1d050d2c6d74a77c1b6c51fd99426cc000b4202a50d28d6ca75f7433099a6b"},
+    {file = "pycares-4.5.0-cp39-cp39-win32.whl", hash = "sha256:887ac451ffe6e39ee46d3d0989c7bb829933d77e1dad5776511d825fc7e6a25b"},
+    {file = "pycares-4.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:5c8b87c05740595bc8051dc98e51f022f003750e7da90f62f7a9fd50e330b196"},
+    {file = "pycares-4.5.0.tar.gz", hash = "sha256:025b6c2ffea4e9fb8f9a097381c2fecb24aff23fbd6906e70da22ec9ba60e19d"},
 ]
 
 [package.dependencies]
diff --git a/scripts/build.py b/scripts/build.py
index 535fc12..2336972 100644
--- a/scripts/build.py
+++ b/scripts/build.py
@@ -6,7 +6,7 @@
 
 from cffi import FFI
 
-from clean import clean_llama, clean_llama_cpp, clean
+from clean import clean_llama_cpp, clean
 
 
 # if 'PYODIDE' in env and env['PYODIDE'] == '1':
@@ -22,13 +22,13 @@ def clone_llama_cpp():
     subprocess.run(['patch', 'llama.cpp/examples/llava/minicpmv-cli.cpp', 'minicpmv-cli_5.patch'], check=True)
 
 
-def cuda_12_6_setup(*args, **kwargs):
+def cuda_12_6_3_setup(*args, **kwargs):
     #
     # cuda env
     #
-    cuda_file = 'cuda_12.6.0_560.28.03_linux.run'
-    cuda_url = f'https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/{cuda_file}'
-    cuda_output_dir = os.path.abspath('./cuda-12.6')
+    cuda_file = 'cuda_12.6.3_560.35.05_linux.run'
+    cuda_url = f'https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/{cuda_file}'
+    cuda_output_dir = os.path.abspath('./cuda-12.6.3')
     cuda_file_path = os.path.join(cuda_output_dir, cuda_file)
 
     # download cuda file
@@ -181,49 +181,50 @@ def cuda_12_4_1_setup(*args, **kwargs):
 def build_cpu(*args, **kwargs):
     # build static and shared library
     env = os.environ.copy()
-    env['CXXFLAGS'] = '-O3'
+    env['CXXFLAGS'] = '-O3 -DLLAMA_LIB'
     print('build_cpu:')
     pprint(env)
 
-    #
-    # build llama.cpp
-    #
-    subprocess.run([
-        'make',
-        '-C',
-        'llama.cpp',
-        '-j',
-        'llama-cli-static',
-        'GGML_NO_OPENMP=1',
-    ], check=True, env=env)
-
-    #
-    # cffi
-    #
-    ffibuilder = FFI()
-
-    ffibuilder.cdef('''
-        typedef void (*_llama_yield_token_t)(const char * token);
-        typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
-    ''')
-
-    ffibuilder.set_source(
-        '_llama_cli_cpu',
-        '''
-        #include <stdio.h>
-
-        typedef void (*_llama_yield_token_t)(const char * token);
-        typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
-        ''',
-        libraries=['stdc++'],
-        extra_objects=['../llama.cpp/llama_cli.a'],
-        extra_compile_args=['-O3'],
-        extra_link_args=['-O3', '-flto'],
-    )
-
-    ffibuilder.compile(tmpdir='build', verbose=True)
+    for name in ['llama', 'llava', 'minicpmv']:
+        #
+        # build llama.cpp
+        #
+        subprocess.run([
+            'make',
+            '-C',
+            'llama.cpp',
+            '-j',
+            f'{name}-cli-static',
+            'GGML_NO_OPENMP=1',
+        ], check=True, env=env)
+
+        #
+        # cffi
+        #
+        ffibuilder = FFI()
+
+        ffibuilder.cdef(f'''
+            typedef void (*_llama_yield_token_t)(const char * token);
+            typedef int (*_llama_should_stop_t)(void);
+            int _{name}_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
+        ''')
+
+        ffibuilder.set_source(
+            f'_{name}_cli_cpu',
+            f'''
+            #include <stdio.h>
+
+            typedef void (*_llama_yield_token_t)(const char * token);
+            typedef int (*_llama_should_stop_t)(void);
+            int _{name}_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
+            ''',
+            libraries=['stdc++'],
+            extra_objects=[f'../llama.cpp/lib{name}_cli.a'],
+            extra_compile_args=['-O3'],
+            extra_link_args=['-O3', '-flto'],
+        )
+
+        ffibuilder.compile(tmpdir='build', verbose=True)
 
     #
     # copy compiled modules
@@ -241,53 +242,54 @@ def build_cpu(*args, **kwargs):
 def build_vulkan_1_x(*args, **kwargs):
     # build static and shared library
     env = os.environ.copy()
-    env['CXXFLAGS'] = '-O3'
+    env['CXXFLAGS'] = '-O3 -DLLAMA_LIB'
     print('build_vulkan_1_x:')
     pprint(env)
 
-    #
-    # build llama.cpp
-    #
-    subprocess.run([
-        'make',
-        '-C',
-        'llama.cpp',
-        '-j',
-        'llama-cli-static',
-        'GGML_NO_OPENMP=1',
-        'GGML_VULKAN=1',
-    ], check=True, env=env)
-
-    #
-    # cffi
-    #
-    ffibuilder = FFI()
-
-    ffibuilder.cdef('''
-        typedef void (*_llama_yield_token_t)(const char * token);
-        typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
-    ''')
-
-    ffibuilder.set_source(
-        '_llama_cli_vulkan_1_x',
-        '''
-        #include <stdio.h>
-
-        typedef void (*_llama_yield_token_t)(const char * token);
-        typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
-        ''',
-        libraries=[
-            'stdc++',
-            'vulkan',
-        ],
-        extra_objects=['../llama.cpp/llama_cli.a'],
-        extra_compile_args=['-O3'],
-        extra_link_args=['-O3', '-flto'],
-    )
-
-    ffibuilder.compile(tmpdir='build', verbose=True)
+    for name in ['llama', 'llava', 'minicpmv']:
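+        # build llama.cpp with GGML_VULKAN=1: `make {name}-cli-static` (a target
+        # added by Makefile_5.patch) is expected to produce
+        # llama.cpp/lib{name}_cli.a, which the cffi step below links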
+        subprocess.run([
+            'make',
+            '-C',
+            'llama.cpp',
+            '-j',
+            f'{name}-cli-static',
+            'GGML_NO_OPENMP=1',
+            'GGML_VULKAN=1',
+        ], check=True, env=env)
+
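+        # cffi: a new FFI() per CLI declares _{name}_cli_main and compiles the
+        # extension module _{name}_cli_vulkan_1_x into build/, linking in
+        # lib{name}_cli.a plus the system vulkan library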
+        ffibuilder = FFI()
+
+        ffibuilder.cdef(f'''
+            typedef void (*_llama_yield_token_t)(const char * token);
+            typedef int (*_llama_should_stop_t)(void);
+            int _{name}_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
+        ''')
+
+        ffibuilder.set_source(
+            f'_{name}_cli_vulkan_1_x',
+            f'''
+            #include <stdio.h>
+
+            typedef void (*_llama_yield_token_t)(const char * token);
+            typedef int (*_llama_should_stop_t)(void);
+            int _{name}_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
+            ''',
+            libraries=[
+                'stdc++',
+                'vulkan',
+            ],
+            extra_objects=[f'../llama.cpp/lib{name}_cli.a'],
+            extra_compile_args=['-O3'],
+            extra_link_args=['-O3', '-flto'],
+        )
+
+        ffibuilder.compile(tmpdir='build', verbose=True)
 
     #
     # copy compiled modules
@@ -302,85 +304,24 @@ def build_vulkan_1_x(*args, **kwargs):
         shutil.move(file, 'llama/')
 
 
-def build_cpu_openblas(*args, **kwargs):
-    # build static and shared library
-    env = os.environ.copy()
-    env['CXXFLAGS'] = '-O3'
-    print('build_cpu_openblas:')
-    pprint(env)
-
-    #
-    # build llama.cpp
-    #
-    subprocess.run([
-        'make',
-        '-C',
-        'llama.cpp',
-        '-j',
-        'llama-cli-static',
-        'GGML_NO_OPENMP=1',
-        'GGML_OPENBLAS=1',
-    ], check=True, env=env)
-
-    #
-    # cffi
-    #
-    ffibuilder = FFI()
-
-    ffibuilder.cdef('''
-        typedef void (*_llama_yield_token_t)(const char * token);
-        typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
-    ''')
-
-    ffibuilder.set_source(
-        '_llama_cli_cpu_openblas',
-        '''
-        #include <stdio.h>
-
-        typedef void (*_llama_yield_token_t)(const char * token);
-        typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
-        ''',
-        libraries=[
-            'stdc++',
-            'openblas',
-        ],
-        extra_objects=['../llama.cpp/llama_cli.a'],
-        extra_compile_args=['-O3'],
-        extra_link_args=['-O3', '-flto'],
-    )
-
-    ffibuilder.compile(tmpdir='build', verbose=True)
-
-    #
-    # copy compiled modules
-    #
-    for file in glob.glob('build/*.so') + glob.glob('llama.cpp/*.so'):
-        shutil.move(file, 'llama/')
-
-    for file in glob.glob('build/*.dll') + glob.glob('llama.cpp/*.dll'):
-        shutil.move(file, 'llama/')
-
-    for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'):
-        shutil.move(file, 'llama/')
-
-
-def build_linux_cuda_12_6(*args, **kwargs):
+def build_linux_cuda_12_6_3(*args, **kwargs):
     # build static and shared library
     env = os.environ.copy()
 
     #
     # cuda env
     #
-    cuda_output_dir = cuda_12_6_setup()
+    cuda_output_dir = cuda_12_6_3_setup()
 
     env['PATH'] =  f'{cuda_output_dir}/dist/bin:{env["PATH"]}'
     env['CUDA_PATH'] = f'{cuda_output_dir}/dist'
+    env['CC'] = 'gcc-13'
+    env['CXX'] = 'g++-13'
+    env['NVCC_PREPEND_FLAGS'] = '-ccbin /usr/bin/g++-13'
     env['CUDA_DOCKER_ARCH'] = 'compute_61'
-    env['CXXFLAGS'] = '-O3'
-    env['LD_LIBRARY_PATH'] = '/project/cuda-12.6/dist/lib64:/project/cuda-12.6/dist/targets/x86_64-linux/lib:/project/cuda-12.6/dist/lib64/stubs:$LD_LIBRARY_PATH'
-    env['CUDA_HOME'] = '/project/cuda-12.6/dist'
+    env['CXXFLAGS'] = '-O3 -DLLAMA_LIB'
+    env['LD_LIBRARY_PATH'] = ':'.join(filter(None, ['/project/cuda-12.6.3/dist/lib64', '/project/cuda-12.6.3/dist/targets/x86_64-linux/lib', '/project/cuda-12.6.3/dist/lib64/stubs', env.get('LD_LIBRARY_PATH')]))  # '$LD_LIBRARY_PATH' is never shell-expanded in a Python string; append the inherited value explicitly
+    env['CUDA_HOME'] = '/project/cuda-12.6.3/dist'
     env['NVCCFLAGS'] = '\
             -gencode arch=compute_70,code=sm_70 \
             -gencode arch=compute_75,code=sm_75 \
@@ -389,61 +330,62 @@ def build_linux_cuda_12_6(*args, **kwargs):
             -gencode arch=compute_89,code=sm_89 \
             -gencode arch=compute_90,code=sm_90'
 
-    print('build_linux_cuda_12_6:')
+    print('build_linux_cuda_12_6_3:')
     pprint(env)
 
-    #
-    # build llama.cpp
-    #
-    subprocess.run([
-        'make',
-        '-C',
-        'llama.cpp',
-        '-j',
-        'llama-cli-static',
-        'GGML_NO_OPENMP=1',
-        'GGML_CUDA=1',
-    ], check=True, env=env)
-
-    #
-    # cffi
-    #
-    ffibuilder = FFI()
-
-    ffibuilder.cdef('''
-        typedef void (*_llama_yield_token_t)(const char * token);
-        typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
-    ''')
-
-    ffibuilder.set_source(
-        '_llama_cli_cuda_12_6',
-        '''
-        #include <stdio.h>
-
-        typedef void (*_llama_yield_token_t)(const char * token);
-        typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
-        ''',
-        libraries=[
-            'stdc++',
-            'cuda',
-            'cublas',
-            'culibos',
-            'cudart',
-            'cublasLt',
-        ],
-        library_dirs=[
-            f'{cuda_output_dir}/dist/lib64',
-            f'{cuda_output_dir}/dist/targets/x86_64-linux/lib',
-            f'{cuda_output_dir}/dist/lib64/stubs',
-        ],
-        extra_objects=['../llama.cpp/llama_cli.a'],
-        extra_compile_args=['-O3'],
-        extra_link_args=['-O3', '-flto'],
-    )
-
-    ffibuilder.compile(tmpdir='build', verbose=True)
+    for name in ['llama', 'llava', 'minicpmv']:
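+        # build llama.cpp with GGML_CUDA=1: `make {name}-cli-static` (a target
+        # added by Makefile_5.patch) is expected to produce
+        # llama.cpp/lib{name}_cli.a, which the cffi step below links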
+        subprocess.run([
+            'make',
+            '-C',
+            'llama.cpp',
+            '-j',
+            f'{name}-cli-static',
+            'GGML_NO_OPENMP=1',
+            'GGML_CUDA=1',
+        ], check=True, env=env)
+
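+        # cffi: a new FFI() per CLI declares _{name}_cli_main and compiles the
+        # extension module _{name}_cli_cuda_12_6_3 into build/, linking in
+        # lib{name}_cli.a plus the CUDA libraries found under cuda_output_dir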
+        ffibuilder = FFI()
+
+        ffibuilder.cdef(f'''
+            typedef void (*_llama_yield_token_t)(const char * token);
+            typedef int (*_llama_should_stop_t)(void);
+            int _{name}_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
+        ''')
+
+        ffibuilder.set_source(
+            f'_{name}_cli_cuda_12_6_3',
+            f'''
+            #include <stdio.h>
+
+            typedef void (*_llama_yield_token_t)(const char * token);
+            typedef int (*_llama_should_stop_t)(void);
+            int _{name}_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
+            ''',
+            libraries=[
+                'stdc++',
+                'cuda',
+                'cublas',
+                'culibos',
+                'cudart',
+                'cublasLt',
+            ],
+            library_dirs=[
+                f'{cuda_output_dir}/dist/lib64',
+                f'{cuda_output_dir}/dist/targets/x86_64-linux/lib',
+                f'{cuda_output_dir}/dist/lib64/stubs',
+            ],
+            extra_objects=[f'../llama.cpp/lib{name}_cli.a'],
+            extra_compile_args=['-O3'],
+            extra_link_args=['-O3', '-flto'],
+        )
+
+        ffibuilder.compile(tmpdir='build', verbose=True)
 
     #
     # copy compiled modules
@@ -470,7 +412,7 @@ def build_linux_cuda_12_5_1(*args, **kwargs):
     env['PATH'] =  f'{cuda_output_dir}/dist/bin:{env["PATH"]}'
     env['CUDA_PATH'] = f'{cuda_output_dir}/dist'
     env['CUDA_DOCKER_ARCH'] = 'compute_61'
-    env['CXXFLAGS'] = '-O3'
-    env['LD_LIBRARY_PATH'] = '/project/cuda-12.5.1/dist/lib64:/project/cuda-12.5.1/dist/targets/x86_64-linux/lib:/project/cuda-12.5.1/dist/lib64/stubs:$LD_LIBRARY_PATH'
+    env['CXXFLAGS'] = '-O3 -DLLAMA_LIB'
+    env['LD_LIBRARY_PATH'] = ':'.join(filter(None, ['/project/cuda-12.5.1/dist/lib64', '/project/cuda-12.5.1/dist/targets/x86_64-linux/lib', '/project/cuda-12.5.1/dist/lib64/stubs', env.get('LD_LIBRARY_PATH')]))  # '$LD_LIBRARY_PATH' is never shell-expanded in a Python string; append the inherited value explicitly
     env['CUDA_HOME'] = '/project/cuda-12.5.1/dist'
     env['NVCCFLAGS'] = '\
@@ -505,7 +447,7 @@ def build_linux_cuda_12_5_1(*args, **kwargs):
     ffibuilder.cdef('''
         typedef void (*_llama_yield_token_t)(const char * token);
         typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
+        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
     ''')
 
     ffibuilder.set_source(
@@ -515,7 +457,7 @@ def build_linux_cuda_12_5_1(*args, **kwargs):
 
         typedef void (*_llama_yield_token_t)(const char * token);
         typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
+        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
         ''',
         libraries=[
             'stdc++',
@@ -530,7 +472,7 @@ def build_linux_cuda_12_5_1(*args, **kwargs):
             f'{cuda_output_dir}/dist/targets/x86_64-linux/lib',
             f'{cuda_output_dir}/dist/lib64/stubs',
         ],
-        extra_objects=['../llama.cpp/llama_cli.a'],
+        extra_objects=['../llama.cpp/libllama_cli.a'],
         extra_compile_args=['-O3'],
         extra_link_args=['-O3', '-flto'],
     )
@@ -562,7 +504,7 @@ def build_linux_cuda_12_4_1(*args, **kwargs):
     env['PATH'] =  f'{cuda_output_dir}/dist/bin:{env["PATH"]}'
     env['CUDA_PATH'] = f'{cuda_output_dir}/dist'
     env['CUDA_DOCKER_ARCH'] = 'compute_61'
-    env['CXXFLAGS'] = '-O3'
-    env['LD_LIBRARY_PATH'] = '/project/cuda-12.4.1/dist/lib64:/project/cuda-12.4.1/dist/targets/x86_64-linux/lib:/project/cuda-12.4.1/dist/lib64/stubs:$LD_LIBRARY_PATH'
+    env['CXXFLAGS'] = '-O3 -DLLAMA_LIB'
+    env['LD_LIBRARY_PATH'] = ':'.join(filter(None, ['/project/cuda-12.4.1/dist/lib64', '/project/cuda-12.4.1/dist/targets/x86_64-linux/lib', '/project/cuda-12.4.1/dist/lib64/stubs', env.get('LD_LIBRARY_PATH')]))  # '$LD_LIBRARY_PATH' is never shell-expanded in a Python string; append the inherited value explicitly
     env['CUDA_HOME'] = '/project/cuda-12.4.1/dist'
     env['NVCCFLAGS'] = '\
@@ -597,7 +539,7 @@ def build_linux_cuda_12_4_1(*args, **kwargs):
     ffibuilder.cdef('''
         typedef void (*_llama_yield_token_t)(const char * token);
         typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
+        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
     ''')
 
     ffibuilder.set_source(
@@ -607,7 +549,7 @@ def build_linux_cuda_12_4_1(*args, **kwargs):
 
         typedef void (*_llama_yield_token_t)(const char * token);
         typedef int (*_llama_should_stop_t)(void);
-        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
+        int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop);
         ''',
         libraries=[
             'stdc++',
@@ -622,7 +564,7 @@ def build_linux_cuda_12_4_1(*args, **kwargs):
             f'{cuda_output_dir}/dist/targets/x86_64-linux/lib',
             f'{cuda_output_dir}/dist/lib64/stubs',
         ],
-        extra_objects=['../llama.cpp/llama_cli.a'],
+        extra_objects=['../llama.cpp/libllama_cli.a'],
         extra_compile_args=['-O3'],
         extra_link_args=['-O3', '-flto'],
     )
@@ -654,21 +596,16 @@ def build(*args, **kwargs):
         clean_llama_cpp()
         build_cpu(*args, **kwargs)
 
-    # # openblas
-    # if env.get('GGML_OPENBLAS', '1') != '0':
-    #     clean_llama_cpp()
-    #     build_cpu_openblas(*args, **kwargs)
-
     # vulkan 1.x
     if env.get('GGML_VULKAN', '1') != '0' and env.get('AUDITWHEEL_ARCH') in ('x86_64', None):
         clean_llama_cpp()
         build_vulkan_1_x(*args, **kwargs)
 
-    # cuda 12.6
-    if env.get('GGML_CUDA', '1') != '0':
-        if env.get('AUDITWHEEL_POLICY') in ('manylinux2014', 'manylinux_2_28', None) and env.get('AUDITWHEEL_ARCH') in ('x86_64', None):
-            clean_llama_cpp()
-            build_linux_cuda_12_6(*args, **kwargs)
+    # # cuda 12.6.3 -- NOTE(review): commented out, so build() no longer builds this CUDA variant; confirm intentional
+    # if env.get('GGML_CUDA', '1') != '0':
+    #     if env.get('AUDITWHEEL_POLICY') in ('manylinux2014', 'manylinux_2_28', None) and env.get('AUDITWHEEL_ARCH') in ('x86_64', None):
+    #         clean_llama_cpp()
+    #         build_linux_cuda_12_6_3(*args, **kwargs)
 
     # # cuda 12.5.1
     # if env.get('GGML_CUDA', '1') != '0':
@@ -682,5 +619,6 @@ def build(*args, **kwargs):
     #         clean_llama_cpp()
     #         build_linux_cuda_12_4_1(*args, **kwargs)
 
+
 if __name__ == '__main__':
     build()