Skip to content

Commit

Permalink
feat: Update llama.cpp
Browse files Browse the repository at this point in the history
  • Loading branch information
abetlen committed Feb 27, 2024
1 parent 4d574bd commit fea33c9
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 1 deletion.
5 changes: 5 additions & 0 deletions llama_cpp/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
# LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors

# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
# };
Expand Down Expand Up @@ -295,6 +296,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
LLAMA_FTYPE_MOSTLY_IQ3_M = 27
LLAMA_FTYPE_MOSTLY_IQ2_S = 28
LLAMA_FTYPE_MOSTLY_IQ2_M = 29
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30
LLAMA_FTYPE_GUESSED = 1024

# enum llama_rope_scaling_type {
Expand Down Expand Up @@ -548,6 +550,7 @@ class llama_model_params(ctypes.Structure):
# float yarn_beta_fast; // YaRN low correction dim
# float yarn_beta_slow; // YaRN high correction dim
# uint32_t yarn_orig_ctx; // YaRN original context size
# float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)

# ggml_backend_sched_eval_callback cb_eval;
# void * cb_eval_user_data;
Expand Down Expand Up @@ -580,6 +583,7 @@ class llama_context_params(ctypes.Structure):
yarn_beta_fast (float): YaRN low correction dim
yarn_beta_slow (float): YaRN high correction dim
yarn_orig_ctx (int): YaRN original context size
defrag_thold (float): defragment the KV cache if holes/size > thold, < 0 disabled (default)
cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval
        cb_eval_user_data (ctypes.c_void_p): user data for cb_eval
type_k (int): data type for K cache
Expand All @@ -605,6 +609,7 @@ class llama_context_params(ctypes.Structure):
("yarn_beta_fast", ctypes.c_float),
("yarn_beta_slow", ctypes.c_float),
("yarn_orig_ctx", ctypes.c_uint32),
("defrag_thold", ctypes.c_float),
("cb_eval", ggml_backend_sched_eval_callback),
("cb_eval_user_data", ctypes.c_void_p),
("type_k", ctypes.c_int),
Expand Down

0 comments on commit fea33c9

Please sign in to comment.