From 7bdb22d4cf4c46dd5bbebc04679041632157d368 Mon Sep 17 00:00:00 2001
From: jhen
Date: Fri, 23 Feb 2024 19:17:09 +0800
Subject: [PATCH] feat: sync llama.cpp

---
 cpp/ggml-impl.h          | 27 ++++++++++----
 cpp/ggml-quants.c        | 65 ++++++++++++++++++++++----------
 cpp/ggml.c               |  6 +--
 cpp/ggml.h               |  6 ---
 cpp/llama.cpp            | 81 +++++++++++++++++++++++++++++++++-------
 example/ios/Podfile.lock |  4 +-
 llama.cpp                |  2 +-
 scripts/llama.cpp.patch  |  6 +--
 8 files changed, 142 insertions(+), 55 deletions(-)

diff --git a/cpp/ggml-impl.h b/cpp/ggml-impl.h
index 42bd0c5..eb7344d 100644
--- a/cpp/ggml-impl.h
+++ b/cpp/ggml-impl.h
@@ -53,11 +53,23 @@ extern "C" {
 //
 #include <arm_neon.h>

-#define LM_GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define LM_GGML_COMPUTE_FP32_TO_FP16(x) (x)
+#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
+#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
+
+#define LM_GGML_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
+
+static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+    __fp16 tmp;
+    memcpy(&tmp, &h, sizeof(lm_ggml_fp16_t));
+    return (float)tmp;
+}

-#define LM_GGML_FP16_TO_FP32(x) ((float) (x))
-#define LM_GGML_FP32_TO_FP16(x) (x)
+static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
+    lm_ggml_fp16_t res;
+    __fp16 tmp = f;
+    memcpy(&res, &tmp, sizeof(lm_ggml_fp16_t));
+    return res;
+}

 #else

@@ -214,8 +226,7 @@ extern float lm_ggml_table_f32_f16[1 << 16];
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into lm_ggml_lookup_fp16_to_fp32,
 // so we define LM_GGML_FP16_TO_FP32 and LM_GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
-#if !defined(LM_GGML_FP16_TO_FP32) || !defined(LM_GGML_FP32_TO_FP16)
-
+#if !defined(LM_GGML_FP16_TO_FP32)
 inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) {
     uint16_t s;
     memcpy(&s, &f, sizeof(uint16_t));
@@ -223,8 +234,10 @@ inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) {
 }

 #define LM_GGML_FP16_TO_FP32(x) lm_ggml_lookup_fp16_to_fp32(x)
-#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
+#endif

+#if !defined(LM_GGML_FP32_TO_FP16)
+#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
 #endif

 #define LM_GGML_HASHTABLE_FULL ((size_t)-1)
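Note on the ggml-impl.h change above: on ARM, lm_ggml_fp16_t is now stored as a plain 16-bit integer and converted through the compiler's __fp16 type instead of being cast directly. A minimal sketch of that round trip, assuming an AArch64 toolchain with __fp16 support (the standalone names and main() harness are illustrative, not part of the patch):

    // sketch: round-trip a float through bit-level fp16 helpers like the ones added above
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint16_t fp16_t; // stands in for lm_ggml_fp16_t

    static inline float fp16_to_fp32(fp16_t h) {
        __fp16 tmp;
        memcpy(&tmp, &h, sizeof(h)); // reinterpret the stored bits as an IEEE half
        return (float) tmp;
    }

    static inline fp16_t fp32_to_fp16(float f) {
        __fp16 tmp = f;              // let the compiler round to half precision
        fp16_t res;
        memcpy(&res, &tmp, sizeof(res));
        return res;
    }

    int main(void) {
        float x = 3.14159f;
        printf("%f -> 0x%04x -> %f\n", x, fp32_to_fp16(x), fp16_to_fp32(fp32_to_fp16(x)));
        return 0;
    }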
diff --git a/cpp/ggml-quants.c b/cpp/ggml-quants.c
index 5dd07d6..babc181 100644
--- a/cpp/ggml-quants.c
+++ b/cpp/ggml-quants.c
@@ -438,6 +438,30 @@ inline static lm_ggml_int8x16x4_t lm_ggml_vld1q_s8_x4(const int8_t * ptr) {
     return res;
 }

+// NOTE: not tested
+inline static int8x16_t lm_ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
 #else

 #define lm_ggml_int16x8x2_t int16x8x2_t
@@ -451,6 +475,7 @@ inline static lm_ggml_int8x16x4_t lm_ggml_vld1q_s8_x4(const int8_t * ptr) {
 #define lm_ggml_vld1q_u8_x4 vld1q_u8_x4
 #define lm_ggml_vld1q_s8_x2 vld1q_s8_x2
 #define lm_ggml_vld1q_s8_x4 vld1q_s8_x4
+#define lm_ggml_vqtbl1q_s8  vqtbl1q_s8

 #endif

@@ -5629,8 +5654,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin);

         const uint8_t * restrict q2 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
@@ -5779,8 +5804,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin);

         const uint8_t * restrict q2 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
@@ -6433,7 +6458,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
         int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] +
                            scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);

         const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
         q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
@@ -6635,7 +6660,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
         int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] +
                            scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);

         vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);

@@ -7138,9 +7163,9 @@ void lm_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void
         aux16[1] = (a[0] >> 4) & 0x0f0f;

         const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
-        sum_mins += y[i].d * (float)x[i].d[1] * summi;
+        sum_mins += y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[1]) * summi;

-        const float d = y[i].d * (float)x[i].d[0];
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[0]);

         const lm_ggml_uint8x16x2_t q4bits = lm_ggml_vld1q_u8_x2(q4);
@@ -7798,7 +7823,7 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);
         const int8_t * sc = x[i].scales;

         const uint8_t * restrict q5 = x[i].qs;
@@ -7940,7 +7965,7 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);
         const int8_t * sc = x[i].scales;

         const uint8_t * restrict q5 = x[i].qs;
@@ -8508,7 +8533,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d_all = (float)x[i].d;
+        const float d_all = LM_GGML_FP16_TO_FP32(x[i].d);

         const uint8_t * restrict q6 = x[i].ql;
         const uint8_t * restrict qh = x[i].qh;
@@ -8679,7 +8704,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d_all = (float)x[i].d;
+        const float d_all = LM_GGML_FP16_TO_FP32(x[i].d);

         const uint8_t * restrict q6 = x[i].ql;
         const uint8_t * restrict qh = x[i].qh;
@@ -9333,7 +9358,7 @@ void lm_ggml_vec_dot_iq1_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs,
     uint16_t gindex[8];
     uint16x8x2_t vindex;
     int8x16x4_t q1b;
-    int8x16x4_t q8b;
+    lm_ggml_int8x16x4_t q8b;
     uint16x8x4_t scales;
     int32x4x2_t sumi;
     int32x4x2_t dotq;
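The K-quant dot-product hunks above all make the same substitution: block scales stored as fp16 can no longer be read with a bare (float) cast, so they go through LM_GGML_FP16_TO_FP32. A scalar illustration of the q2_K-style scale expansion, assuming ggml-impl.h is included (the struct and helper below are illustrative only, not the real block layout):

    // illustrative only: how a per-block fp16 scale is expanded before the integer dot product
    typedef struct {
        uint16_t d;    // fp16 super-block scale, stored as raw bits
        uint16_t dmin; // fp16 super-block min, stored as raw bits
        // ... quantized weights follow in the real block_q2_K
    } block_scales;

    static inline float decode_scale(float y_d, uint16_t x_d_bits) {
        // was: y_d * (float) x_d  -- that only worked while lm_ggml_fp16_t was a native __fp16
        return y_d * LM_GGML_FP16_TO_FP32(x_d_bits);
    }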
@@ -9498,7 +9523,6 @@ void lm_ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const voi
     float sumf = 0;

     for (int ib = 0; ib < nb; ib += 2) {
-
         q4bits.val[0] = vld1q_u8(x[ib+0].qs);
         q4bits.val[1] = vld1q_u8(x[ib+1].qs);
         q8b.val[0]   = vld1q_s8(y[ib+0].qs);
@@ -9506,16 +9530,17 @@ void lm_ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const voi
         q8b.val[2]   = vld1q_s8(y[ib+1].qs);
         q8b.val[3]   = vld1q_s8(y[ib+1].qs + 16);

-        q4b.val[0] = vqtbl1q_s8(values, vandq_u8(q4bits.val[0], m4b));
-        q4b.val[1] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
-        q4b.val[2] = vqtbl1q_s8(values, vandq_u8(q4bits.val[1], m4b));
-        q4b.val[3] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+        q4b.val[0] = lm_ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
+        q4b.val[1] = lm_ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+        q4b.val[2] = lm_ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
+        q4b.val[3] = lm_ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));

         prod_1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
         prod_2 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

-        sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);
-
+        sumf +=
+            LM_GGML_FP16_TO_FP32(x[ib+0].d) * LM_GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
+            LM_GGML_FP16_TO_FP32(x[ib+1].d) * LM_GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
     }

     *s = sumf;
diff --git a/cpp/ggml.c b/cpp/ggml.c
index 1300352..94838f4 100644
--- a/cpp/ggml.c
+++ b/cpp/ggml.c
@@ -323,7 +323,7 @@ float lm_ggml_table_f32_f16[1 << 16];
 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
 float lm_ggml_fp16_to_fp32(lm_ggml_fp16_t x) {
-    return (float) LM_GGML_FP16_TO_FP32(x);
+    return LM_GGML_FP16_TO_FP32(x);
 }

 lm_ggml_fp16_t lm_ggml_fp32_to_fp16(float x) {
@@ -798,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) {
     #define LM_GGML_F16x8              float16x8_t
     #define LM_GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
     #define LM_GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-    #define LM_GGML_F16x8_LOAD         vld1q_f16
+    #define LM_GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
     #define LM_GGML_F16x8_STORE        vst1q_f16
     #define LM_GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
     #define LM_GGML_F16x8_ADD          vaddq_f16
@@ -841,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) {
     #define LM_GGML_F32Cx4              float32x4_t
     #define LM_GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
     #define LM_GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-    #define LM_GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16(x))
+    #define LM_GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
     #define LM_GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
     #define LM_GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
     #define LM_GGML_F32Cx4_ADD          vaddq_f32
diff --git a/cpp/ggml.h b/cpp/ggml.h
index cb359a9..1f9d6a8 100644
--- a/cpp/ggml.h
+++ b/cpp/ggml.h
@@ -315,13 +315,7 @@ extern "C" {
 #endif

-#if defined(__ARM_NEON) && defined(__CUDACC__)
-    typedef half lm_ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-    typedef __fp16 lm_ggml_fp16_t;
-#else
     typedef uint16_t lm_ggml_fp16_t;
-#endif

     // convert FP16 <-> FP32
     LM_GGML_API float lm_ggml_fp16_to_fp32(lm_ggml_fp16_t x);
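With the ggml.h change above, lm_ggml_fp16_t is always a plain uint16_t in the public header, so callers outside ggml.c must convert through the API rather than casting. A minimal sketch, assuming this tree's ggml.h is on the include path:

    // sketch: converting values through the public API now that lm_ggml_fp16_t is a bare uint16_t
    #include "ggml.h"

    void fill_fp16(lm_ggml_fp16_t * dst, const float * src, int n) {
        for (int i = 0; i < n; ++i) {
            dst[i] = lm_ggml_fp32_to_fp16(src[i]); // a (lm_ggml_fp16_t) cast would just truncate the integer
        }
    }

    float read_fp16(const lm_ggml_fp16_t * src, int i) {
        return lm_ggml_fp16_to_fp32(src[i]);
    }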
"blk.%d.ffn_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, @@ -4065,7 +4064,12 @@ static bool llm_load_tensors( // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); + + // same as tok_embd, duplicated to allow offloading + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += lm_ggml_nbytes(model.output); } for (int i = 0; i < n_layer; ++i) { @@ -4074,14 +4078,23 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); - layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false); // AWQ ScaleActivation layer layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); @@ -6182,7 +6195,7 @@ struct llm_build_context { attn_norm = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, - NULL, + model.layers[il].attn_norm_b, LLM_NORM, cb, il); cb(attn_norm, "attn_norm", il); @@ -6193,6 +6206,11 @@ struct llm_build_context { cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); + if (model.layers[il].bqkv){ + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + if (hparams.f_clamp_kqv > 0.0f) { cur = lm_ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(cur, "wqkv_clamped", il); @@ -6209,7 +6227,7 @@ struct llm_build_context { Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, NULL, + model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6222,13 
@@ -6222,13 +6240,13 @@ struct llm_build_context {
             {
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                         model.layers[il].ffn_norm,
-                        NULL,
+                        model.layers[il].ffn_norm_b,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                         NULL,                      NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -6245,7 +6263,7 @@ struct llm_build_context {
         cur = llm_build_norm(ctx0, cur, hparams,
                 model.output_norm,
-                NULL,
+                model.output_norm_b,
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);
@@ -7445,6 +7463,7 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
+
         inpL = lm_ggml_scale(ctx0, inpL, sqrtf(n_embd));
         cb(inpL, "inp_scaled", -1);
@@ -7486,6 +7505,7 @@ struct llm_build_context {
                         n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);
+
                 Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
                 cb(Qcur, "Qcur_scaled", il);
@@ -7500,6 +7520,7 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
+
             struct lm_ggml_tensor * sa_out = lm_ggml_add(ctx0, cur, inpL);
             cb(sa_out, "sa_out", il);
@@ -10490,7 +10511,10 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type
         return std::make_pair(i_layer, n_layer);
     };

-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = LM_GGML_TYPE_Q8_0;
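The get_k_quant_type change above covers architectures whose tensor map has no LLM_TENSOR_OUTPUT entry: their logits come straight from token_embd.weight (as in the loader change earlier), so that tensor is quantized like an output tensor. A reduced sketch of the check, assuming the LLM_TENSOR_NAMES table from llama.cpp (the helper name is hypothetical):

    // reduced sketch of the rule added above (hypothetical helper, C++)
    static bool output_shares_tok_embd(llm_arch arch) {
        const auto & names = LLM_TENSOR_NAMES.at(arch);
        return names.find(LLM_TENSOR_OUTPUT) == names.end();
    }
    // if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
    //     (output_shares_tok_embd(arch) && name == "token_embd.weight")) { ... }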
"model" : message->role; + ss << "" << role << "\n"; + if (!system_prompt.empty() && role != "model") { + ss << system_prompt << "\n\n"; + system_prompt = ""; + } + ss << trim(message->content) << "\n"; + } + if (add_ass) { + ss << "model\n"; + } } else { // template not supported return -1; diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 43a998f..bbcc719 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -8,7 +8,7 @@ PODS: - hermes-engine/Pre-built (= 0.72.3) - hermes-engine/Pre-built (0.72.3) - libevent (2.1.12) - - llama-rn (0.3.0-rc.14): + - llama-rn (0.3.0-rc.15): - RCT-Folly - RCTRequired - RCTTypeSafety @@ -1261,7 +1261,7 @@ SPEC CHECKSUMS: glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322 libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913 - llama-rn: 26d2d3c08a3e788889a4833c4678ea9ccbbb1f33 + llama-rn: 05393c3a05d9992952d3f2f96452b00dc145e06e RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1 RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18 RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3 diff --git a/llama.cpp b/llama.cpp index 973053d..15499eb 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 973053d8b0d04809836b3339a50f68d9c842de90 +Subproject commit 15499eb94227401bdc8875da6eb85c15d37068f7 diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index 51f29d5..8c5627a 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,5 +1,5 @@ ---- llama.cpp.orig 2024-02-22 12:19:15 -+++ llama.cpp 2024-02-22 12:19:17 +--- llama.cpp.orig 2024-02-23 19:15:45 ++++ llama.cpp 2024-02-23 19:15:46 @@ -114,6 +114,17 @@ #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) @@ -18,7 +18,7 @@ // // helpers // -@@ -1068,16 +1079,16 @@ +@@ -1067,16 +1078,16 @@ if (prefetch > 0) { // advise the kernel to preload the mapped memory