From 7bdb22d4cf4c46dd5bbebc04679041632157d368 Mon Sep 17 00:00:00 2001
From: jhen
Date: Fri, 23 Feb 2024 19:17:09 +0800
Subject: [PATCH] feat: sync llama.cpp

---
 cpp/ggml-impl.h          | 27 ++++++++++----
 cpp/ggml-quants.c        | 65 ++++++++++++++++++++++----------
 cpp/ggml.c               |  6 +--
 cpp/ggml.h               |  6 ---
 cpp/llama.cpp            | 81 +++++++++++++++++++++++++++++++++-------
 example/ios/Podfile.lock |  4 +-
 llama.cpp                |  2 +-
 scripts/llama.cpp.patch  |  6 +--
 8 files changed, 142 insertions(+), 55 deletions(-)

diff --git a/cpp/ggml-impl.h b/cpp/ggml-impl.h
index 42bd0c5..eb7344d 100644
--- a/cpp/ggml-impl.h
+++ b/cpp/ggml-impl.h
@@ -53,11 +53,23 @@ extern "C" {
 //
 #include <arm_neon.h>

-#define LM_GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define LM_GGML_COMPUTE_FP32_TO_FP16(x) (x)
+#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
+#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
+
+#define LM_GGML_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
+
+static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
+    __fp16 tmp;
+    memcpy(&tmp, &h, sizeof(lm_ggml_fp16_t));
+    return (float)tmp;
+}

-#define LM_GGML_FP16_TO_FP32(x) ((float) (x))
-#define LM_GGML_FP32_TO_FP16(x) (x)
+static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
+    lm_ggml_fp16_t res;
+    __fp16 tmp = f;
+    memcpy(&res, &tmp, sizeof(lm_ggml_fp16_t));
+    return res;
+}

 #else

@@ -214,8 +226,7 @@ extern float lm_ggml_table_f32_f16[1 << 16];
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into lm_ggml_lookup_fp16_to_fp32,
 // so we define LM_GGML_FP16_TO_FP32 and LM_GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
-#if !defined(LM_GGML_FP16_TO_FP32) || !defined(LM_GGML_FP32_TO_FP16)
-
+#if !defined(LM_GGML_FP16_TO_FP32)
 inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) {
     uint16_t s;
     memcpy(&s, &f, sizeof(uint16_t));
@@ -223,8 +234,10 @@ inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) {
 }

 #define LM_GGML_FP16_TO_FP32(x) lm_ggml_lookup_fp16_to_fp32(x)
-#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
+#endif

+#if !defined(LM_GGML_FP32_TO_FP16)
+#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
 #endif

 #define LM_GGML_HASHTABLE_FULL ((size_t)-1)
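Note on the ggml-impl.h change above: on ARM, lm_ggml_fp16_t is now stored as a plain 16-bit integer and converted through the compiler's __fp16 type instead of being cast directly. A minimal sketch of that round trip, assuming an AArch64 toolchain with __fp16 support (the standalone names and main() harness are illustrative, not part of the patch):

    // sketch: round-trip a float through bit-level fp16 helpers like the ones added above
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint16_t fp16_t; // stands in for lm_ggml_fp16_t

    static inline float fp16_to_fp32(fp16_t h) {
        __fp16 tmp;
        memcpy(&tmp, &h, sizeof(h)); // reinterpret the stored bits as an IEEE half
        return (float) tmp;
    }

    static inline fp16_t fp32_to_fp16(float f) {
        __fp16 tmp = f;              // let the compiler round to half precision
        fp16_t res;
        memcpy(&res, &tmp, sizeof(res));
        return res;
    }

    int main(void) {
        float x = 3.14159f;
        printf("%f -> 0x%04x -> %f\n", x, fp32_to_fp16(x), fp16_to_fp32(fp32_to_fp16(x)));
        return 0;
    }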
diff --git a/cpp/ggml-quants.c b/cpp/ggml-quants.c
index 5dd07d6..babc181 100644
--- a/cpp/ggml-quants.c
+++ b/cpp/ggml-quants.c
@@ -438,6 +438,30 @@ inline static lm_ggml_int8x16x4_t lm_ggml_vld1q_s8_x4(const int8_t * ptr) {
     return res;
 }

+// NOTE: not tested
+inline static int8x16_t lm_ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
 #else

 #define lm_ggml_int16x8x2_t int16x8x2_t
@@ -451,6 +475,7 @@ inline static lm_ggml_int8x16x4_t lm_ggml_vld1q_s8_x4(const int8_t * ptr) {
 #define lm_ggml_vld1q_u8_x4 vld1q_u8_x4
 #define lm_ggml_vld1q_s8_x2 vld1q_s8_x2
 #define lm_ggml_vld1q_s8_x4 vld1q_s8_x4
+#define lm_ggml_vqtbl1q_s8  vqtbl1q_s8

 #endif

@@ -5629,8 +5654,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin);

         const uint8_t * restrict q2 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
@@ -5779,8 +5804,8 @@ void lm_ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin);

         const uint8_t * restrict q2 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
@@ -6433,7 +6458,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
         int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] +
                            scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);

         const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
         q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
@@ -6635,7 +6660,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
         int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] +
                            scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);

         vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);

@@ -7138,9 +7163,9 @@ void lm_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void
         aux16[1] = (a[0] >> 4) & 0x0f0f;

         const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
-        sum_mins += y[i].d * (float)x[i].d[1] * summi;
+        sum_mins += y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[1]) * summi;

-        const float d = y[i].d * (float)x[i].d[0];
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d[0]);

         const lm_ggml_uint8x16x2_t q4bits = lm_ggml_vld1q_u8_x2(q4);
@@ -7798,7 +7823,7 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);
         const int8_t * sc = x[i].scales;

         const uint8_t * restrict q5 = x[i].qs;
@@ -7940,7 +7965,7 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);
         const int8_t * sc = x[i].scales;

         const uint8_t * restrict q5 = x[i].qs;
@@ -8508,7 +8533,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d_all = (float)x[i].d;
+        const float d_all = LM_GGML_FP16_TO_FP32(x[i].d);

         const uint8_t * restrict q6 = x[i].ql;
         const uint8_t * restrict qh = x[i].qh;
@@ -8679,7 +8704,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void

     for (int i = 0; i < nb; ++i) {

-        const float d_all = (float)x[i].d;
+        const float d_all = LM_GGML_FP16_TO_FP32(x[i].d);

         const uint8_t * restrict q6 = x[i].ql;
         const uint8_t * restrict qh = x[i].qh;
@@ -9333,7 +9358,7 @@ void lm_ggml_vec_dot_iq1_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs,
     uint16_t gindex[8];
     uint16x8x2_t vindex;
     int8x16x4_t q1b;
-    int8x16x4_t q8b;
+    lm_ggml_int8x16x4_t q8b;
     uint16x8x4_t scales;
     int32x4x2_t sumi;
     int32x4x2_t dotq;
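The K-quant dot-product hunks above all make the same substitution: block scales stored as fp16 can no longer be read with a bare (float) cast, so they go through LM_GGML_FP16_TO_FP32. A scalar illustration of the q2_K-style scale expansion, assuming ggml-impl.h is included (the struct and helper below are illustrative only, not the real block layout):

    // illustrative only: how a per-block fp16 scale is expanded before the integer dot product
    typedef struct {
        uint16_t d;    // fp16 super-block scale, stored as raw bits
        uint16_t dmin; // fp16 super-block min, stored as raw bits
        // ... quantized weights follow in the real block_q2_K
    } block_scales;

    static inline float decode_scale(float y_d, uint16_t x_d_bits) {
        // was: y_d * (float) x_d  -- that only worked while lm_ggml_fp16_t was a native __fp16
        return y_d * LM_GGML_FP16_TO_FP32(x_d_bits);
    }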
@@ -9498,7 +9523,6 @@ void lm_ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const voi
     float sumf = 0;

     for (int ib = 0; ib < nb; ib += 2) {
-
         q4bits.val[0] = vld1q_u8(x[ib+0].qs);
         q4bits.val[1] = vld1q_u8(x[ib+1].qs);
         q8b.val[0]   = vld1q_s8(y[ib+0].qs);
@@ -9506,16 +9530,17 @@ void lm_ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const voi
         q8b.val[2]   = vld1q_s8(y[ib+1].qs);
         q8b.val[3]   = vld1q_s8(y[ib+1].qs + 16);

-        q4b.val[0] = vqtbl1q_s8(values, vandq_u8(q4bits.val[0], m4b));
-        q4b.val[1] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
-        q4b.val[2] = vqtbl1q_s8(values, vandq_u8(q4bits.val[1], m4b));
-        q4b.val[3] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+        q4b.val[0] = lm_ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
+        q4b.val[1] = lm_ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+        q4b.val[2] = lm_ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
+        q4b.val[3] = lm_ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));

         prod_1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
         prod_2 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

-        sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);
-
+        sumf +=
+            LM_GGML_FP16_TO_FP32(x[ib+0].d) * LM_GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
+            LM_GGML_FP16_TO_FP32(x[ib+1].d) * LM_GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
     }

     *s = sumf;
diff --git a/cpp/ggml.c b/cpp/ggml.c
index 1300352..94838f4 100644
--- a/cpp/ggml.c
+++ b/cpp/ggml.c
@@ -323,7 +323,7 @@ float lm_ggml_table_f32_f16[1 << 16];
 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
 float lm_ggml_fp16_to_fp32(lm_ggml_fp16_t x) {
-    return (float) LM_GGML_FP16_TO_FP32(x);
+    return LM_GGML_FP16_TO_FP32(x);
 }

 lm_ggml_fp16_t lm_ggml_fp32_to_fp16(float x) {
@@ -798,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) {
     #define LM_GGML_F16x8              float16x8_t
     #define LM_GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
     #define LM_GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-    #define LM_GGML_F16x8_LOAD         vld1q_f16
+    #define LM_GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
     #define LM_GGML_F16x8_STORE        vst1q_f16
     #define LM_GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
     #define LM_GGML_F16x8_ADD          vaddq_f16
@@ -841,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) {
     #define LM_GGML_F32Cx4              float32x4_t
     #define LM_GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
     #define LM_GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-    #define LM_GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16(x))
+    #define LM_GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
     #define LM_GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
     #define LM_GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
     #define LM_GGML_F32Cx4_ADD          vaddq_f32
diff --git a/cpp/ggml.h b/cpp/ggml.h
index cb359a9..1f9d6a8 100644
--- a/cpp/ggml.h
+++ b/cpp/ggml.h
@@ -315,13 +315,7 @@ extern "C" {
 #endif

-#if defined(__ARM_NEON) && defined(__CUDACC__)
-    typedef half lm_ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-    typedef __fp16 lm_ggml_fp16_t;
-#else
     typedef uint16_t lm_ggml_fp16_t;
-#endif

     // convert FP16 <-> FP32
     LM_GGML_API float lm_ggml_fp16_to_fp32(lm_ggml_fp16_t x);
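With the ggml.h change above, lm_ggml_fp16_t is always a plain uint16_t in the public header, so callers outside ggml.c must convert through the API rather than casting. A minimal sketch, assuming this tree's ggml.h is on the include path:

    // sketch: converting values through the public API now that lm_ggml_fp16_t is a bare uint16_t
    #include "ggml.h"

    void fill_fp16(lm_ggml_fp16_t * dst, const float * src, int n) {
        for (int i = 0; i < n; ++i) {
            dst[i] = lm_ggml_fp32_to_fp16(src[i]); // a (lm_ggml_fp16_t) cast would just truncate the integer
        }
    }

    float read_fp16(const lm_ggml_fp16_t * src, int i) {
        return lm_ggml_fp16_to_fp32(src[i]);
    }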
"blk.%d.ffn_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, @@ -4065,7 +4064,12 @@ static bool llm_load_tensors( // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); + + // same as tok_embd, duplicated to allow offloading + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += lm_ggml_nbytes(model.output); } for (int i = 0; i < n_layer; ++i) { @@ -4074,14 +4078,23 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); - layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false); // AWQ ScaleActivation layer layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); @@ -6182,7 +6195,7 @@ struct llm_build_context { attn_norm = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, - NULL, + model.layers[il].attn_norm_b, LLM_NORM, cb, il); cb(attn_norm, "attn_norm", il); @@ -6193,6 +6206,11 @@ struct llm_build_context { cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); + if (model.layers[il].bqkv){ + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + if (hparams.f_clamp_kqv > 0.0f) { cur = lm_ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(cur, "wqkv_clamped", il); @@ -6209,7 +6227,7 @@ struct llm_build_context { Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, NULL, + model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6222,13 
@@ -6222,13 +6240,13 @@ struct llm_build_context {
             {
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                         model.layers[il].ffn_norm,
-                        NULL,
+                        model.layers[il].ffn_norm_b,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                         NULL,                      NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -6245,7 +6263,7 @@ struct llm_build_context {
         cur = llm_build_norm(ctx0, cur, hparams,
                 model.output_norm,
-                NULL,
+                model.output_norm_b,
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);
@@ -7445,6 +7463,7 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
         cb(inpL, "inp_embd", -1);
+
         inpL = lm_ggml_scale(ctx0, inpL, sqrtf(n_embd));
         cb(inpL, "inp_scaled", -1);
@@ -7486,6 +7505,7 @@ struct llm_build_context {
                         n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);
+
                 Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
                 cb(Qcur, "Qcur_scaled", il);
@@ -7500,6 +7520,7 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
+
             struct lm_ggml_tensor * sa_out = lm_ggml_add(ctx0, cur, inpL);
             cb(sa_out, "sa_out", il);
@@ -10490,7 +10511,10 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type
         return std::make_pair(i_layer, n_layer);
     };

-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = LM_GGML_TYPE_Q8_0;
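The get_k_quant_type change above covers architectures whose tensor map has no LLM_TENSOR_OUTPUT entry: their logits come straight from token_embd.weight (as in the loader change earlier), so that tensor is quantized like an output tensor. A reduced sketch of the check, assuming the LLM_TENSOR_NAMES table from llama.cpp (the helper name is hypothetical):

    // reduced sketch of the rule added above (hypothetical helper, C++)
    static bool output_shares_tok_embd(llm_arch arch) {
        const auto & names = LLM_TENSOR_NAMES.at(arch);
        return names.find(LLM_TENSOR_OUTPUT) == names.end();
    }
    // if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
    //     (output_shares_tok_embd(arch) && name == "token_embd.weight")) { ... }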
"model" : message->role; + ss << "" << role << "\n"; + if (!system_prompt.empty() && role != "model") { + ss << system_prompt << "\n\n"; + system_prompt = ""; + } + ss << trim(message->content) << "\n"; + } + if (add_ass) { + ss << "model\n"; + } } else { // template not supported return -1; diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 43a998f..bbcc719 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -8,7 +8,7 @@ PODS: - hermes-engine/Pre-built (= 0.72.3) - hermes-engine/Pre-built (0.72.3) - libevent (2.1.12) - - llama-rn (0.3.0-rc.14): + - llama-rn (0.3.0-rc.15): - RCT-Folly - RCTRequired - RCTTypeSafety @@ -1261,7 +1261,7 @@ SPEC CHECKSUMS: glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322 libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913 - llama-rn: 26d2d3c08a3e788889a4833c4678ea9ccbbb1f33 + llama-rn: 05393c3a05d9992952d3f2f96452b00dc145e06e RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1 RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18 RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3 diff --git a/llama.cpp b/llama.cpp index 973053d..15499eb 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 973053d8b0d04809836b3339a50f68d9c842de90 +Subproject commit 15499eb94227401bdc8875da6eb85c15d37068f7 diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index 51f29d5..8c5627a 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,5 +1,5 @@ ---- llama.cpp.orig 2024-02-22 12:19:15 -+++ llama.cpp 2024-02-22 12:19:17 +--- llama.cpp.orig 2024-02-23 19:15:45 ++++ llama.cpp 2024-02-23 19:15:46 @@ -114,6 +114,17 @@ #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) @@ -18,7 +18,7 @@ // // helpers // -@@ -1068,16 +1079,16 @@ +@@ -1067,16 +1078,16 @@ if (prefetch > 0) { // advise the kernel to preload the mapped memory