Commit

Merge branch 'ggerganov:master' into master
sealad886 authored Jun 21, 2024
2 parents 59e694d + 7d5e877 commit a42a18e
Showing 13 changed files with 867 additions and 95 deletions.
common/common.cpp: 2 additions & 0 deletions
@@ -541,6 +541,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
         else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
         else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+        else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
         else { invalid_param = true; }
         return true;
     }
@@ -1869,6 +1870,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "backend" });
+    options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });

     if (llama_supports_mlock()) {
         options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
     }
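Note: the new "last" branch above means --pooling last now maps to LLAMA_POOLING_TYPE_LAST. As a rough illustration only (the helper name make_embedding_ctx and the assumption of an already-loaded model are mine, not part of this diff), the same choice can be made directly through the C API:

#include "llama.h"

// Illustrative sketch, not part of this commit: create a context that uses
// last-token pooling, i.e. the value the new "--pooling last" option maps to.
static llama_context * make_embedding_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                     // ask decode for embeddings
    cparams.pooling_type = LLAMA_POOLING_TYPE_LAST;  // pool a sequence to its last token's state
    return llama_new_context_with_model(model, cparams);
}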
convert-hf-to-gguf-update.py: 2 additions & 2 deletions
@@ -214,15 +214,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
 """

 convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
-convert_py = convert_py_pth.read_text()
+convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
     r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
     lambda m: m.group(1) + src_func + m.group(3),
     convert_py,
     flags=re.DOTALL | re.MULTILINE,
 )

-convert_py_pth.write_text(convert_py)
+convert_py_pth.write_text(convert_py, encoding="utf-8")

 logger.info("+++ convert-hf-to-gguf.py was updated")
examples/embedding/embedding.cpp: 11 additions & 10 deletions
@@ -17,9 +17,10 @@ static std::vector<std::string> split_lines(const std::string & s) {
     return lines;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

@@ -40,13 +41,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu

         // try to get sequence embeddings - supported only when pooling_type is not NONE
         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

         float * out = output + batch.seq_id[i][0] * n_embd;
         //TODO: I would also add a parameter here to enable normalization or not.
@@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
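The two embedding.cpp changes go together: batch_add_seq now enables output for every token, and main() rejects pooling type NONE, so a pooled embedding can always be read back per sequence. A simplified sketch of the resulting calling pattern, written as if it lived in embedding.cpp next to batch_add_seq (the wrapper name embed_prompts is hypothetical, and it assumes all tokenized prompts fit in a single batch):

#include <algorithm>
#include <vector>
#include "llama.h"

// Simplified sketch: one sequence per prompt, every position requests output,
// then one pooled embedding per sequence is copied out after decoding.
static void embed_prompts(llama_context * ctx, llama_model * model,
                          const std::vector<std::vector<int32_t>> & prompts_tok,
                          std::vector<float> & out) {
    const int n_prompts = (int) prompts_tok.size();
    const int n_embd    = llama_n_embd(model);

    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, n_prompts);
    for (llama_seq_id s = 0; s < n_prompts; s++) {
        batch_add_seq(batch, prompts_tok[s], s);              // all tokens, output enabled
    }
    llama_decode(ctx, batch);

    out.resize((size_t) n_prompts * n_embd);
    for (llama_seq_id s = 0; s < n_prompts; s++) {
        const float * emb = llama_get_embeddings_seq(ctx, s); // pooled vector of length n_embd
        std::copy(emb, emb + n_embd, out.begin() + (size_t) s * n_embd);
    }
    llama_batch_free(batch);
}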
examples/gritlm/gritlm.cpp: 4 additions & 2 deletions
@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);

     // run model
@@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
     llama_token eos_token = llama_token_eos(mdl);

     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
+
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

     std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@@ -166,8 +169,7 @@ int main(int argc, char * argv[]) {

     llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);

-    // create new context - set to embedding mode
-    cparams.embeddings = true;
+    // create generation context
     llama_context * ctx = llama_new_context_with_model(mdl, cparams);

     // ### Embedding/Representation ###
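Previously gritlm.cpp created the context with cparams.embeddings = true; this change toggles the mode at runtime instead, so one context serves both the embedding pass and the generation pass. Reduced to its core, the pattern is the sketch below (the wrapper name set_embedding_mode is mine; the three calls are the ones that appear together in encode() and generate() above):

#include "llama.h"

// Sketch of the runtime toggle used here: one llama_context is switched between
// embedding mode and generation mode instead of being created for just one.
static void set_embedding_mode(llama_context * ctx, bool embd) {
    llama_kv_cache_clear(ctx);          // prior cache contents are irrelevant across modes
    llama_set_embeddings(ctx, embd);    // true: pooled embeddings; false: logits
    llama_set_causal_attn(ctx, !embd);  // non-causal for embedding, causal for generation
}

encode() effectively performs the embd == true half before decoding, and generate() the embd == false half.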
examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift: 21 additions & 14 deletions
@@ -131,22 +131,29 @@ class LlamaState: ObservableObject {

         messageLog += "\(text)"

-        while await llamaContext.n_cur < llamaContext.n_len {
-            let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
-        }
+        Task.detached {
+            while await llamaContext.n_cur < llamaContext.n_len {
+                let result = await llamaContext.completion_loop()
+                await MainActor.run {
+                    self.messageLog += "\(result)"
+                }
+            }

-        let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
-        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
+            let t_end = DispatchTime.now().uptimeNanoseconds
+            let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
+            let tokens_per_second = Double(await llamaContext.n_len) / t_generation

-        await llamaContext.clear()
-        messageLog += """
-            \n
-            Done
-            Heat up took \(t_heat)s
-            Generated \(tokens_per_second) t/s\n
-            """
+            await llamaContext.clear()
+
+            await MainActor.run {
+                self.messageLog += """
+                    \n
+                    Done
+                    Heat up took \(t_heat)s
+                    Generated \(tokens_per_second) t/s\n
+                    """
+            }
+        }
     }

     func bench() async {
examples/retrieval/retrieval.cpp: 10 additions & 3 deletions
@@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     return chunks;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

@@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
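retrieval.cpp picks up the same batch_add_seq signature and the same pooling guard as embedding.cpp. With pooling enabled, each file chunk and the query reduce to a single vector, so the retrieval step becomes a cosine-similarity ranking over those vectors; an illustrative, self-contained helper for that step (not part of the diff) is:

#include <cmath>

// Illustrative helper, not from this commit: cosine similarity between two
// pooled embedding vectors of length n.
static float cosine_sim(const float * a, const float * b, int n) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (int i = 0; i < n; i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    return (float) (dot / (std::sqrt(na) * std::sqrt(nb) + 1e-9));
}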
(The diffs for the remaining 7 of the 13 changed files are not shown here.)

0 comments on commit a42a18e
