backend: rebase llama.cpp on upstream as of Sep 26th #2998

Merged · 5 commits · Sep 27, 2024
Changes from 4 commits
2 changes: 1 addition & 1 deletion gpt4all-backend/deps/llama.cpp-mainline
10 changes: 6 additions & 4 deletions gpt4all-backend/include/gpt4all-backend/llmodel.h
@@ -7,6 +7,7 @@
#include <cstdint>
#include <functional>
#include <optional>
#include <span>
#include <stdexcept>
#include <string>
#include <string_view>
@@ -149,9 +150,9 @@ class LLModel {
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
virtual bool isModelLoaded() const = 0;
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
virtual size_t stateSize() const { return 0; }
virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
virtual size_t stateSize() const = 0;
virtual size_t saveState(std::span<uint8_t> dest) const = 0;
virtual size_t restoreState(std::span<const uint8_t> src) = 0;

// This method requires the model to return true from supportsCompletion otherwise it will throw
// an error
@@ -215,7 +216,8 @@ class LLModel {
virtual std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special = false) = 0;
virtual bool isSpecialToken(Token id) const = 0;
virtual std::string tokenToString(Token id) const = 0;
virtual Token sampleToken(PromptContext &ctx) const = 0;
virtual void initSampler(PromptContext &ctx) = 0;
virtual Token sampleToken() const = 0;
virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
virtual void shiftContext(PromptContext &promptCtx) = 0;
virtual int32_t contextLength() const = 0;
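
A minimal caller-side sketch of the new span-based state interface (not part of this diff): stateSize(), saveState(), and restoreState() are now pure virtual, so a caller sizes the buffer first and passes it with an explicit length. The helper functions below are hypothetical; only the LLModel method names come from the header above.

    #include "llmodel.h"

    #include <cstdint>
    #include <span>
    #include <vector>

    // Capture the full model state into a caller-owned buffer.
    std::vector<uint8_t> checkpointState(const LLModel &model) {
        std::vector<uint8_t> buf(model.stateSize());  // query the required size first
        size_t written = model.saveState(buf);        // vector converts to std::span<uint8_t>
        buf.resize(written);                          // zero indicates an error
        return buf;
    }

    // Restore a previously captured state; zero bytes read indicates an error.
    bool restoreCheckpoint(LLModel &model, std::span<const uint8_t> state) {
        return model.restoreState(state) != 0;
    }
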
12 changes: 7 additions & 5 deletions gpt4all-backend/include/gpt4all-backend/llmodel_c.h
@@ -148,18 +148,20 @@ uint64_t llmodel_get_state_size(llmodel_model model);
* NOTE: This state data is specific to the type of model you have created.
* @param model A pointer to the llmodel_model instance.
* @param dest A pointer to the destination.
* @return the number of bytes copied
* @param size The size of the destination buffer.
* @return the number of bytes copied, or zero on error.
*/
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size);

/**
* Restores the internal state of the model using data from the specified address.
* NOTE: This state data is specific to the type of model you have created.
* @param model A pointer to the llmodel_model instance.
* @param src A pointer to the src.
* @return the number of bytes read
* @param src A pointer to the state data.
* @param size The size of the source data.
* @return The number of bytes read, or zero on error.
*/
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size);

/**
* Generate a response using the model.
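
A corresponding sketch for the C API (not part of this diff): the caller now passes the buffer size explicitly, and a zero return signals an error. The wrapper functions below are hypothetical; model is assumed to be an already-constructed, loaded llmodel_model.

    #include "llmodel_c.h"

    #include <cstdint>
    #include <vector>

    // Save the model state; an empty result indicates an error.
    std::vector<uint8_t> saveModelState(llmodel_model model) {
        std::vector<uint8_t> buf(llmodel_get_state_size(model));
        uint64_t n = llmodel_save_state_data(model, buf.data(), buf.size());
        buf.resize(n);
        return buf;
    }

    // Restore a previously saved state; returns false on error.
    bool restoreModelState(llmodel_model model, const std::vector<uint8_t> &state) {
        return llmodel_restore_state_data(model, state.data(), state.size()) != 0;
    }
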
7 changes: 5 additions & 2 deletions gpt4all-backend/llama.cpp.cmake
@@ -978,10 +978,13 @@ function(include_ggml SUFFIX)

add_library(llama${SUFFIX} STATIC
${DIRECTORY}/include/llama.h
${DIRECTORY}/src/llama-grammar.cpp
${DIRECTORY}/src/llama-sampling.cpp
${DIRECTORY}/src/llama-vocab.cpp
${DIRECTORY}/src/llama.cpp
${DIRECTORY}/src/unicode.h
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode-data.cpp
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode.h
)

target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
174 changes: 93 additions & 81 deletions gpt4all-backend/src/llamamodel.cpp
@@ -2,6 +2,7 @@
#include "llamamodel_impl.h"

#include "llmodel.h"
#include "utils.h"

#include <ggml.h>
#include <llama.h>
@@ -103,26 +104,34 @@ static bool llama_verbose()
return var && *var;
}

static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
static void llama_log_callback(ggml_log_level level, const char *text, void *userdata, bool warn)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
fputs(text, stderr);
}
}

#ifdef GGML_USE_CUDA
static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
fputs(text, stderr);
static ggml_log_level lastlevel = GGML_LOG_LEVEL_NONE;
if (!llama_verbose()) {
auto efflevel = level == GGML_LOG_LEVEL_CONT ? lastlevel : level;
lastlevel = efflevel;
switch (efflevel) {
case GGML_LOG_LEVEL_CONT:
UNREACHABLE();
break;
case GGML_LOG_LEVEL_WARN:
if (warn) break;
[[fallthrough]];
case GGML_LOG_LEVEL_NONE: // not used?
case GGML_LOG_LEVEL_INFO:
case GGML_LOG_LEVEL_DEBUG:
return; // suppress
case GGML_LOG_LEVEL_ERROR:
;
}
}

fputs(text, stderr);
}
#endif

struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt

// sampling parameters
@@ -137,44 +146,6 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory
};

static llama_token llama_sample_top_p_top_k(
llama_context *ctx,
const llama_token *last_n_tokens_data,
int last_n_tokens_size,
int top_k,
float top_p,
float min_p,
float temp,
float repeat_penalty) {
auto logits = llama_get_logits_ith(ctx, -1);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
// Populate initial list of all candidates
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (int token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Sample repeat penalty
llama_sample_repetition_penalties(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty, 0.0f, 0.0f);

llama_token id;
if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
// temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_min_p(ctx, &candidates_p, min_p, 1);
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
return id;
}

const char *get_arch_name(gguf_context *ctx_gguf)
{
const int kid = gguf_find_key(ctx_gguf, "general.architecture");
@@ -241,21 +212,26 @@ static int32_t get_arch_key_u32(std::string const &modelPath, std::string const
}

struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
llama_context_params ctx_params;
int64_t n_threads = 0;
std::vector<LLModel::Token> end_tokens;
const char *backend_name = nullptr;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
int64_t n_threads = 0;
std::vector<LLModel::Token> end_tokens;
const char *backend_name = nullptr;

llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
llama_context_params ctx_params;
llama_sampler *sampler_chain;
};

LLamaModel::LLamaModel()
: d_ptr(new LLamaPrivate) {}
: d_ptr(std::make_unique<LLamaPrivate>())
{
auto sparams = llama_sampler_chain_default_params();
d_ptr->sampler_chain = llama_sampler_chain_init(sparams);
}

// default hparams (LLaMA 7B)
struct llama_file_hparams {
@@ -444,10 +420,9 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
}
}

d_ptr->ctx_params.n_ctx = n_ctx;
d_ptr->ctx_params.seed = params.seed;
d_ptr->ctx_params.type_k = params.kv_type;
d_ptr->ctx_params.type_v = params.kv_type;
d_ptr->ctx_params.n_ctx = n_ctx;
d_ptr->ctx_params.type_k = params.kv_type;
d_ptr->ctx_params.type_v = params.kv_type;

// The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
// that we want this many logits so the state serializes consistently.
@@ -513,6 +488,7 @@ LLamaModel::~LLamaModel()
llama_free(d_ptr->ctx);
}
llama_free_model(d_ptr->model);
llama_sampler_free(d_ptr->sampler_chain);
}

bool LLamaModel::isModelLoaded() const
@@ -522,18 +498,17 @@ bool LLamaModel::isModelLoaded() const

size_t LLamaModel::stateSize() const
{
return llama_get_state_size(d_ptr->ctx);
return llama_state_get_size(d_ptr->ctx);
}

size_t LLamaModel::saveState(uint8_t *dest) const
size_t LLamaModel::saveState(std::span<uint8_t> dest) const
{
return llama_copy_state_data(d_ptr->ctx, dest);
return llama_state_get_data(d_ptr->ctx, dest.data(), dest.size());
}

size_t LLamaModel::restoreState(const uint8_t *src)
size_t LLamaModel::restoreState(std::span<const uint8_t> src)
{
// const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
return llama_state_set_data(d_ptr->ctx, src.data(), src.size());
}

std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, std::string_view str, bool special)
@@ -573,13 +548,50 @@ std::string LLamaModel::tokenToString(Token id) const
return std::string(result.data(), result.size());
}

LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
void LLamaModel::initSampler(PromptContext &promptCtx)
{
auto *model = d_ptr->model;
auto *chain = d_ptr->sampler_chain;

// clear sampler chain
for (int i = llama_sampler_chain_n(chain) - 1; i >= 0; i--) {
auto *smpl = llama_sampler_chain_remove(chain, i);
llama_sampler_free(smpl);
}

// build new chain
llama_sampler_chain_add(chain,
llama_sampler_init_penalties(
llama_n_vocab(model),
llama_token_eos(model),
llama_token_nl(model),
promptCtx.repeat_last_n,
promptCtx.repeat_penalty,
// TODO(jared): consider making the below configurable
/*penalty_freq*/ 0.0f,
/*penalty_present*/ 0.0f,
/*penalize_nl*/ true,
/*ignore_eos*/ false
)
);
if (promptCtx.temp == 0.0f) {
llama_sampler_chain_add(chain, llama_sampler_init_greedy());
} else {
struct llama_sampler *samplers[] = {
llama_sampler_init_top_k(promptCtx.top_k),
llama_sampler_init_top_p(promptCtx.top_p, 1),
llama_sampler_init_min_p(promptCtx.min_p, 1),
llama_sampler_init_temp(promptCtx.temp),
llama_sampler_init_dist(LLAMA_DEFAULT_SEED)
};
for (auto *smpl : samplers)
llama_sampler_chain_add(chain, smpl);
}
}

LLModel::Token LLamaModel::sampleToken() const
{
const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
return llama_sample_top_p_top_k(d_ptr->ctx,
promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.min_p, promptCtx.temp,
promptCtx.repeat_penalty);
return llama_sampler_sample(d_ptr->sampler_chain, d_ptr->ctx, -1);
}

bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
@@ -1227,9 +1239,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)

DLL_EXPORT LLModel *construct()
{
llama_log_set(llama_log_callback, nullptr);
llama_log_set([](auto l, auto t, auto u) { llama_log_callback(l, t, u, false); }, nullptr);
#ifdef GGML_USE_CUDA
ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
ggml_backend_cuda_log_set_callback([](auto l, auto t, auto u) { llama_log_callback(l, t, u, true); }, nullptr);
#endif
return new LLamaModel;
}
8 changes: 5 additions & 3 deletions gpt4all-backend/src/llamamodel_impl.h
@@ -7,6 +7,7 @@
#include "llmodel.h"

#include <memory>
#include <span>
#include <string>
#include <string_view>
#include <vector>
@@ -27,8 +28,8 @@ class LLamaModel : public LLModel {
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
size_t saveState(std::span<uint8_t> dest) const override;
size_t restoreState(std::span<const uint8_t> src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
@@ -56,7 +57,8 @@ class LLamaModel : public LLModel {
std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special) override;
bool isSpecialToken(Token id) const override;
std::string tokenToString(Token id) const override;
Token sampleToken(PromptContext &ctx) const override;
void initSampler(PromptContext &ctx) override;
Token sampleToken() const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
void shiftContext(PromptContext &promptCtx) override;
int32_t contextLength() const override;
8 changes: 4 additions & 4 deletions gpt4all-backend/src/llmodel_c.cpp
@@ -91,16 +91,16 @@ uint64_t llmodel_get_state_size(llmodel_model model)
return wrapper->llModel->stateSize();
}

uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size)
{
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->saveState(dest);
return wrapper->llModel->saveState({dest, size_t(size)});
}

uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size)
{
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->restoreState(src);
return wrapper->llModel->restoreState({src, size_t(size)});
}

void llmodel_prompt(llmodel_model model, const char *prompt,
4 changes: 3 additions & 1 deletion gpt4all-backend/src/llmodel_shared.cpp
@@ -244,14 +244,16 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
return;
}

initSampler(promptCtx);

std::string cachedResponse;
std::vector<Token> cachedTokens;
int n_predicted = 0;

// Predict next tokens
for (bool stop = false; !stop;) {
// Sample next token
std::optional<Token> new_tok = sampleToken(promptCtx);
std::optional<Token> new_tok = sampleToken();
std::string new_piece = tokenToString(new_tok.value());
cachedTokens.push_back(new_tok.value());
cachedResponse += new_piece;