backend: rebase llama.cpp on upstream as of Sep 26th #2998

Merged · 5 commits · Sep 27, 2024
Changes from 4 commits
2 changes: 1 addition & 1 deletion gpt4all-backend/deps/llama.cpp-mainline
10 changes: 6 additions & 4 deletions gpt4all-backend/include/gpt4all-backend/llmodel.h
@@ -7,6 +7,7 @@
#include <cstdint>
#include <functional>
#include <optional>
#include <span>
#include <stdexcept>
#include <string>
#include <string_view>
@@ -149,9 +150,9 @@ class LLModel {
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
virtual bool isModelLoaded() const = 0;
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
virtual size_t stateSize() const { return 0; }
virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
virtual size_t stateSize() const = 0;
virtual size_t saveState(std::span<uint8_t> dest) const = 0;
virtual size_t restoreState(std::span<const uint8_t> src) = 0;

// This method requires the model to return true from supportsCompletion otherwise it will throw
// an error
@@ -215,7 +216,8 @@ class LLModel {
virtual std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special = false) = 0;
virtual bool isSpecialToken(Token id) const = 0;
virtual std::string tokenToString(Token id) const = 0;
virtual Token sampleToken(PromptContext &ctx) const = 0;
virtual void initSampler(PromptContext &ctx) = 0;
virtual Token sampleToken() const = 0;
virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
virtual void shiftContext(PromptContext &promptCtx) = 0;
virtual int32_t contextLength() const = 0;
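
A minimal caller-side sketch of the new span-based state interface (not part of this diff): stateSize(), saveState(), and restoreState() are now pure virtual, so a caller sizes the buffer first and passes it with an explicit length. The helper functions below are hypothetical; only the LLModel method names come from the header above.

    #include "llmodel.h"

    #include <cstdint>
    #include <span>
    #include <vector>

    // Capture the full model state into a caller-owned buffer.
    std::vector<uint8_t> checkpointState(const LLModel &model) {
        std::vector<uint8_t> buf(model.stateSize());  // query the required size first
        size_t written = model.saveState(buf);        // vector converts to std::span<uint8_t>
        buf.resize(written);                          // zero indicates an error
        return buf;
    }

    // Restore a previously captured state; zero bytes read indicates an error.
    bool restoreCheckpoint(LLModel &model, std::span<const uint8_t> state) {
        return model.restoreState(state) != 0;
    }
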
12 changes: 7 additions & 5 deletions gpt4all-backend/include/gpt4all-backend/llmodel_c.h
@@ -148,18 +148,20 @@ uint64_t llmodel_get_state_size(llmodel_model model);
* NOTE: This state data is specific to the type of model you have created.
* @param model A pointer to the llmodel_model instance.
* @param dest A pointer to the destination.
* @return the number of bytes copied
* @param size The size of the destination buffer.
* @return the number of bytes copied, or zero on error.
*/
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size);

/**
* Restores the internal state of the model using data from the specified address.
* NOTE: This state data is specific to the type of model you have created.
* @param model A pointer to the llmodel_model instance.
* @param src A pointer to the src.
* @return the number of bytes read
* @param src A pointer to the state data.
* @param size The size of the source data.
* @return The number of bytes read, or zero on error.
*/
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size);

/**
* Generate a response using the model.
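
A corresponding sketch for the C API (not part of this diff): the caller now passes the buffer size explicitly, and a zero return signals an error. The wrapper functions below are hypothetical; model is assumed to be an already-constructed, loaded llmodel_model.

    #include "llmodel_c.h"

    #include <cstdint>
    #include <vector>

    // Save the model state; an empty result indicates an error.
    std::vector<uint8_t> saveModelState(llmodel_model model) {
        std::vector<uint8_t> buf(llmodel_get_state_size(model));
        uint64_t n = llmodel_save_state_data(model, buf.data(), buf.size());
        buf.resize(n);
        return buf;
    }

    // Restore a previously saved state; returns false on error.
    bool restoreModelState(llmodel_model model, const std::vector<uint8_t> &state) {
        return llmodel_restore_state_data(model, state.data(), state.size()) != 0;
    }
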
7 changes: 5 additions & 2 deletions gpt4all-backend/llama.cpp.cmake
@@ -978,10 +978,13 @@ function(include_ggml SUFFIX)

add_library(llama${SUFFIX} STATIC
${DIRECTORY}/include/llama.h
${DIRECTORY}/src/llama-grammar.cpp
${DIRECTORY}/src/llama-sampling.cpp
${DIRECTORY}/src/llama-vocab.cpp
${DIRECTORY}/src/llama.cpp
${DIRECTORY}/src/unicode.h
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode-data.cpp
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode.h
)

target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
174 changes: 93 additions & 81 deletions gpt4all-backend/src/llamamodel.cpp
@@ -2,6 +2,7 @@
#include "llamamodel_impl.h"

#include "llmodel.h"
#include "utils.h"

#include <ggml.h>
#include <llama.h>
@@ -103,26 +104,34 @@ static bool llama_verbose()
return var && *var;
}

static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
static void llama_log_callback(ggml_log_level level, const char *text, void *userdata, bool warn)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
fputs(text, stderr);
}
}

#ifdef GGML_USE_CUDA
static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
fputs(text, stderr);
static ggml_log_level lastlevel = GGML_LOG_LEVEL_NONE;
if (!llama_verbose()) {
auto efflevel = level == GGML_LOG_LEVEL_CONT ? lastlevel : level;
lastlevel = efflevel;
switch (efflevel) {
case GGML_LOG_LEVEL_CONT:
UNREACHABLE();
break;
case GGML_LOG_LEVEL_WARN:
if (warn) break;
[[fallthrough]];
case GGML_LOG_LEVEL_NONE: // not used?
case GGML_LOG_LEVEL_INFO:
case GGML_LOG_LEVEL_DEBUG:
return; // suppress
case GGML_LOG_LEVEL_ERROR:
;
}
}

fputs(text, stderr);
}
#endif

struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt

// sampling parameters
@@ -137,44 +146,6 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory
};

static llama_token llama_sample_top_p_top_k(
llama_context *ctx,
const llama_token *last_n_tokens_data,
int last_n_tokens_size,
int top_k,
float top_p,
float min_p,
float temp,
float repeat_penalty) {
auto logits = llama_get_logits_ith(ctx, -1);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
// Populate initial list of all candidates
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (int token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Sample repeat penalty
llama_sample_repetition_penalties(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty, 0.0f, 0.0f);

llama_token id;
if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
// temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_min_p(ctx, &candidates_p, min_p, 1);
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
return id;
}

const char *get_arch_name(gguf_context *ctx_gguf)
{
const int kid = gguf_find_key(ctx_gguf, "general.architecture");
@@ -241,21 +212,26 @@ static int32_t get_arch_key_u32(std::string const &modelPath, std::string const
}

struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
llama_context_params ctx_params;
int64_t n_threads = 0;
std::vector<LLModel::Token> end_tokens;
const char *backend_name = nullptr;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
int64_t n_threads = 0;
std::vector<LLModel::Token> end_tokens;
const char *backend_name = nullptr;

llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
llama_context_params ctx_params;
llama_sampler *sampler_chain;
};

LLamaModel::LLamaModel()
: d_ptr(new LLamaPrivate) {}
: d_ptr(std::make_unique<LLamaPrivate>())
{
auto sparams = llama_sampler_chain_default_params();
d_ptr->sampler_chain = llama_sampler_chain_init(sparams);
}

// default hparams (LLaMA 7B)
struct llama_file_hparams {
@@ -444,10 +420,9 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
}
}

d_ptr->ctx_params.n_ctx = n_ctx;
d_ptr->ctx_params.seed = params.seed;
d_ptr->ctx_params.type_k = params.kv_type;
d_ptr->ctx_params.type_v = params.kv_type;
d_ptr->ctx_params.n_ctx = n_ctx;
d_ptr->ctx_params.type_k = params.kv_type;
d_ptr->ctx_params.type_v = params.kv_type;

// The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
// that we want this many logits so the state serializes consistently.
@@ -513,6 +488,7 @@ LLamaModel::~LLamaModel()
llama_free(d_ptr->ctx);
}
llama_free_model(d_ptr->model);
llama_sampler_free(d_ptr->sampler_chain);
}

bool LLamaModel::isModelLoaded() const
@@ -522,18 +498,17 @@ bool LLamaModel::isModelLoaded() const

size_t LLamaModel::stateSize() const
{
return llama_get_state_size(d_ptr->ctx);
return llama_state_get_size(d_ptr->ctx);
}

size_t LLamaModel::saveState(uint8_t *dest) const
size_t LLamaModel::saveState(std::span<uint8_t> dest) const
{
return llama_copy_state_data(d_ptr->ctx, dest);
return llama_state_get_data(d_ptr->ctx, dest.data(), dest.size());
}

size_t LLamaModel::restoreState(const uint8_t *src)
size_t LLamaModel::restoreState(std::span<const uint8_t> src)
{
// const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
return llama_state_set_data(d_ptr->ctx, src.data(), src.size());
}

std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, std::string_view str, bool special)
@@ -573,13 +548,50 @@ std::string LLamaModel::tokenToString(Token id) const
return std::string(result.data(), result.size());
}

LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
void LLamaModel::initSampler(PromptContext &promptCtx)
{
auto *model = d_ptr->model;
auto *chain = d_ptr->sampler_chain;

// clear sampler chain
for (int i = llama_sampler_chain_n(chain) - 1; i >= 0; i--) {
auto *smpl = llama_sampler_chain_remove(chain, i);
llama_sampler_free(smpl);
}

// build new chain
llama_sampler_chain_add(chain,
llama_sampler_init_penalties(
llama_n_vocab(model),
llama_token_eos(model),
llama_token_nl(model),
promptCtx.repeat_last_n,
promptCtx.repeat_penalty,
// TODO(jared): consider making the below configurable
/*penalty_freq*/ 0.0f,
/*penalty_present*/ 0.0f,
/*penalize_nl*/ true,
/*ignore_eos*/ false
)
);
if (promptCtx.temp == 0.0f) {
llama_sampler_chain_add(chain, llama_sampler_init_greedy());
} else {
struct llama_sampler *samplers[] = {
llama_sampler_init_top_k(promptCtx.top_k),
llama_sampler_init_top_p(promptCtx.top_p, 1),
llama_sampler_init_min_p(promptCtx.min_p, 1),
llama_sampler_init_temp(promptCtx.temp),
llama_sampler_init_dist(LLAMA_DEFAULT_SEED)
};
for (auto *smpl : samplers)
llama_sampler_chain_add(chain, smpl);
}
}

LLModel::Token LLamaModel::sampleToken() const
{
const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
return llama_sample_top_p_top_k(d_ptr->ctx,
promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.min_p, promptCtx.temp,
promptCtx.repeat_penalty);
return llama_sampler_sample(d_ptr->sampler_chain, d_ptr->ctx, -1);
}

bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
@@ -1227,9 +1239,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)

DLL_EXPORT LLModel *construct()
{
llama_log_set(llama_log_callback, nullptr);
llama_log_set([](auto l, auto t, auto u) { llama_log_callback(l, t, u, false); }, nullptr);
#ifdef GGML_USE_CUDA
ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
ggml_backend_cuda_log_set_callback([](auto l, auto t, auto u) { llama_log_callback(l, t, u, true); }, nullptr);
#endif
return new LLamaModel;
}
8 changes: 5 additions & 3 deletions gpt4all-backend/src/llamamodel_impl.h
@@ -7,6 +7,7 @@
#include "llmodel.h"

#include <memory>
#include <span>
#include <string>
#include <string_view>
#include <vector>
@@ -27,8 +28,8 @@ class LLamaModel : public LLModel {
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
size_t saveState(std::span<uint8_t> dest) const override;
size_t restoreState(std::span<const uint8_t> src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
@@ -56,7 +57,8 @@ class LLamaModel : public LLModel {
std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special) override;
bool isSpecialToken(Token id) const override;
std::string tokenToString(Token id) const override;
Token sampleToken(PromptContext &ctx) const override;
void initSampler(PromptContext &ctx) override;
Token sampleToken() const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
void shiftContext(PromptContext &promptCtx) override;
int32_t contextLength() const override;
8 changes: 4 additions & 4 deletions gpt4all-backend/src/llmodel_c.cpp
@@ -91,16 +91,16 @@ uint64_t llmodel_get_state_size(llmodel_model model)
return wrapper->llModel->stateSize();
}

uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size)
{
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->saveState(dest);
return wrapper->llModel->saveState({dest, size_t(size)});
}

uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size)
{
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->restoreState(src);
return wrapper->llModel->restoreState({src, size_t(size)});
}

void llmodel_prompt(llmodel_model model, const char *prompt,
4 changes: 3 additions & 1 deletion gpt4all-backend/src/llmodel_shared.cpp
@@ -244,14 +244,16 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
return;
}

initSampler(promptCtx);

std::string cachedResponse;
std::vector<Token> cachedTokens;
int n_predicted = 0;

// Predict next tokens
for (bool stop = false; !stop;) {
// Sample next token
std::optional<Token> new_tok = sampleToken(promptCtx);
std::optional<Token> new_tok = sampleToken();
std::string new_piece = tokenToString(new_tok.value());
cachedTokens.push_back(new_tok.value());
cachedResponse += new_piece;