feat: sync llama.cpp (#71)
* feat: sync llama.cpp

* feat: remove lora_base
jhen0409 authored Jul 27, 2024
1 parent 7972f83 commit a579ce3
Showing 36 changed files with 4,448 additions and 3,795 deletions.
3 changes: 3 additions & 0 deletions android/src/main/CMakeLists.txt
@@ -21,6 +21,9 @@ set(
${RNLLAMA_LIB_DIR}/unicode-data.cpp
${RNLLAMA_LIB_DIR}/unicode.cpp
${RNLLAMA_LIB_DIR}/llama.cpp
${RNLLAMA_LIB_DIR}/llama-vocab.cpp
${RNLLAMA_LIB_DIR}/llama-sampling.cpp
${RNLLAMA_LIB_DIR}/llama-grammar.cpp
${RNLLAMA_LIB_DIR}/sgemm.cpp
${RNLLAMA_LIB_DIR}/ggml-aarch64.c
${RNLLAMA_LIB_DIR}/rn-llama.hpp
3 changes: 0 additions & 3 deletions android/src/main/java/com/rnllama/LlamaContext.java
@@ -57,8 +57,6 @@ public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap pa
params.hasKey("lora") ? params.getString("lora") : "",
// float lora_scaled,
params.hasKey("lora_scaled") ? (float) params.getDouble("lora_scaled") : 1.0f,
// String lora_base,
params.hasKey("lora_base") ? params.getString("lora_base") : "",
// float rope_freq_base,
params.hasKey("rope_freq_base") ? (float) params.getDouble("rope_freq_base") : 0.0f,
// float rope_freq_scale
@@ -312,7 +310,6 @@ protected static native long initContext(
boolean use_mmap,
String lora,
float lora_scaled,
String lora_base,
float rope_freq_base,
float rope_freq_scale
);
4 changes: 0 additions & 4 deletions android/src/main/jni.cpp
@@ -131,7 +131,6 @@ Java_com_rnllama_LlamaContext_initContext(
jboolean use_mmap,
jstring lora_str,
jfloat lora_scaled,
jstring lora_base_str,
jfloat rope_freq_base,
jfloat rope_freq_scale
) {
@@ -158,10 +157,8 @@ Java_com_rnllama_LlamaContext_initContext(
defaultParams.use_mmap = use_mmap;

const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
const char *lora_base_chars = env->GetStringUTFChars(lora_base_str, nullptr);
if (lora_chars != nullptr && lora_chars[0] != '\0') {
defaultParams.lora_adapter.push_back({lora_chars, lora_scaled});
defaultParams.lora_base = lora_base_chars;
defaultParams.use_mmap = false;
}

@@ -180,7 +177,6 @@ Java_com_rnllama_LlamaContext_initContext(

env->ReleaseStringUTFChars(model_path_str, model_path_chars);
env->ReleaseStringUTFChars(lora_str, lora_chars);
env->ReleaseStringUTFChars(lora_base_str, lora_base_chars);

return reinterpret_cast<jlong>(llama->ctx);
}
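Taken together, the jni.cpp hunks leave only a path-plus-scale pair for LoRA. A self-contained C++ analog of the surviving logic, with hypothetical names and no JNI plumbing (a sketch, not the library code):

#include <string>
#include <tuple>
#include <vector>

// Stand-in for the subset of gpt_params touched here (assumption: simplified).
struct params_sketch {
    std::vector<std::tuple<std::string, float>> lora_adapter; // path + scale
    bool use_mmap = true;
};

// Mirrors the post-change flow: if an adapter path was supplied, register it
// with its scale and disable mmap; no base-model path is involved anymore.
void apply_lora(params_sketch & p, const std::string & lora_path, float lora_scaled) {
    if (!lora_path.empty()) {
        p.lora_adapter.push_back({lora_path, lora_scaled});
        p.use_mmap = false;
    }
}

int main() {
    params_sketch p;
    apply_lora(p, "adapter.gguf", 1.0f);
    return 0;
}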
21 changes: 11 additions & 10 deletions cpp/common.cpp
@@ -700,11 +700,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
return true;
}
if (arg == "--lora-base") {
CHECK_ARG
params.lora_base = argv[i];
return true;
}
if (arg == "--control-vector") {
CHECK_ARG
params.control_vectors.push_back({ 1.0f, argv[i], });
@@ -1280,6 +1275,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
CHECK_ARG
params.out_file = argv[i];
params.cvector_outfile = argv[i];
params.lora_outfile = argv[i];
return true;
}
if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1589,9 +1585,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
"advanced option to override model metadata by key. may be specified multiple times.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
"note: this argument can be repeated to add multiple control vectors" });
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
@@ -1682,6 +1677,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });

options.push_back({ "export-lora" });
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });

printf("usage: %s [options]\n", argv[0]);

for (const auto & o : options) {
@@ -2727,7 +2729,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
const llama_chat_msg & new_msg,
bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -3172,7 +3174,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
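One behavioral fix in common.cpp is easy to miss: llama_chat_format_single no longer applies the chat template to an empty history. A rough, self-contained illustration of the format-by-difference idea with a toy template function (the real llama_chat_apply_template signature differs; this is only a sketch):

#include <string>
#include <vector>

struct chat_msg { std::string role; std::string content; };

// Toy stand-in for llama_chat_apply_template (assumption: purely illustrative).
static std::string apply_template(const std::vector<chat_msg> & msgs, bool add_assistant) {
    std::string out;
    for (const auto & m : msgs) out += "<|" + m.role + "|>" + m.content + "\n";
    if (add_assistant) out += "<|assistant|>";
    return out;
}

// Format only the newly added message: render the history and the history plus
// the new message, then return the suffix. The empty() guard mirrors this
// commit's change and avoids templating a zero-message history.
static std::string format_single(const std::vector<chat_msg> & past, const chat_msg & new_msg, bool add_ass) {
    const std::string fmt_past = past.empty() ? "" : apply_template(past, false);
    std::vector<chat_msg> all(past);
    all.push_back(new_msg);
    const std::string fmt_all = apply_template(all, add_ass);
    return fmt_all.substr(fmt_past.size());
}

int main() {
    const std::string s = format_single({}, {"user", "hi"}, true);
    return s.empty() ? 1 : 0;
}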
3 changes: 2 additions & 1 deletion cpp/common.h
@@ -139,7 +139,6 @@ struct gpt_params {

// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter

std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -266,6 +265,8 @@ struct gpt_params {
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

bool spm_infill = false; // suffix/prefix/middle pattern for infill

std::string lora_outfile = "ggml-lora-merged-f16.gguf";
};

void gpt_params_handle_hf_token(gpt_params & params);
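common.h gains a lora_outfile field (default "ggml-lora-merged-f16.gguf") and common.cpp now routes -o/--output to it as well. A small sketch of that behavior under assumed, simplified names:

#include <cassert>
#include <string>

// Only the output-related fields from gpt_params are sketched here (assumption).
struct out_params_sketch {
    std::string out_file;
    std::string cvector_outfile;
    std::string lora_outfile = "ggml-lora-merged-f16.gguf"; // new default from this commit
};

// "-o FNAME" now feeds all three consumers, including the export-lora tool.
static void set_output_file(out_params_sketch & p, const std::string & fname) {
    p.out_file        = fname;
    p.cvector_outfile = fname;
    p.lora_outfile    = fname;
}

int main() {
    out_params_sketch p;
    set_output_file(p, "merged.gguf");
    assert(p.lora_outfile == "merged.gguf");
    return 0;
}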
12 changes: 6 additions & 6 deletions cpp/ggml-aarch64.c
@@ -392,7 +392,7 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
LM_GGML_ASSERT(!(lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) &&
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
#elif defined(__ARM_NEON) && defined(__aarch64__)
#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -501,7 +501,7 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -613,7 +613,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(ncols_interleaved);
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE)
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) {
const void * b_ptr = vx;
const void * a_ptr = vy;
@@ -753,7 +753,7 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
LM_GGML_ASSERT(!(lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) &&
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
#elif defined(__ARM_NEON) && defined(__aarch64__)
#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -1271,7 +1271,7 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -1727,7 +1727,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(ncols_interleaved);
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) {
const void * b_ptr = vx;
const void * a_ptr = vy;
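Every ggml-aarch64.c hunk adds the same extra condition, && ! ((defined(_MSC_VER)) && ! defined(__clang__)), to the architecture guards. The likely intent (an inference, not stated in the diff) is to keep MSVC proper out of paths that rely on GCC-style inline assembly while still letting clang-cl, which defines both _MSC_VER and __clang__, take the optimized route. A tiny standalone program showing how the guard splits compilers:

#include <cstdio>

int main(void) {
#if defined(_MSC_VER) && !defined(__clang__)
    // MSVC's own front end: the inline-asm kernels are skipped upstream.
    std::puts("MSVC proper: generic fallback path");
#else
    // GCC, Clang, and clang-cl all end up here and may use the asm kernels.
    std::puts("GCC/Clang (incl. clang-cl): optimized path eligible");
#endif
    return 0;
}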
42 changes: 18 additions & 24 deletions cpp/ggml-alloc.c
@@ -91,8 +91,7 @@ void lm_ggml_tallocr_alloc(struct lm_ggml_tallocr * talloc, struct lm_ggml_tenso
if (talloc->offset + size > lm_ggml_backend_buffer_get_size(talloc->buffer)) {
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, lm_ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
LM_GGML_ASSERT(!"not enough space in the buffer");
return;
LM_GGML_ABORT("not enough space in the buffer");
}

void * addr = (char *)lm_ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct lm_ggml_dyn_tallocr * alloc, size_t offs
return;
}
}
LM_GGML_ASSERT(!"out of allocated_tensors");
LM_GGML_ABORT("out of allocated_tensors");
}
static void remove_allocated_tensor(struct lm_ggml_dyn_tallocr * alloc, size_t offset, const struct lm_ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct lm_ggml_dyn_tallocr * alloc, size_t o
return;
}
}
fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
LM_GGML_ASSERT(!"tensor not found");
LM_GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
}
#endif

@@ -176,8 +174,7 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
// this should never happen
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
LM_GGML_ASSERT(!"not enough space in the buffer");
LM_GGML_UNREACHABLE();
LM_GGML_ABORT("not enough space in the buffer");
}
}

@@ -443,7 +440,7 @@ void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc) {
}
}

free(galloc->hash_set.keys);
lm_ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
free(galloc->buffers);
@@ -456,7 +453,7 @@ void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc) {
typedef struct lm_ggml_gallocr * lm_ggml_gallocr_t;

static struct hash_node * lm_ggml_gallocr_hash_get(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) {
size_t i = lm_ggml_hash_find_or_insert(galloc->hash_set, t);
size_t i = lm_ggml_hash_find_or_insert(&galloc->hash_set, t);
return &galloc->hash_values[i];
}

@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {

static void lm_ggml_gallocr_alloc_graph_impl(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
// clear hash tables
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct lm_ggml_tensor *));
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
lm_ggml_hash_set_reset(&galloc->hash_set);
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

// allocate leafs
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +668,19 @@ static void lm_ggml_gallocr_alloc_graph_impl(lm_ggml_gallocr_t galloc, struct lm
}

bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
size_t hash_size = graph->visited_hash_table.size;
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;

// initialize hash table
if (galloc->hash_set.size < hash_size) {
free(galloc->hash_set.keys);
free(galloc->hash_values);
galloc->hash_set.size = hash_size;
galloc->hash_set.keys = calloc(hash_size, sizeof(struct lm_ggml_tensor *));
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
if (galloc->hash_set.size < min_hash_size) {
lm_ggml_hash_set_free(&galloc->hash_set);
galloc->hash_set = lm_ggml_hash_set_new(min_hash_size);
LM_GGML_ASSERT(galloc->hash_set.keys != NULL);

free(galloc->hash_values);
galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
LM_GGML_ASSERT(galloc->hash_values != NULL);
} else {
// reset hash table
memset(galloc->hash_set.keys, 0, sizeof(struct lm_ggml_tensor *) * galloc->hash_set.size);
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
}

// reset allocators
@@ -817,8 +812,7 @@ static void lm_ggml_gallocr_init_tensor(lm_ggml_gallocr_t galloc, struct lm_ggml
}

static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, struct tensor_alloc * talloc) {
lm_ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
size_t node_size = (node->data || node->view_src) ? 0 : lm_ggml_backend_buft_get_alloc_size(buft, node);
size_t node_size = (node->data || node->view_src) ? 0 : lm_ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
return talloc->size_max >= node_size;
}

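Two things change in ggml-alloc.c: the assert-then-return and LM_GGML_ASSERT(!"...") error paths become LM_GGML_ABORT calls, and the graph allocator now sizes its hash set from the graph's node and leaf counts plus 25% headroom instead of reusing the graph's own hash table size. A standalone sketch of that sizing rule (simplified names, not the library code):

#include <cstdio>

// Mirrors the min_hash_size computation in lm_ggml_gallocr_reserve_n.
static size_t required_hash_size(int n_nodes, int n_leafs) {
    size_t min_hash_size = (size_t) n_nodes + (size_t) n_leafs;
    min_hash_size += min_hash_size / 4; // 25% margin to keep collisions rare
    return min_hash_size;
}

int main(void) {
    // e.g. a graph with 1000 nodes and 24 leafs asks for a 1280-entry hash set
    std::printf("%zu\n", required_hash_size(1000, 24));
    return 0;
}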
(diffs for the remaining changed files were not loaded)
