feat: sync llama.cpp (#31)
* feat: sync llama.cpp

* feat: use -O3

* feat: sync llama.cpp

* feat: sync llama.cpp

* feat: sync llama.cpp
jhen0409 authored Nov 10, 2023
1 parent 3c6c300 commit 1f20cef
Showing 25 changed files with 7,198 additions and 7,578 deletions.
4 changes: 2 additions & 2 deletions android/src/main/CMakeLists.txt
@@ -12,7 +12,7 @@ set(
${RNLLAMA_LIB_DIR}/ggml-alloc.c
${RNLLAMA_LIB_DIR}/ggml-backend.c
${RNLLAMA_LIB_DIR}/ggml.c
${RNLLAMA_LIB_DIR}/k_quants.c
${RNLLAMA_LIB_DIR}/ggml-quants.c
${RNLLAMA_LIB_DIR}/common.cpp
${RNLLAMA_LIB_DIR}/grammar-parser.cpp
${RNLLAMA_LIB_DIR}/sampling.cpp
@@ -32,7 +32,7 @@ function(build_library target_name)

target_link_libraries(${target_name} ${LOG_LIB} android)

target_compile_options(${target_name} PRIVATE -DLM_GGML_USE_K_QUANTS -pthread)
target_compile_options(${target_name} PRIVATE -pthread)

if (${target_name} STREQUAL "rnllama_v8fp16_va")
target_compile_options(${target_name} PRIVATE -march=armv8.4-a+fp16+dotprod)
9 changes: 0 additions & 9 deletions cpp/build-info.h

This file was deleted.

163 changes: 130 additions & 33 deletions cpp/common.cpp

Large diffs are not rendered by default.

61 changes: 40 additions & 21 deletions cpp/common.h
@@ -9,6 +9,7 @@
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"

#include <cmath>
#include <string>
#include <vector>
#include <random>
@@ -25,35 +26,51 @@
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT;
extern char const *LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET;

//
// CLI argument parsing
//
int32_t get_num_physical_cores();

struct gpt_params {
uint32_t seed = -1; // RNG seed
uint32_t seed = -1; // RNG seed

int32_t n_threads = get_num_physical_cores();
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_accept = 0.5f; // speculative decoding accept probability
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
// pinging @cebtenzzre

// // sampling parameters
struct llama_sampling_params sparams;
@@ -77,7 +94,7 @@ struct gpt_params {
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
@@ -110,6 +127,8 @@ struct gpt_params {
std::string image = ""; // path to an image file
};

bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
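For orientation, a minimal usage sketch (not part of this commit) of the gpt_params fields added in this sync and the renamed build-info symbols. The main() wrapper and the explicit assignments are illustrative; the field names, defaults, gpt_params_parse() and print_build_info() come from the diff above.

#include "common.h"

int main(int argc, char ** argv) {
    gpt_params params;

    // YaRN rope-scaling knobs introduced by this sync (values are the defaults from the diff).
    params.rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
    params.yarn_ext_factor   = -1.0f;
    params.yarn_attn_factor  = 1.0f;
    params.yarn_beta_fast    = 32.0f;
    params.yarn_beta_slow    = 1.0f;
    params.yarn_orig_ctx     = 0;

    // Speculative-decoding probabilities, also new in this sync.
    params.p_accept = 0.5f;
    params.p_split  = 0.1f;

    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    // Expands to fprintf calls over LLAMA_BUILD_NUMBER, LLAMA_COMMIT,
    // LLAMA_COMPILER and LLAMA_BUILD_TARGET, per the updated macro.
    print_build_info();
    return 0;
}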
21 changes: 12 additions & 9 deletions cpp/ggml-alloc.c
@@ -378,9 +378,13 @@ static bool lm_ggml_op_can_inplace(enum lm_ggml_op op) {
}
}

static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view) {
static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view, bool update_backend) {
assert(view->view_src != NULL && view->view_src->data != NULL);
view->backend = view->view_src->backend;

if (update_backend) {
view->backend = view->view_src->backend;
}

view->buffer = view->view_src->buffer;
view->data = (char *)view->view_src->data + view->view_offs;

@@ -394,7 +398,7 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor *
struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) {
if (lm_ggml_is_view(node)) {
init_view(alloc, node);
init_view(alloc, node, true);
} else {
// see if we can reuse a parent's buffer (inplace)
if (lm_ggml_op_can_inplace(node->op)) {
@@ -424,15 +428,14 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor *
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->view_src = view_src;
view_src_hn->n_views += 1;
init_view(alloc, node);
init_view(alloc, node, false);
return;
}
}
else {
} else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->view_src = parent;
p_hn->n_views += 1;
init_view(alloc, node);
init_view(alloc, node, false);
return;
}
}
@@ -463,7 +466,7 @@ size_t lm_ggml_allocr_alloc_graph_n(
hash_get(ht, view_src)->n_views += 1;
if (node->buffer == NULL && node->data != NULL) {
// view of a pre-allocated tensor, didn't call init_view() yet
init_view(alloc, node);
init_view(alloc, node, true);
}
}

Expand All @@ -474,7 +477,7 @@ size_t lm_ggml_allocr_alloc_graph_n(
}
hash_get(ht, parent)->n_children += 1;
if (lm_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
init_view(alloc, parent);
init_view(alloc, parent, true);
}
}
}
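The ggml-alloc.c change above threads an update_backend flag through init_view(), so a node that is rewritten in place to reuse a parent's buffer keeps the backend it was already assigned. A self-contained sketch of that behaviour, using stand-in types (toy_tensor is hypothetical; the real code operates on lm_ggml_tensor inside the allocator):

#include <cassert>
#include <cstddef>
#include <cstdio>

// Simplified model of the init_view() change; not the real ggml structs.
struct toy_tensor {
    int         backend   = 0;        // which backend owns this tensor's data
    void *      data      = nullptr;
    size_t      view_offs = 0;
    toy_tensor *view_src  = nullptr;
};

// Mirrors the new signature: the view inherits its source's backend only
// when update_backend is true; in-place reuse passes false so a backend
// chosen earlier by the caller is preserved.
static void init_view(toy_tensor *view, bool update_backend) {
    assert(view->view_src != nullptr && view->view_src->data != nullptr);
    if (update_backend) {
        view->backend = view->view_src->backend;
    }
    view->data = (char *)view->view_src->data + view->view_offs;
}

int main() {
    char buf[64] = {};
    toy_tensor src;
    src.backend = 1;
    src.data    = buf;

    toy_tensor fresh;
    fresh.view_src = &src;
    init_view(&fresh, /*update_backend=*/true);    // inherits backend 1

    toy_tensor reused;
    reused.view_src = &src;
    reused.backend  = 2;
    init_view(&reused, /*update_backend=*/false);  // keeps backend 2

    printf("fresh=%d reused=%d\n", fresh.backend, reused.backend);
    return 0;
}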