feat: sync llama.cpp (#54)
jhen0409 authored May 4, 2024
1 parent e6039bd commit 0025eec
Showing 23 changed files with 2,608 additions and 982 deletions.
377 changes: 208 additions & 169 deletions cpp/common.cpp

Large diffs are not rendered by default.

17 changes: 13 additions & 4 deletions cpp/common.h
@@ -31,6 +31,8 @@
         fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
     } while(0)
 
+#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const *LLAMA_COMMIT;
@@ -103,7 +105,7 @@ struct gpt_params {
     // sampling parameters
     struct llama_sampling_params sparams;
 
-    std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model = ""; // model path
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
@@ -144,7 +146,7 @@ struct gpt_params {
     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
 
-    bool kl_divergence = false; // compute KL-divergence
+    bool kl_divergence = false; // compute KL divergence
 
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
@@ -159,6 +161,7 @@ struct gpt_params {
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
+    bool flash_attn = false; // flash attention
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
@@ -172,15 +175,20 @@ struct gpt_params {
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
+    bool check_tensors = false; // validate tensor data
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
-    std::string image = ""; // path to an image file
+    std::string mmproj = "";        // path to multimodal projector
+    std::vector<std::string> image; // path to image file(s)
 };
 
+void gpt_params_handle_model_default(gpt_params & params);
+
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -204,6 +212,7 @@ bool validate_file_name(const std::string & filename);
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
+std::string string_strip(const std::string & str);
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
 
 //
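The model path default is no longer baked into the struct: model now starts empty, the old path moves into the DEFAULT_MODEL_PATH macro, and the new gpt_params_handle_model_default() hook applies a default after parsing. Its implementation is in cpp/common.cpp, whose diff is not rendered above; what follows is only a hedged sketch of what such a hook plausibly does (the URL-basename behavior is an assumption, and it reuses the gpt_params definition from this header):

// Hypothetical sketch; the real body lives in cpp/common.cpp.
void gpt_params_handle_model_default(gpt_params & params) {
    if (params.model.empty()) {
        if (!params.model_url.empty()) {
            // Assumption: when only a model URL is given, store the file
            // under the URL's basename instead of the generic default.
            params.model = params.model_url.substr(params.model_url.find_last_of('/') + 1);
        } else {
            params.model = DEFAULT_MODEL_PATH; // old hard-coded default, now a macro
        }
    }
}

Deferring the default this way lets a model URL (or a similar source) take effect without the hard-coded path silently winning.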
12 changes: 7 additions & 5 deletions cpp/ggml-backend.c
@@ -1784,12 +1784,14 @@ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {
 
 void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
     // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
-    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+    if (!sched->is_reset) {
+        size_t hash_size = sched->hash_set.size;
+        memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+        memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
-    sched->is_reset = true;
+        sched->is_reset = true;
+    }
     sched->is_alloc = false;
 }
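The reset path is now guarded by is_reset, so back-to-back calls to lm_ggml_backend_sched_reset() skip the O(hash_size) memsets when the tables are already clean; is_alloc is still cleared unconditionally because it tracks separate state. A standalone sketch of the same dirty-flag pattern (illustrative only, with a hypothetical scratch_table type, not the library's code):

#include <cstring>

struct scratch_table {
    int  ids[1024];
    bool is_reset = false;

    void reset() {
        if (!is_reset) {
            std::memset(ids, -1, sizeof(ids)); // expensive clear, skipped when already clean
            is_reset = true;
        }
    }
    void mark_used() { is_reset = false; } // every mutation must invalidate the clean state
};

The cost of the pattern is discipline: any code path that dirties the tables has to clear the flag, or a later reset becomes a silent no-op.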
8 changes: 7 additions & 1 deletion cpp/ggml-impl.h
@@ -11,6 +11,12 @@
 #include <string.h> // memcpy
 #include <math.h>   // fabsf
 
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -307,7 +313,7 @@ inline static int32x4_t lm_ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t
 
 #endif // defined(__ARM_NEON)
 
-#if defined(__ARM_NEON) && !defined(__MSC_VER)
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
 
 #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
 #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
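Two notes on this file. The second hunk fixes a typo: MSVC defines _MSC_VER, not __MSC_VER, so the misspelled guard could never exclude MSVC and the NEON FP16 conversion macros were enabled there too. The first hunk centralizes MIN/MAX as function-like macros (the #undefs avoid redefinition clashes with headers that define their own); the classic caveat is that each argument is evaluated twice, so side-effecting arguments misbehave. A small self-contained illustration of that pitfall:

#include <cstdio>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main() {
    printf("%d %d\n", MIN(3, 7), MAX(3, 7)); // prints: 3 7

    // MAX(i++, 0) expands to ((i++) > (0) ? (i++) : (0)),
    // so i++ runs twice when the comparison is true.
    int i = 3;
    int m = MAX(i++, 0);
    printf("m=%d i=%d\n", m, i); // prints: m=4 i=5 (a function would give m=3 i=4)
    return 0;
}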