From d19754553029296c46d40b2f7bd4e2c2f531a8c5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 19 Jul 2024 16:50:47 +0300 Subject: [PATCH] llama : bump max layers from 256 to 512 (#8530) * llama : bump max layers from 256 to 512 * llama : replace asserts with exceptions --- include/llama.h | 2 +- src/llama.cpp | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/llama.h b/include/llama.h index c0fb53060eae4..b280df3255b6a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -40,7 +40,7 @@ #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 6 +#define LLAMA_SESSION_VERSION 7 #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ #define LLAMA_STATE_SEQ_VERSION 1 diff --git a/src/llama.cpp b/src/llama.cpp index 228e112acfe94..7d68ed8111873 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -114,7 +114,7 @@ // bump if necessary #define LLAMA_MAX_NODES 8192 -#define LLAMA_MAX_LAYERS 256 +#define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2 // @@ -4007,7 +4007,9 @@ struct llama_model_loader { throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); } - GGML_ASSERT(arr_info.length <= N_MAX); + if (arr_info.length > N_MAX) { + throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX)); + } std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); @@ -4043,8 +4045,6 @@ struct llama_model_loader { // get array of n <= N_MAX elements, or a single element repeated n times template bool get_key_or_arr(const std::string & key, std::array & result, uint32_t n, const bool required = true) { - GGML_ASSERT(n <= N_MAX); - const int kid = gguf_find_key(meta, key.c_str()); if (kid < 0) { @@ -4054,6 +4054,10 @@ struct llama_model_loader { return false; } + if (n > N_MAX) { + throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); + } + if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) { struct GGUFMeta::ArrayInfo arr_info = GGUFMeta::GKV::get_kv(meta, kid); @@ -19920,7 +19924,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) { ); // on session change it is very likely that the state size has changed - so we need to update this function - static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?"); + static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?"); return s_total; }