diff --git a/common/common.cpp b/common/common.cpp
index 77992aec9b135c..7913b9ed6f90e5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -264,6 +264,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         params.kv_overrides.back().key[0] = 0;
     }
 
+    if (params.sparams.seed == LLAMA_DEFAULT_SEED) {
+        params.sparams.seed = time(NULL);
+    }
+
     return true;
 }
 
@@ -294,8 +298,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "-s" || arg == "--seed") {
         CHECK_ARG
-        // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
-        params.seed = std::stoul(argv[i]);
         sparams.seed = std::stoul(argv[i]);
         return true;
     }
@@ -1414,7 +1416,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
     options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
-    options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
     options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
     options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
     options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
@@ -1465,6 +1466,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_param
                                        " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
 
     options.push_back({ "sampling" });
+    options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", sparams.seed });
     options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
                                        "(default: %s)", sampler_type_names.c_str() });
     options.push_back({ "*", " --sampling-seq SEQUENCE",
@@ -2237,7 +2239,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_ubatch = params.n_ubatch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    cparams.seed = params.seed;
     cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
@@ -3247,7 +3248,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
     fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
diff --git a/common/common.h b/common/common.h
index bbc33a499afcd2..f9b08b18f069fb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -68,8 +68,6 @@ enum dimre_method {
 };
 
 struct gpt_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
-
     int32_t n_threads = cpu_get_num_math();
     int32_t n_threads_draft = -1;
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
diff --git a/common/sampling.cpp b/common/sampling.cpp
index f4f659b81f944d..575baf747952e9 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -3,19 +3,10 @@
 #include 
 
 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, const struct llama_model * model) {
-    auto result = llama_sampling_init(params, llama_sampling_init(model, params.grammar.c_str(), "root"));
-
-    result->owned = true;
-
-    return result;
-}
-
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_sampling * smpl) {
     struct llama_sampling_context * result = new llama_sampling_context();
 
     result->params = params;
-    result->owned = false;
-    result->smpl = smpl;
+    result->smpl = llama_sampling_init(model, params.grammar.c_str(), "root");
 
     result->prev.resize(params.n_prev);
 
@@ -27,9 +18,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 }
 
 void llama_sampling_free(struct llama_sampling_context * ctx) {
-    if (ctx->owned) {
-        llama_sampling_free(ctx->smpl);
-    }
+    llama_sampling_free(ctx->smpl);
 
     delete ctx;
 }
diff --git a/common/sampling.h b/common/sampling.h
index 6723895964b70f..244db47ba91bd2 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -71,8 +71,6 @@ struct llama_sampling_context {
     // mirostat sampler state
     float mirostat_mu;
 
-    bool owned;
-
     llama_sampling * smpl;
 
     // TODO: replace with ring-buffer
@@ -86,7 +84,6 @@ struct llama_sampling_context {
 
 // Create a new sampling context instance.
 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, const struct llama_model * model);
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_sampling * smpl);
 
 void llama_sampling_free(struct llama_sampling_context * ctx);
 
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index 2bc5fce7dfb6ee..72b4b43f184184 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -27,7 +27,6 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo
     print("Failed to load model")
     exit(1)
 }
-
 defer {
     llama_free_model(model)
 }
@@ -37,24 +36,29 @@ var tokens = tokenize(text: prompt, add_bos: true)
 
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
 
 var context_params = llama_context_default_params()
-context_params.seed = 1234
 context_params.n_ctx = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8
 
 let context = llama_new_context_with_model(model, context_params)
-let smpl = llama_get_sampling(context)
-
 guard context != nil else {
     print("Failed to initialize context")
     exit(1)
 }
-
 defer {
     llama_free(context)
 }
 
+let smpl = llama_sampling_init(model, nil, nil)
+guard smpl != nil else {
+    print("Failed to initialize sampling")
+    exit(1)
+}
+defer {
+    llama_sampling_free(smpl)
+}
+
 let n_ctx = llama_n_ctx(context)
 
 print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 142c825fcd9252..02d98a3a3fc839 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
     ctx_params.n_batch = std::max(n_predict, n_parallel);
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-    llama_sampling * smpl = llama_get_sampling(ctx);
+    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);
 
     if (ctx == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index f4dc6d4c0d873d..5c6710a3cfba23 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -68,13 +68,7 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
 
     llama_backend_init();
     llama_numa_init(params.numa);
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 6de7c3dd787207..ff60bb0ae567a2 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -151,8 +151,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    std::mt19937 rng(params.seed);
-
     llama_backend_init();
     llama_numa_init(params.numa);
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 98acb29d25d412..e89819d2b8788c 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -92,11 +92,10 @@ static std::vector> encode(llama_context * ctx, const std::ve
     return result;
 }
 
-static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
+static std::string generate(llama_context * ctx, llama_sampling * smpl, const std::string & prompt, bool stream) {
     std::string result;
 
     const llama_model * model = llama_get_model(ctx);
-    llama_sampling * smpl = llama_get_sampling(ctx);
     llama_token eos_token = llama_token_eos(model);
 
     llama_kv_cache_clear(ctx);
@@ -117,7 +116,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
         inputs.clear();
 
         llama_decode(ctx, bat);
-        auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
+        auto * logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
 
         auto candidates = std::vector<llama_token_data>(llama_n_vocab(model));
         auto n_candidates = (int32_t)candidates.size();
@@ -173,6 +172,8 @@ int main(int argc, char * argv[]) {
     // create generation context
     llama_context * ctx = llama_new_context_with_model(model, cparams);
 
+    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);
+
     // ### Embedding/Representation ###
     // samples taken from: https://github.com/ContextualAI/gritlm#basic
     {
@@ -209,9 +210,10 @@ int main(int argc, char * argv[]) {
     // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
     {
         const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
-        std::string response = generate(ctx, prompt, true);
+        std::string response = generate(ctx, smpl, prompt, true);
     }
 
+    llama_sampling_free(smpl);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 13b3c362214c09..555f595567eb7d 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -156,16 +156,9 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
-    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+    print_build_info();
 
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LOG_TEE("%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
 
     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
@@ -351,7 +344,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd;
 
-    ctx_sampling = llama_sampling_init(sparams, llama_get_sampling(ctx));
+    ctx_sampling = llama_sampling_init(sparams, model);
 
     while (n_remain != 0 || params.interactive) {
         // predict
diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
index af3b356bc04b7f..a20034e0cb0418 100644
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -120,7 +120,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
     LOGi("Using %d threads", n_threads);
 
     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.seed = 1234;
     ctx_params.n_ctx = 2048;
     ctx_params.n_threads = n_threads;
     ctx_params.n_threads_batch = n_threads;
@@ -380,12 +379,13 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         JNIEnv * env,
         jobject,
         jlong context_pointer,
+        jlong sampling_pointer,
         jlong batch_pointer,
         jint n_len,
         jobject intvar_ncur
 ) {
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto sampling = reinterpret_cast<llama_sampling *>(llama_get_sampling(context));
+    const auto sampling = reinterpret_cast<llama_sampling *>(sampling_pointer);
     const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
     const auto model = llama_get_model(context);
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index bfd273072e627b..5b63f5ac4da434 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -43,14 +43,14 @@ actor LlamaContext {
         self.tokens_list = []
         self.batch = llama_batch_init(512, 0, 1)
         self.temporary_invalid_cchars = []
-        self.sampling = llama_get_sampling(context)
+        self.sampling = llama_sampling_init(context, nil, nil);
     }
 
     deinit {
+        llama_sampling_free(sampling)
         llama_batch_free(batch)
         llama_free(context)
         llama_free_model(model)
-        llama_sampling_free(sampling)
         llama_backend_free()
     }
 
@@ -72,7 +72,6 @@ actor LlamaContext {
         print("Using \(n_threads) threads")
 
         var ctx_params = llama_context_default_params()
-        ctx_params.seed = 1234
         ctx_params.n_ctx = 2048
         ctx_params.n_threads = UInt32(n_threads)
         ctx_params.n_threads_batch = UInt32(n_threads)
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 55566d022735fb..f8a762dcaf8311 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
 
     LOG_TEE("\n");
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams, llama_get_sampling(ctx_llava->ctx_llama));
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams, ctx_llava->model);
     if (!ctx_sampling) {
         fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index f951b57b291587..f8915eab9d9888 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -161,7 +161,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
                            struct llama_context * ctx_llama,
                            int * n_past) {
     const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
-    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
+    llama_sampling_accept(ctx_sampling, id, true);
     static std::string ret;
     if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
         ret = "";
@@ -218,7 +218,7 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
 
     LOG_TEE("\n");
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams, ctx_llava->model);
     return ctx_sampling;
 }
 
@@ -299,7 +299,7 @@ int main(int argc, char ** argv) {
         }
     }
     printf("\n");
-    llama_print_timings(ctx_llava->ctx_llama);
+    llama_print_timings(ctx_llava->ctx_llama, nullptr);
 
     ctx_llava->model = NULL;
     llava_free(ctx_llava);
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 5617d45c45c77a..76ac6bb2bf2069 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -1,7 +1,6 @@
 #include "common.h"
 #include "llama.h"
 
-#include 
 #include 
 #include 
 #include 
@@ -118,7 +117,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
 
     // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, llama_get_sampling(ctx));
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, model);
 
     // verification n-grams
     std::vector ngrams_cur(G);
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 6753bedee8d844..8923f7c0a6ca9e 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -3,13 +3,11 @@
 #include "common.h"
 #include "ngram-cache.h"
 
-#include 
 #include 
 #include 
 #include 
 #include 
 #include 
-#include 
 
 int main(int argc, char ** argv){
     gpt_params params;
@@ -106,7 +104,7 @@ int main(int argc, char ** argv){
 
     bool has_eos = false;
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, llama_get_sampling(ctx));
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, model);
 
     std::vector<llama_token> draft;
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index b2ac2e54eaad40..d9037fb615299e 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -183,16 +183,9 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
-    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+    print_build_info();
 
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LOG_TEE("%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
 
     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
@@ -535,7 +528,7 @@ int main(int argc, char ** argv) {
         antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
     }
 
-    ctx_sampling = llama_sampling_init(sparams, llama_get_sampling(ctx));
+    ctx_sampling = llama_sampling_init(sparams, model);
     if (!ctx_sampling) {
         fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index de5dd625eee410..e9e4de260256ae 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -26,8 +26,6 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
-
     int n_junk = params.n_junk;
     int n_keep = params.n_keep;
     int n_grp = params.grp_attn_n;
@@ -85,7 +83,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_sampling * smpl = llama_get_sampling(ctx);
+    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);
 
     // tokenize the prompt
     std::vector<llama_token> tokens_list;
@@ -274,6 +272,7 @@ int main(int argc, char ** argv) {
 
     llama_batch_free(batch);
 
+    llama_sampling_free(smpl);
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index fbf3872a01e437..185379ccdf261b 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -2007,13 +2007,7 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
 
     llama_backend_init();
     llama_numa_init(params.numa);
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 25c2de60cbce7a..498cbbe3ce1cda 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -319,8 +319,7 @@ int main(int argc, char ** argv) {
         }
 
         auto cparams = llama_context_default_params();
-        cparams.n_ctx = 256;
-        cparams.seed = 1;
+        cparams.n_ctx = 256;
 
         ctx = llama_new_context_with_model(model, cparams);
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 937cd6389f8359..a62c0f294be48f 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -3,7 +3,6 @@
 
 #include 
 #include 
-#include 
 
 int main(int argc, char ** argv) {
     gpt_params params;
@@ -38,7 +37,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_sampling * smpl = llama_get_sampling(ctx);
+    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);
 
     // tokenize prompt
     auto tokens = llama_tokenize(ctx, params.prompt, true);
@@ -98,7 +97,7 @@ int main(int argc, char ** argv) {
 
     // make new context
     auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
-    llama_sampling * smpl2 = llama_get_sampling(ctx2);
+    llama_sampling * smpl2 = llama_sampling_init(model, nullptr, nullptr);
 
     printf("\nsecond run: %s", params.prompt.c_str());
@@ -163,7 +162,7 @@ int main(int argc, char ** argv) {
 
     // make new context
     auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
-    llama_sampling * smpl3 = llama_get_sampling(ctx3);
+    llama_sampling * smpl3 = llama_sampling_init(model, nullptr, nullptr);
 
     printf("\nsingle seq run: %s", params.prompt.c_str());
@@ -246,6 +245,10 @@ int main(int argc, char ** argv) {
 
     printf("\n");
 
+    llama_sampling_free(smpl);
+    llama_sampling_free(smpl2);
+    llama_sampling_free(smpl3);
+
     llama_free(ctx3);
     llama_free_model(model);
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 6c8aaacca468b9..cf5e74002fa747 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -55,7 +55,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_sampling * smpl = llama_get_sampling(ctx);
+    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);
 
     // tokenize the prompt
 
@@ -168,6 +168,7 @@ int main(int argc, char ** argv) {
 
     llama_batch_free(batch);
 
+    llama_sampling_free(smpl);
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 261fcd6afbeaed..f9d38e5ec62a0f 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -43,10 +43,7 @@ int main(int argc, char ** argv) {
     // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
     const float p_split = params.p_split;
 
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-    std::default_random_engine rng(params.seed);
+    std::default_random_engine rng(params.sparams.seed);
     std::uniform_real_distribution<> u_dist;
 
 #ifndef LOG_DISABLE_LOGS
@@ -179,7 +176,7 @@ int main(int argc, char ** argv) {
     bool has_eos = false;
 
     // target model sampling context (reuse the llama_context's sampling instance)
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, llama_get_sampling(ctx_tgt));
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, model_tgt);
 
     // draft sequence data
     std::vector drafts(n_seq_dft);
diff --git a/include/llama.h b/include/llama.h
index f0974b536547b6..546f62c4cc03ee 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -300,7 +300,6 @@ extern "C" {
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     // https://github.com/ggerganov/llama.cpp/pull/7544
     struct llama_context_params {
-        uint32_t seed;    // RNG seed, -1 for random
         uint32_t n_ctx;   // text context, 0 = from model
         uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
         uint32_t n_ubatch; // physical maximum batch size
@@ -407,6 +406,7 @@ extern "C" {
 
     LLAMA_API void llama_free_model(struct llama_model * model);
 
+    // TODO: rename to llama_init_from_model
     LLAMA_API struct llama_context * llama_new_context_with_model(
                      struct llama_model * model,
                      struct llama_context_params params);
@@ -432,8 +432,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
     LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
 
-    LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
-    LLAMA_API struct llama_sampling * llama_get_sampling( struct llama_context * ctx);
+    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
     LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
@@ -660,7 +659,7 @@ extern "C" {
     //
 
     // Returns the *actual* size in bytes of the state
-    // (rng, logits, embedding and kv_cache)
+    // (logits, embedding and kv_cache)
     // Only use when saving the state, not when restoring it, otherwise the size may be too small.
     LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
     LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
diff --git a/src/llama.cpp b/src/llama.cpp
index 241689e6a44fcc..e44bf07a46b5d9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2653,7 +2653,6 @@ struct llama_model {
 struct llama_context {
     llama_context(const llama_model & model)
         : model(model)
-        , sampling(model.vocab, nullptr, nullptr) // by default, no grammar
         , t_start_us(model.t_start_us)
         , t_load_us(model.t_load_us) {}
 
@@ -2670,7 +2669,6 @@ struct llama_context {
     const struct llama_model & model;
 
     struct llama_cparams cparams;
-    struct llama_sampling sampling;
     struct llama_kv_cache kv_self;
     struct llama_control_vector cvec;
 
@@ -16285,7 +16283,6 @@ struct llama_model_params llama_model_default_params() {
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 2048,
         /*.n_ubatch =*/ 512,
@@ -16564,10 +16561,6 @@ struct llama_context * llama_new_context_with_model(
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }
 
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
     LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
@@ -16578,8 +16571,6 @@ struct llama_context * llama_new_context_with_model(
     ctx->abort_callback = params.abort_callback;
     ctx->abort_callback_data = params.abort_callback_data;
 
-    llama_sampling_set_rng_seed_impl(ctx->sampling, params.seed);
-
     ctx->logits_all = params.logits_all;
 
     uint32_t kv_size = cparams.n_ctx;
@@ -16896,10 +16887,6 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }
 
-struct llama_sampling * llama_get_sampling(struct llama_context * ctx) {
-    return &ctx->sampling;
-}
-
 enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
     return ctx->cparams.pooling_type;
 }
@@ -17363,14 +17350,14 @@ struct llama_data_write {
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
 
-    void write_rng(const std::mt19937 & rng) {
-        std::ostringstream rng_ss;
-        rng_ss << rng;
+    //void write_rng(const std::mt19937 & rng) {
+    //    std::ostringstream rng_ss;
+    //    rng_ss << rng;
 
-        const std::string & rng_str = rng_ss.str();
+    //    const std::string & rng_str = rng_ss.str();
 
-        write_string(rng_str);
-    }
+    //    write_string(rng_str);
+    //}
 
     void write_output_ids(const struct llama_context * ctx) {
         const uint32_t n_outputs = ctx->n_outputs;
@@ -17588,17 +17575,17 @@ struct llama_data_read {
         // TODO: add more info which needs to be identical but which is not verified otherwise
     }
 
-    void read_rng(std::mt19937 & rng) {
-        std::string rng_str;
-        read_string(rng_str);
+    //void read_rng(std::mt19937 & rng) {
+    //    std::string rng_str;
+    //    read_string(rng_str);
 
-        std::istringstream rng_ss(rng_str);
-        rng_ss >> rng;
+    //    std::istringstream rng_ss(rng_str);
+    //    rng_ss >> rng;
 
-        if (rng_ss.fail()) {
-            throw std::runtime_error("failed to load RNG state");
-        }
-    }
+    //    if (rng_ss.fail()) {
+    //        throw std::runtime_error("failed to load RNG state");
+    //    }
+    //}
 
     void read_output_ids(struct llama_context * ctx) {
         std::vector output_pos;
@@ -18012,8 +17999,6 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da
 
     data_ctx.write_model_info(ctx);
 
-    data_ctx.write_rng(ctx->sampling.rng);
-
     // copy outputs
     data_ctx.write_output_ids(ctx);
     data_ctx.write_logits(ctx);
@@ -18051,9 +18036,6 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da
 
     data_ctx.read_model_info(ctx);
 
-    // set rng
-    data_ctx.read_rng(ctx->sampling.rng);
-
     // set outputs
     data_ctx.read_output_ids(ctx);
     data_ctx.read_logits(ctx);
@@ -19092,12 +19074,12 @@ void llama_print_timings(struct llama_context * ctx, struct llama_sampling * smp
         /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
         /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
         /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sampling_ms =*/ 1e-3 * (smpl ? smpl->t_total_us : ctx->sampling.t_total_us),
+        /*.t_sampling_ms =*/ 1e-3 * (smpl ? smpl->t_total_us : 0.0),
        /*.t_grammar_ms =*/ 1e-3 * (smpl && smpl->grammar ? smpl->grammar->t_total_us : 0.0),
        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
        /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
-        /*.n_sampling =*/ std::max(0, smpl ? smpl->n_sample : ctx->sampling.n_sample),
+        /*.n_sampling =*/ std::max(0, smpl ? smpl->n_sample : 0),
        /*.n_grammar_sample =*/ std::max(0, smpl && smpl->grammar ? smpl->grammar->n_sample : 0),
        /*.n_grammar_accept =*/ std::max(0, smpl && smpl->grammar ? smpl->grammar->n_accept : 0),
        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
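
Usage note (illustrative sketch, not part of the patch): the C++ below shows how a caller is expected to set up and tear down sampling after this change, based on the call sites updated above. The sampler is now created explicitly with llama_sampling_init() and released with llama_sampling_free() instead of being fetched from the context via the removed llama_get_sampling(), and llama_context_params no longer carries a seed (at the common layer the seed lives in the sampling params, sparams.seed). The model path, context size, and error handling are placeholders; the grammar arguments are passed as nullptr, as in the simple examples touched by this diff.

#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    // placeholder model path; model params are unchanged by this patch
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // note: there is no cparams.seed field anymore
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 2048;
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // the sampler is created from the model and owned by the caller
    // (grammar string and root symbol left as nullptr, as in examples/simple)
    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);

    // ... tokenize, llama_decode() and sample as in examples/simple ...

    // tear down in reverse order of creation
    llama_sampling_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}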