diff --git a/.github/_typos.toml b/.github/_typos.toml index 87860f0a9..fb576b499 100644 --- a/.github/_typos.toml +++ b/.github/_typos.toml @@ -14,3 +14,7 @@ extend-exclude = [ "LLama.Benchmark/Assets/", "LLama.Examples/Assets/" ] + +[default.extend-words] +# Used in a comment in SafeLLamaSamplerHandle.cs, as a prefix of "hello" +teh = "hel" \ No newline at end of file diff --git a/LLama.Examples/Examples/CustomSampler.cs b/LLama.Examples/Examples/CustomSampler.cs index d2df8db2b..7102e0921 100644 --- a/LLama.Examples/Examples/CustomSampler.cs +++ b/LLama.Examples/Examples/CustomSampler.cs @@ -60,7 +60,6 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl chain.AddCustom(new RemoveMostLikelyToken()); // Select from the distribution - chain.AddSoftmax(); chain.AddDistributionSampler(42); return chain; diff --git a/LLama/Extensions/LLamaExecutorExtensions.cs b/LLama/Extensions/LLamaExecutorExtensions.cs index 7fe6cc871..61476bb7b 100644 --- a/LLama/Extensions/LLamaExecutorExtensions.cs +++ b/LLama/Extensions/LLamaExecutorExtensions.cs @@ -150,7 +150,6 @@ private string CreatePrompt(IList messages) MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep, MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP, Seed = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Seed), out uint seed) is true ? seed : (uint)(t_random ??= new()).Next(), - TailFreeZ = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.TailFreeZ), out float tfz) is true ? tfz : s_defaultPipeline.TailFreeZ, Temperature = options?.Temperature ?? 0, TopP = options?.TopP ?? 0, TopK = options?.TopK ?? s_defaultPipeline.TopK, diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index b7dbf5bcd..8c06598a8 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -56,7 +56,7 @@ - c35e586ea5722184 + 958367bf530d943a90 diff --git a/LLama/LLavaWeights.cs b/LLama/LLavaWeights.cs index cb9692ead..f2f9f6256 100644 --- a/LLama/LLavaWeights.cs +++ b/LLama/LLavaWeights.cs @@ -9,7 +9,8 @@ namespace LLama; /// /// A set of llava model weights (mmproj), loaded into memory. 
/// -public sealed class LLavaWeights : IDisposable +public sealed class LLavaWeights + : IDisposable { /// /// The native handle, which is used in the native APIs diff --git a/LLama/Native/LLamaNativeBatch.cs b/LLama/Native/LLamaNativeBatch.cs index 8e9c40603..41817604a 100644 --- a/LLama/Native/LLamaNativeBatch.cs +++ b/LLama/Native/LLamaNativeBatch.cs @@ -25,6 +25,7 @@ public unsafe struct LLamaNativeBatch /// /// the positions of the respective token in the sequence + /// (if set to NULL, the token position will be tracked automatically by llama_decode) /// public LLamaPos* pos; @@ -35,18 +36,13 @@ public unsafe struct LLamaNativeBatch /// /// the sequence to which the respective token belongs + /// (if set to NULL, the sequence ID will be assumed to be 0) /// public LLamaSeqId** seq_id; /// /// if zero, the logits for the respective token will not be output + /// (if set to NULL, only the logits for last token will be returned) /// public byte* logits; - - // Note from llama.cpp: - // > helpers for smooth API transition - can be deprecated in the future - // > for future-proof code, use the above fields instead and ignore everything below - private LLamaPos _all_pos_0; - private LLamaPos _all_pos_1; - private LLamaSeqId _all_seq_id; } \ No newline at end of file diff --git a/LLama/Native/LLamaPoolingType.cs b/LLama/Native/LLamaPoolingType.cs index ab0b75457..3ee767b51 100644 --- a/LLama/Native/LLamaPoolingType.cs +++ b/LLama/Native/LLamaPoolingType.cs @@ -29,4 +29,9 @@ public enum LLamaPoolingType CLS = 2, Last = 3, + + /// + /// Used by reranking models to attach the classification head to the graph + /// + Rank, } \ No newline at end of file diff --git a/LLama/Native/LLamaVocabPreType.cs b/LLama/Native/LLamaVocabPreType.cs index 35ed39c06..df41168b1 100644 --- a/LLama/Native/LLamaVocabPreType.cs +++ b/LLama/Native/LLamaVocabPreType.cs @@ -33,4 +33,5 @@ internal enum LLamaVocabPreType BLOOM = 23, GPT3_FINNISH = 24, EXAONE = 25, + CHAMELEON = 26, } \ No newline at end of file diff --git a/LLama/Native/NativeApi.Sampling.cs b/LLama/Native/NativeApi.Sampling.cs deleted file mode 100644 index 4b73d2e0b..000000000 --- a/LLama/Native/NativeApi.Sampling.cs +++ /dev/null @@ -1,186 +0,0 @@ -using System; - -namespace LLama.Native -{ - public static partial class NativeApi - { - /// - /// Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - /// Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - /// Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - /// Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - /// Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern unsafe void llama_sample_repetition_penalties(SafeLLamaContextHandle ctx, - ref LLamaTokenDataArrayNative candidates, - LLamaToken* last_tokens, ulong last_tokens_size, - float penalty_repeat, - float penalty_freq, - float penalty_present); - - /// - /// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 - /// - /// - /// Logits extracted from the original generation context. - /// Logits extracted from a separate context from the same model. - /// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. - /// Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. - public static void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, Span logits, ReadOnlySpan logits_guidance, float scale) - { - if (logits == null) - throw new ArgumentNullException(nameof(logits)); - if (logits_guidance == null) - throw new ArgumentNullException(nameof(logits_guidance)); - if (logits.Length != ctx.VocabCount) - throw new ArgumentException("Logits count must have equal context vocab size", nameof(logits)); - if (logits_guidance.Length != ctx.VocabCount) - throw new ArgumentException("Guidance logits count must have equal context vocab size", nameof(logits_guidance)); - - unsafe - { - fixed (float* logitsPtr = logits) - fixed (float* logitsGuidancePtr = logits_guidance) - llama_sample_apply_guidance(ctx, logitsPtr, logitsGuidancePtr, scale); - } - } - - /// - /// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 - /// - /// - /// Logits extracted from the original generation context. - /// Logits extracted from a separate context from the same model. - /// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. - /// Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale); - - /// - /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
- /// - /// - /// Pointer to LLamaTokenDataArray - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_softmax(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates); - - /// - /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_top_k(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, int k, ulong min_keep); - - /// - /// Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_top_p(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep); - - /// - /// Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_min_p(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep); - - - /// - /// Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_tail_free(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float z, ulong min_keep); - - /// - /// Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep); - - /// - /// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val); - - /// - /// Modify logits by temperature - /// - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_temp(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float temp); - - /// - /// Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - /// - /// - /// A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - /// The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 
- /// The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. - /// Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern LLamaToken llama_sample_token_mirostat(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float tau, float eta, int m, ref float mu); - - /// - /// Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - /// - /// - /// A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - /// The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - /// The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern LLamaToken llama_sample_token_mirostat_v2(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float tau, float eta, ref float mu); - - /// - /// Selects the token with the highest probability. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern LLamaToken llama_sample_token_greedy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates); - - /// - /// Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern LLamaToken llama_sample_token(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates); - } -} diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 6087a0ecf..041cc0dd5 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -49,6 +49,14 @@ public static void llama_empty_call() [return: MarshalAs(UnmanagedType.U1)] public static extern bool llama_supports_gpu_offload(); + /// + /// Check if RPC offload is supported + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + [return: MarshalAs(UnmanagedType.U1)] + public static extern bool llama_supports_rpc(); + /// /// Initialize the llama + ggml backend. 
Call once at the start of the program. /// diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index ff1e217ea..8caff8d5f 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -368,8 +368,10 @@ static SafeLLamaContextHandle() private static extern LLamaPoolingType llama_pooling_type(SafeLLamaContextHandle ctx); /// - /// Get the embeddings for the a specific sequence. - /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd + /// Get the embeddings for a sequence id. + /// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE + /// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence + /// otherwise: float[n_embd] (1-dimensional) /// /// A pointer to the first float in an embedding, length = ctx.EmbeddingSize [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] diff --git a/LLama/Native/SafeLLamaSamplerHandle.cs b/LLama/Native/SafeLLamaSamplerHandle.cs index 7122fcc79..ef6a7ae30 100644 --- a/LLama/Native/SafeLLamaSamplerHandle.cs +++ b/LLama/Native/SafeLLamaSamplerHandle.cs @@ -267,19 +267,6 @@ public void AddMirostat2Sampler(uint seed, float tau, float eta) static extern IntPtr llama_sampler_init_mirostat_v2(uint seed, float tau, float eta); } - - /// - /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. - /// - /// - public void AddSoftmax() - { - llama_sampler_chain_add(this, llama_sampler_init_softmax()); - - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - static extern IntPtr llama_sampler_init_softmax(); - } - /// /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @@ -309,7 +296,6 @@ public void AddTopP(float p, nint minKeep) /// /// Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 /// - /// public void AddMinP(float p, nint minKeep) { llama_sampler_chain_add(this, llama_sampler_init_min_p(p, minKeep)); @@ -320,24 +306,9 @@ public void AddMinP(float p, nint minKeep) // ReSharper restore InconsistentNaming } - /// - /// Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 - /// - /// - public void AddTailFree(float z, nint minKeep) - { - llama_sampler_chain_add(this, llama_sampler_init_tail_free(z, minKeep)); - - // ReSharper disable InconsistentNaming - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - static extern IntPtr llama_sampler_init_tail_free(float p, nint min_keep); - // ReSharper restore InconsistentNaming - } - /// /// Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. /// - /// public void AddTypical(float p, nint minKeep) { llama_sampler_chain_add(this, llama_sampler_init_typical(p, minKeep)); @@ -349,14 +320,15 @@ public void AddTypical(float p, nint minKeep) } /// - /// Apply temperature to the logits + /// Apply temperature to the logits. + /// If temperature is less than zero the maximum logit is left unchanged and the rest are set to -infinity /// /// - /// public void AddTemperature(float t) { llama_sampler_chain_add(this, llama_sampler_init_temp(t)); + // #details Updates the logits l_i` = l_i/t. 
When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] static extern IntPtr llama_sampler_init_temp(float t); } @@ -367,7 +339,6 @@ public void AddTemperature(float t) /// /// /// - /// public void AddDynamicTemperature(float t, float delta, float exponent) { llama_sampler_chain_add(this, llama_sampler_init_temp_ext(t, delta, exponent)); @@ -376,6 +347,51 @@ public void AddDynamicTemperature(float t, float delta, float exponent) static extern IntPtr llama_sampler_init_temp_ext(float t, float delta, float exponent); } + /// + /// XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 + /// + /// + /// + /// + /// + public void AddXTC(float p, float t, int minKeep, uint seed) + { + llama_sampler_chain_add(this, llama_sampler_init_xtc(p, t, minKeep, seed)); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + static extern IntPtr llama_sampler_init_xtc(float p, float t, nint minKeep, uint seed); + } + + /// + /// This sampler is meant to be used for fill-in-the-middle infilling, after top_k + top_p sampling + ///
+ /// 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+ /// 2. combine probs of tokens that have the same prefix
+ ///
+ /// example:
+ ///
+ /// - before:
+ /// "abc": 0.5
+ /// "abcd": 0.2
+ /// "abcde": 0.1
+ /// "dummy": 0.1
+ ///
+ /// - after:
+ /// "abc": 0.8
+ /// "dummy": 0.1
+ ///
+ /// 3. discard non-EOG tokens with low prob
+ /// 4. if no tokens are left -> pick EOT + ///
+ /// + public void AddFillInMiddleInfill(SafeLlamaModelHandle model) + { + llama_sampler_chain_add(this, llama_sampler_init_infill(model)); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + static extern IntPtr llama_sampler_init_infill(SafeLlamaModelHandle model); + } + /// /// Create a sampler which makes tokens impossible unless they match the grammar /// diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 61fab8120..718b81809 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -386,32 +386,29 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k private static extern LLamaToken llama_token_pad(SafeLlamaModelHandle model); /// - /// codellama infill tokens, Beginning of infill prefix + /// codellama infill tokens, End of infill middle /// /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern int llama_token_prefix(SafeLlamaModelHandle model); + private static extern int llama_token_eot(SafeLlamaModelHandle model); - /// - /// codellama infill tokens, Beginning of infill middle - /// - /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern int llama_token_middle(SafeLlamaModelHandle model); + private static extern int llama_token_fim_pre(SafeLlamaModelHandle model); - /// - /// codellama infill tokens, Beginning of infill suffix - /// - /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern int llama_token_suffix(SafeLlamaModelHandle model); + private static extern int llama_token_fim_suf(SafeLlamaModelHandle model); - /// - /// codellama infill tokens, End of infill middle - /// - /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern int llama_token_eot(SafeLlamaModelHandle model); + private static extern int llama_token_fim_mid(SafeLlamaModelHandle model); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern int llama_token_fim_pad(SafeLlamaModelHandle model); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern int llama_token_fim_rep(SafeLlamaModelHandle model); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern int llama_token_fim_sep(SafeLlamaModelHandle model); /// /// For encoder-decoder models, this function returns id of the token that must be provided @@ -771,17 +768,32 @@ internal ModelTokens(SafeLlamaModelHandle model) /// /// Codellama beginning of infill prefix /// - public LLamaToken? InfillPrefix => Normalize(llama_token_prefix(_model)); + public LLamaToken? InfillPrefix => Normalize(llama_token_fim_pre(_model)); /// /// Codellama beginning of infill middle /// - public LLamaToken? InfillMiddle => Normalize(llama_token_middle(_model)); + public LLamaToken? InfillMiddle => Normalize(llama_token_fim_mid(_model)); /// /// Codellama beginning of infill suffix /// - public LLamaToken? InfillSuffix => Normalize(llama_token_suffix(_model)); + public LLamaToken? InfillSuffix => Normalize(llama_token_fim_suf(_model)); + + /// + /// Codellama pad + /// + public LLamaToken? InfillPad => Normalize(llama_token_fim_pad(_model)); + + /// + /// Codellama rep + /// + public LLamaToken? InfillRep => Normalize(llama_token_fim_rep(_model)); + + /// + /// Codellama sep + /// + public LLamaToken? InfillSep => Normalize(llama_token_fim_sep(_model)); /// /// Codellama end of infill middle diff --git a/LLama/Sampling/DefaultSamplingPipeline.cs b/LLama/Sampling/DefaultSamplingPipeline.cs index 871eacd72..ee339be1f 100644 --- a/LLama/Sampling/DefaultSamplingPipeline.cs +++ b/LLama/Sampling/DefaultSamplingPipeline.cs @@ -83,11 +83,6 @@ public float AlphaPresence /// public int TopK { get; init; } = 40; - /// - /// Z value for tail free sampling - /// - public float TailFreeZ { get; init; } = 1; - /// /// P value for locally typical sampling /// @@ -135,13 +130,11 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl ); chain.AddTopK(TopK); - chain.AddTailFree(TailFreeZ, MinKeep); chain.AddTypical(TypicalP, MinKeep); chain.AddTopP(TopP, MinKeep); chain.AddMinP(MinP, MinKeep); chain.AddTemperature(Temperature); - chain.AddSoftmax(); chain.AddDistributionSampler(Seed); return chain;
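
Illustrative usage sketch (not part of the diff): with TailFreeZ removed and no explicit softmax stage before the final distribution sampler, a DefaultSamplingPipeline is configured from the remaining properties only. A minimal C# sketch assuming the property names that appear in the diff above; the numeric values are placeholders.

using LLama.Sampling;

// Minimal sketch: tail-free sampling and the explicit AddSoftmax() stage are gone,
// so only the remaining knobs are set here. Values are placeholders.
var pipeline = new DefaultSamplingPipeline
{
    TopK = 40,          // default shown in the diff
    TopP = 0.95f,       // placeholder
    MinP = 0.05f,       // placeholder
    TypicalP = 1.0f,    // placeholder
    Temperature = 0.8f, // placeholder
    Seed = 42,          // used by AddDistributionSampler(Seed) in CreateChain
};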