diff --git a/.github/_typos.toml b/.github/_typos.toml index 87860f0a9..fb576b499 100644 --- a/.github/_typos.toml +++ b/.github/_typos.toml @@ -14,3 +14,7 @@ extend-exclude = [ "LLama.Benchmark/Assets/", "LLama.Examples/Assets/" ] + +[default.extend-words] +# Used in a comment in SafeLLamaSamplerHandle.cs, as a prefix of "hello" +teh = "hel" \ No newline at end of file diff --git a/LLama.Examples/Examples/CustomSampler.cs b/LLama.Examples/Examples/CustomSampler.cs index d2df8db2b..7102e0921 100644 --- a/LLama.Examples/Examples/CustomSampler.cs +++ b/LLama.Examples/Examples/CustomSampler.cs @@ -60,7 +60,6 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl chain.AddCustom(new RemoveMostLikelyToken()); // Select from the distribution - chain.AddSoftmax(); chain.AddDistributionSampler(42); return chain; diff --git a/LLama/Extensions/LLamaExecutorExtensions.cs b/LLama/Extensions/LLamaExecutorExtensions.cs index 7fe6cc871..61476bb7b 100644 --- a/LLama/Extensions/LLamaExecutorExtensions.cs +++ b/LLama/Extensions/LLamaExecutorExtensions.cs @@ -150,7 +150,6 @@ private string CreatePrompt(IList messages) MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep, MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP, Seed = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Seed), out uint seed) is true ? seed : (uint)(t_random ??= new()).Next(), - TailFreeZ = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.TailFreeZ), out float tfz) is true ? tfz : s_defaultPipeline.TailFreeZ, Temperature = options?.Temperature ?? 0, TopP = options?.TopP ?? 0, TopK = options?.TopK ?? s_defaultPipeline.TopK, diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index b7dbf5bcd..8c06598a8 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -56,7 +56,7 @@ - c35e586ea5722184 + 958367bf530d943a90 diff --git a/LLama/LLavaWeights.cs b/LLama/LLavaWeights.cs index cb9692ead..f2f9f6256 100644 --- a/LLama/LLavaWeights.cs +++ b/LLama/LLavaWeights.cs @@ -9,7 +9,8 @@ namespace LLama; /// /// A set of llava model weights (mmproj), loaded into memory. 
/// -public sealed class LLavaWeights : IDisposable +public sealed class LLavaWeights + : IDisposable { /// /// The native handle, which is used in the native APIs diff --git a/LLama/Native/LLamaNativeBatch.cs b/LLama/Native/LLamaNativeBatch.cs index 8e9c40603..41817604a 100644 --- a/LLama/Native/LLamaNativeBatch.cs +++ b/LLama/Native/LLamaNativeBatch.cs @@ -25,6 +25,7 @@ public unsafe struct LLamaNativeBatch /// /// the positions of the respective token in the sequence + /// (if set to NULL, the token position will be tracked automatically by llama_decode) /// public LLamaPos* pos; @@ -35,18 +36,13 @@ public unsafe struct LLamaNativeBatch /// /// the sequence to which the respective token belongs + /// (if set to NULL, the sequence ID will be assumed to be 0) /// public LLamaSeqId** seq_id; /// /// if zero, the logits for the respective token will not be output + /// (if set to NULL, only the logits for last token will be returned) /// public byte* logits; - - // Note from llama.cpp: - // > helpers for smooth API transition - can be deprecated in the future - // > for future-proof code, use the above fields instead and ignore everything below - private LLamaPos _all_pos_0; - private LLamaPos _all_pos_1; - private LLamaSeqId _all_seq_id; } \ No newline at end of file diff --git a/LLama/Native/LLamaPoolingType.cs b/LLama/Native/LLamaPoolingType.cs index ab0b75457..3ee767b51 100644 --- a/LLama/Native/LLamaPoolingType.cs +++ b/LLama/Native/LLamaPoolingType.cs @@ -29,4 +29,9 @@ public enum LLamaPoolingType CLS = 2, Last = 3, + + /// + /// Used by reranking models to attach the classification head to the graph + /// + Rank, } \ No newline at end of file diff --git a/LLama/Native/LLamaVocabPreType.cs b/LLama/Native/LLamaVocabPreType.cs index 35ed39c06..df41168b1 100644 --- a/LLama/Native/LLamaVocabPreType.cs +++ b/LLama/Native/LLamaVocabPreType.cs @@ -33,4 +33,5 @@ internal enum LLamaVocabPreType BLOOM = 23, GPT3_FINNISH = 24, EXAONE = 25, + CHAMELEON = 26, } \ No newline at end of file diff --git a/LLama/Native/NativeApi.Sampling.cs b/LLama/Native/NativeApi.Sampling.cs deleted file mode 100644 index 4b73d2e0b..000000000 --- a/LLama/Native/NativeApi.Sampling.cs +++ /dev/null @@ -1,186 +0,0 @@ -using System; - -namespace LLama.Native -{ - public static partial class NativeApi - { - /// - /// Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - /// Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - /// Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - /// Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - /// Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern unsafe void llama_sample_repetition_penalties(SafeLLamaContextHandle ctx, - ref LLamaTokenDataArrayNative candidates, - LLamaToken* last_tokens, ulong last_tokens_size, - float penalty_repeat, - float penalty_freq, - float penalty_present); - - /// - /// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 - /// - /// - /// Logits extracted from the original generation context. - /// Logits extracted from a separate context from the same model. - /// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. - /// Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. - public static void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, Span logits, ReadOnlySpan logits_guidance, float scale) - { - if (logits == null) - throw new ArgumentNullException(nameof(logits)); - if (logits_guidance == null) - throw new ArgumentNullException(nameof(logits_guidance)); - if (logits.Length != ctx.VocabCount) - throw new ArgumentException("Logits count must have equal context vocab size", nameof(logits)); - if (logits_guidance.Length != ctx.VocabCount) - throw new ArgumentException("Guidance logits count must have equal context vocab size", nameof(logits_guidance)); - - unsafe - { - fixed (float* logitsPtr = logits) - fixed (float* logitsGuidancePtr = logits_guidance) - llama_sample_apply_guidance(ctx, logitsPtr, logitsGuidancePtr, scale); - } - } - - /// - /// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 - /// - /// - /// Logits extracted from the original generation context. - /// Logits extracted from a separate context from the same model. - /// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. - /// Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale); - - /// - /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
- /// - /// - /// Pointer to LLamaTokenDataArray - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_softmax(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates); - - /// - /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_top_k(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, int k, ulong min_keep); - - /// - /// Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_top_p(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep); - - /// - /// Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_min_p(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep); - - - /// - /// Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_tail_free(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float z, ulong min_keep); - - /// - /// Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep); - - /// - /// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val); - - /// - /// Modify logits by temperature - /// - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_temp(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float temp); - - /// - /// Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - /// - /// - /// A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - /// The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 
- /// The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. - /// Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern LLamaToken llama_sample_token_mirostat(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float tau, float eta, int m, ref float mu); - - /// - /// Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - /// - /// - /// A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - /// The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - /// The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern LLamaToken llama_sample_token_mirostat_v2(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float tau, float eta, ref float mu); - - /// - /// Selects the token with the highest probability. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern LLamaToken llama_sample_token_greedy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates); - - /// - /// Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. - /// - /// - /// Pointer to LLamaTokenDataArray - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern LLamaToken llama_sample_token(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates); - } -} diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 6087a0ecf..041cc0dd5 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -49,6 +49,14 @@ public static void llama_empty_call() [return: MarshalAs(UnmanagedType.U1)] public static extern bool llama_supports_gpu_offload(); + /// + /// Check if RPC offload is supported + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + [return: MarshalAs(UnmanagedType.U1)] + public static extern bool llama_supports_rpc(); + /// /// Initialize the llama + ggml backend. 
Call once at the start of the program. /// diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index ff1e217ea..8caff8d5f 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -368,8 +368,10 @@ static SafeLLamaContextHandle() private static extern LLamaPoolingType llama_pooling_type(SafeLLamaContextHandle ctx); /// - /// Get the embeddings for the a specific sequence. - /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd + /// Get the embeddings for a sequence id. + /// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE + /// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence + /// otherwise: float[n_embd] (1-dimensional) /// /// A pointer to the first float in an embedding, length = ctx.EmbeddingSize [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] diff --git a/LLama/Native/SafeLLamaSamplerHandle.cs b/LLama/Native/SafeLLamaSamplerHandle.cs index 7122fcc79..ef6a7ae30 100644 --- a/LLama/Native/SafeLLamaSamplerHandle.cs +++ b/LLama/Native/SafeLLamaSamplerHandle.cs @@ -267,19 +267,6 @@ public void AddMirostat2Sampler(uint seed, float tau, float eta) static extern IntPtr llama_sampler_init_mirostat_v2(uint seed, float tau, float eta); } - - /// - /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. - /// - /// - public void AddSoftmax() - { - llama_sampler_chain_add(this, llama_sampler_init_softmax()); - - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - static extern IntPtr llama_sampler_init_softmax(); - } - /// /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @@ -309,7 +296,6 @@ public void AddTopP(float p, nint minKeep) /// /// Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 /// - /// public void AddMinP(float p, nint minKeep) { llama_sampler_chain_add(this, llama_sampler_init_min_p(p, minKeep)); @@ -320,24 +306,9 @@ public void AddMinP(float p, nint minKeep) // ReSharper restore InconsistentNaming } - /// - /// Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 - /// - /// - public void AddTailFree(float z, nint minKeep) - { - llama_sampler_chain_add(this, llama_sampler_init_tail_free(z, minKeep)); - - // ReSharper disable InconsistentNaming - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - static extern IntPtr llama_sampler_init_tail_free(float p, nint min_keep); - // ReSharper restore InconsistentNaming - } - /// /// Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. /// - /// public void AddTypical(float p, nint minKeep) { llama_sampler_chain_add(this, llama_sampler_init_typical(p, minKeep)); @@ -349,14 +320,15 @@ public void AddTypical(float p, nint minKeep) } /// - /// Apply temperature to the logits + /// Apply temperature to the logits. + /// If temperature is less than zero the maximum logit is left unchanged and the rest are set to -infinity /// /// - /// public void AddTemperature(float t) { llama_sampler_chain_add(this, llama_sampler_init_temp(t)); + // #details Updates the logits l_i` = l_i/t. 
When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] static extern IntPtr llama_sampler_init_temp(float t); } @@ -367,7 +339,6 @@ public void AddTemperature(float t) /// /// /// - /// public void AddDynamicTemperature(float t, float delta, float exponent) { llama_sampler_chain_add(this, llama_sampler_init_temp_ext(t, delta, exponent)); @@ -376,6 +347,51 @@ public void AddDynamicTemperature(float t, float delta, float exponent) static extern IntPtr llama_sampler_init_temp_ext(float t, float delta, float exponent); } + /// + /// XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 + /// + /// + /// + /// + /// + public void AddXTC(float p, float t, int minKeep, uint seed) + { + llama_sampler_chain_add(this, llama_sampler_init_xtc(p, t, minKeep, seed)); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + static extern IntPtr llama_sampler_init_xtc(float p, float t, nint minKeep, uint seed); + } + + /// + /// This sampler is meant to be used for fill-in-the-middle infilling, after top_k + top_p sampling + ///
+ /// 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+ /// 2. combine probs of tokens that have the same prefix
+ ///
+ /// example:
+ ///
+ /// - before:
+ /// "abc": 0.5
+ /// "abcd": 0.2
+ /// "abcde": 0.1
+ /// "dummy": 0.1
+ ///
+ /// - after:
+ /// "abc": 0.8
+ /// "dummy": 0.1
+ ///
+ /// 3. discard non-EOG tokens with low prob
+ /// 4. if no tokens are left -> pick EOT + ///
+ /// + public void AddFillInMiddleInfill(SafeLlamaModelHandle model) + { + llama_sampler_chain_add(this, llama_sampler_init_infill(model)); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + static extern IntPtr llama_sampler_init_infill(SafeLlamaModelHandle model); + } + /// /// Create a sampler which makes tokens impossible unless they match the grammar /// diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 61fab8120..718b81809 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -386,32 +386,29 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k private static extern LLamaToken llama_token_pad(SafeLlamaModelHandle model); /// - /// codellama infill tokens, Beginning of infill prefix + /// codellama infill tokens, End of infill middle /// /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern int llama_token_prefix(SafeLlamaModelHandle model); + private static extern int llama_token_eot(SafeLlamaModelHandle model); - /// - /// codellama infill tokens, Beginning of infill middle - /// - /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern int llama_token_middle(SafeLlamaModelHandle model); + private static extern int llama_token_fim_pre(SafeLlamaModelHandle model); - /// - /// codellama infill tokens, Beginning of infill suffix - /// - /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern int llama_token_suffix(SafeLlamaModelHandle model); + private static extern int llama_token_fim_suf(SafeLlamaModelHandle model); - /// - /// codellama infill tokens, End of infill middle - /// - /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern int llama_token_eot(SafeLlamaModelHandle model); + private static extern int llama_token_fim_mid(SafeLlamaModelHandle model); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern int llama_token_fim_pad(SafeLlamaModelHandle model); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern int llama_token_fim_rep(SafeLlamaModelHandle model); + + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern int llama_token_fim_sep(SafeLlamaModelHandle model); /// /// For encoder-decoder models, this function returns id of the token that must be provided @@ -771,17 +768,32 @@ internal ModelTokens(SafeLlamaModelHandle model) /// /// Codellama beginning of infill prefix /// - public LLamaToken? InfillPrefix => Normalize(llama_token_prefix(_model)); + public LLamaToken? InfillPrefix => Normalize(llama_token_fim_pre(_model)); /// /// Codellama beginning of infill middle /// - public LLamaToken? InfillMiddle => Normalize(llama_token_middle(_model)); + public LLamaToken? InfillMiddle => Normalize(llama_token_fim_mid(_model)); /// /// Codellama beginning of infill suffix /// - public LLamaToken? InfillSuffix => Normalize(llama_token_suffix(_model)); + public LLamaToken? InfillSuffix => Normalize(llama_token_fim_suf(_model)); + + /// + /// Codellama pad + /// + public LLamaToken? InfillPad => Normalize(llama_token_fim_pad(_model)); + + /// + /// Codellama rep + /// + public LLamaToken? InfillRep => Normalize(llama_token_fim_rep(_model)); + + /// + /// Codellama sep + /// + public LLamaToken? InfillSep => Normalize(llama_token_fim_sep(_model)); /// /// Codellama end of infill middle diff --git a/LLama/Sampling/DefaultSamplingPipeline.cs b/LLama/Sampling/DefaultSamplingPipeline.cs index 871eacd72..ee339be1f 100644 --- a/LLama/Sampling/DefaultSamplingPipeline.cs +++ b/LLama/Sampling/DefaultSamplingPipeline.cs @@ -83,11 +83,6 @@ public float AlphaPresence /// public int TopK { get; init; } = 40; - /// - /// Z value for tail free sampling - /// - public float TailFreeZ { get; init; } = 1; - /// /// P value for locally typical sampling /// @@ -135,13 +130,11 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl ); chain.AddTopK(TopK); - chain.AddTailFree(TailFreeZ, MinKeep); chain.AddTypical(TypicalP, MinKeep); chain.AddTopP(TopP, MinKeep); chain.AddMinP(MinP, MinKeep); chain.AddTemperature(Temperature); - chain.AddSoftmax(); chain.AddDistributionSampler(Seed); return chain;
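
Illustrative usage sketch (not part of the diff): with TailFreeZ removed and no explicit softmax stage before the final distribution sampler, a DefaultSamplingPipeline is configured from the remaining properties only. A minimal C# sketch assuming the property names that appear in the diff above; the numeric values are placeholders.

using LLama.Sampling;

// Minimal sketch: tail-free sampling and the explicit AddSoftmax() stage are gone,
// so only the remaining knobs are set here. Values are placeholders.
var pipeline = new DefaultSamplingPipeline
{
    TopK = 40,          // default shown in the diff
    TopP = 0.95f,       // placeholder
    MinP = 0.05f,       // placeholder
    TypicalP = 1.0f,    // placeholder
    Temperature = 0.8f, // placeholder
    Seed = 42,          // used by AddDistributionSampler(Seed) in CreateChain
};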