llama : work around F16 DMMV buffer overflow by increasing padding
Upstream issue: ggerganov#8798
cebtenzzre committed Jul 31, 2024
1 parent 2a4898a commit 7ea0fed
Showing 1 changed file with 1 addition and 2 deletions.

src/llama.cpp
@@ -3464,8 +3464,7 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
 }
 
 static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
-    // the FA kernels require padding to avoid extra runtime boundary checks
-    return cparams.flash_attn ? 256u : 32u;
+    return 256u;
 }
 
 //
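For context, a minimal sketch (not part of this diff) of how the padding value is consumed: in llama.cpp the effective context length is rounded up to a multiple of this padding via ggml's GGML_PAD macro, so unconditionally returning 256u over-allocates the KV cache enough that the F16 DMMV kernel's overflowing reads land in the padded slack rather than past the end of the buffer. The helper below mirrors GGML_PAD's rounding; the concrete numbers are illustrative assumptions, not taken from the commit.

// Sketch only: mirrors GGML_PAD(x, n) = ((x + n - 1) / n) * n from ggml.h.
#include <cstdint>
#include <cstdio>

// round x up to the next multiple of n
static uint32_t pad_to_multiple(uint32_t x, uint32_t n) {
    return (x + n - 1) / n * n;
}

int main() {
    const uint32_t n_ctx_requested = 4000;  // hypothetical user-requested context
    const uint32_t padding         = 256u;  // value returned after this commit
    // With the old non-FA padding of 32u this would stay at 4000 (already a
    // multiple of 32); with 256u it rounds up to 4096, leaving 96 cells of slack.
    std::printf("padded n_ctx = %u\n", pad_to_multiple(n_ctx_requested, padding));
    return 0;
}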
