Settings: add flash_attn & mlock
cztomsik committed May 1, 2024
1 parent 6271d2f commit ddec013
Showing 2 changed files with 18 additions and 0 deletions.
14 changes: 14 additions & 0 deletions src/app/settings/Settings.tsx
@@ -58,6 +58,13 @@ export const Settings = () => {
<Field class="w-16" name="llama.n_gpu_layers" type="number" min={0} />
</Row>

+ <Row
+ title="Flash Attention"
+ description="Enable flash attention. This is experimental and may or may not improve performance."
+ >
+ <Field class="w-20" name="llama.flash_attn" type="checkbox" />
+ </Row>
+
<Row
title="CPU threads count"
description="Number of threads to use during generation. If empty, the app will use the number of performance cores available on the system."
@@ -76,6 +83,13 @@ export const Settings = () => {
<Field class="w-20" name="llama.n_batch" type="number" min={1} />
</Row>

+ <Row
+ title="Memory Lock"
+ description="Lock the model in memory to prevent it from being paged out. This can improve performance, but may slow down other applications."
+ >
+ <Field class="w-20" name="llama.mlock" type="checkbox" />
+ </Row>
+
{/* <Row
title="Factory reset"
description="Reset all settings to their default values and delete all data. This action cannot be undone."
4 changes: 4 additions & 0 deletions src/llama.zig
@@ -29,6 +29,8 @@ pub const PoolOptions = struct {
n_batch: u32 = 64,
n_threads: ?u32 = null,
n_threads_batch: ?u32 = null,
+ flash_attn: bool = false,
+ mlock: bool = false,
};

/// A single-model, single-context, thread-safe pool.
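
Both new options default to off, and their names line up with the llama.flash_attn and llama.mlock keys used by the settings form above. A minimal usage sketch, assuming the remaining PoolOptions fields all have defaults and with a hypothetical init call (not shown in this commit):

const opts = PoolOptions{
    .flash_attn = true, // experimental fused-attention path
    .mlock = true, // keep model weights resident in RAM
};
// Hypothetical: pass to whatever constructs the pool, e.g.
// var pool = try Pool.init(allocator, model_path, opts);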
@@ -112,6 +114,7 @@ pub const Pool = struct {
// It seems Metal never worked on Intel-based macs.
// see https://github.com/ggerganov/llama.cpp/issues/3423#issuecomment-1745511586
params.n_gpu_layers = @intCast(if (builtin.os.tag == .macos and builtin.cpu.arch == .aarch64) (self.options.n_gpu_layers orelse 999) else 0);
+ params.use_mlock = self.options.mlock;

return params;
}
@@ -123,6 +126,7 @@ pub const Pool = struct {
params.n_batch = @intCast(self.options.n_batch);
params.n_threads = @intCast(self.options.n_threads orelse getPerfCpuCount());
params.n_threads_batch = @intCast(self.options.n_threads_batch orelse params.n_threads);
+ params.flash_attn = self.options.flash_attn;

return params;
}
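Note where each flag lands: use_mlock is set on the model params and flash_attn on the context params. This mirrors llama.cpp's C API, where mlock is applied once at weight-load time (llama_model_params.use_mlock) while flash attention is chosen per context (llama_context_params.flash_attn). A sketch of that mapping, assuming the usual @cImport bindings; the actual binding names in this repo may differ:

const c = @cImport(@cInclude("llama.h"));

/// Illustration only: where each new PoolOptions flag ends up.
fn paramsSketch(flash_attn: bool, mlock: bool) struct { model: c.llama_model_params, ctx: c.llama_context_params } {
    var mp = c.llama_model_default_params();
    mp.use_mlock = mlock; // model-level: wire weights into RAM so they can't be paged out

    var cp = c.llama_context_default_params();
    cp.flash_attn = flash_attn; // context-level: enables the fused attention kernels

    return .{ .model = mp, .ctx = cp };
}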
