Settings: add flash_attn & mlock
cztomsik committed May 1, 2024
1 parent 6271d2f commit ddec013
Showing 2 changed files with 18 additions and 0 deletions.
14 changes: 14 additions & 0 deletions src/app/settings/Settings.tsx
@@ -58,6 +58,13 @@ export const Settings = () => {
<Field class="w-16" name="llama.n_gpu_layers" type="number" min={0} />
</Row>

+ <Row
+ title="Flash Attention"
+ description="Enable flash attention. This is experimental and may or may not improve performance."
+ >
+ <Field class="w-20" name="llama.flash_attn" type="checkbox" />
+ </Row>
+
<Row
title="CPU threads count"
description="Number of threads to use during generation. If empty, the app will use the number of performance cores available on the system."
@@ -76,6 +83,13 @@ export const Settings = () => {
<Field class="w-20" name="llama.n_batch" type="number" min={1} />
</Row>

+ <Row
+ title="Memory Lock"
+ description="Lock the model in memory to prevent it from being paged out. This can improve performance, but may slow down other applications."
+ >
+ <Field class="w-20" name="llama.mlock" type="checkbox" />
+ </Row>
+
{/* <Row
title="Factory reset"
description="Reset all settings to their default values and delete all data. This action cannot be undone."
4 changes: 4 additions & 0 deletions src/llama.zig
@@ -29,6 +29,8 @@ pub const PoolOptions = struct {
n_batch: u32 = 64,
n_threads: ?u32 = null,
n_threads_batch: ?u32 = null,
+ flash_attn: bool = false,
+ mlock: bool = false,
};

/// A single-model, single-context, thread-safe pool.
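
Both new options default to off, and their names line up with the llama.flash_attn and llama.mlock keys used by the settings form above. A minimal usage sketch, assuming the remaining PoolOptions fields all have defaults and with a hypothetical init call (not shown in this commit):

const opts = PoolOptions{
    .flash_attn = true, // experimental fused-attention path
    .mlock = true, // keep model weights resident in RAM
};
// Hypothetical: pass to whatever constructs the pool, e.g.
// var pool = try Pool.init(allocator, model_path, opts);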
@@ -112,6 +114,7 @@ pub const Pool = struct {
// It seems Metal never worked on Intel-based macs.
// see https://github.com/ggerganov/llama.cpp/issues/3423#issuecomment-1745511586
params.n_gpu_layers = @intCast(if (builtin.os.tag == .macos and builtin.cpu.arch == .aarch64) (self.options.n_gpu_layers orelse 999) else 0);
+ params.use_mlock = self.options.mlock;

return params;
}
@@ -123,6 +126,7 @@ pub const Pool = struct {
params.n_batch = @intCast(self.options.n_batch);
params.n_threads = @intCast(self.options.n_threads orelse getPerfCpuCount());
params.n_threads_batch = @intCast(self.options.n_threads_batch orelse params.n_threads);
+ params.flash_attn = self.options.flash_attn;

return params;
}
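Note where each flag lands: use_mlock is set on the model params and flash_attn on the context params. This mirrors llama.cpp's C API, where mlock is applied once at weight-load time (llama_model_params.use_mlock) while flash attention is chosen per context (llama_context_params.flash_attn). A sketch of that mapping, assuming the usual @cImport bindings; the actual binding names in this repo may differ:

const c = @cImport(@cInclude("llama.h"));

/// Illustration only: where each new PoolOptions flag ends up.
fn paramsSketch(flash_attn: bool, mlock: bool) struct { model: c.llama_model_params, ctx: c.llama_context_params } {
    var mp = c.llama_model_default_params();
    mp.use_mlock = mlock; // model-level: wire weights into RAM so they can't be paged out

    var cp = c.llama_context_default_params();
    cp.flash_attn = flash_attn; // context-level: enables the fused attention kernels

    return .{ .model = mp, .ctx = cp };
}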
