From 38fb9423cb30a996dbc991a2294ace97443008ae Mon Sep 17 00:00:00 2001 From: Eric Buehler <65165915+EricLBuehler@users.noreply.github.com> Date: Sun, 28 Jul 2024 04:14:44 -0400 Subject: [PATCH] Allow setting PagedAttention KV cache allocation from context size (#640) * Support paged attn memory allocation via context size * Slightly better logging * Connect it to the apis * Clippy --- Cargo.lock | 1 - mistralrs-bench/Cargo.toml | 1 - mistralrs-bench/src/main.rs | 59 ++++++++++++---- .../src/dummy_paged_attention/mod.rs | 38 ++++++++--- mistralrs-core/src/lib.rs | 2 +- mistralrs-core/src/paged_attention/mod.rs | 38 ++++++++--- mistralrs-paged-attn/build.rs | 42 ++++-------- .../src/backend/paged_attention.rs | 4 +- mistralrs-pyo3/mistralrs.pyi | 12 +++- mistralrs-pyo3/src/lib.rs | 68 +++++++++++++------ mistralrs-server/src/main.rs | 67 +++++++++++++----- mistralrs/examples/paged_attn/main.rs | 6 +- 12 files changed, 229 insertions(+), 109 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 020b3d794f..94e108be16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2319,7 +2319,6 @@ dependencies = [ "candle-core", "clap", "cli-table", - "either", "mistralrs-core", "serde", "serde_json", diff --git a/mistralrs-bench/Cargo.toml b/mistralrs-bench/Cargo.toml index 0c8ec0d862..3549385958 100644 --- a/mistralrs-bench/Cargo.toml +++ b/mistralrs-bench/Cargo.toml @@ -20,7 +20,6 @@ serde_json.workspace = true clap.workspace = true mistralrs-core = { version = "0.2.2", path = "../mistralrs-core" } tracing.workspace = true -either.workspace = true tokio.workspace = true cli-table = "0.4.7" diff --git a/mistralrs-bench/src/main.rs b/mistralrs-bench/src/main.rs index ddc93a7c50..249073e594 100644 --- a/mistralrs-bench/src/main.rs +++ b/mistralrs-bench/src/main.rs @@ -1,12 +1,11 @@ use candle_core::Device; use clap::Parser; use cli_table::{format::Justify, print_stdout, Cell, CellStruct, Style, Table}; -use either::Either; use mistralrs_core::{ initialize_logging, 
paged_attn_supported, Constraint, DefaultSchedulerMethod, - DeviceLayerMapMetadata, DeviceMapMetadata, Loader, LoaderBuilder, MistralRs, MistralRsBuilder, - ModelDType, ModelSelected, NormalRequest, PagedAttentionConfig, Request, RequestMessage, - Response, SamplingParams, SchedulerConfig, TokenSource, Usage, + DeviceLayerMapMetadata, DeviceMapMetadata, Loader, LoaderBuilder, MemoryGpuConfig, MistralRs, + MistralRsBuilder, ModelDType, ModelSelected, NormalRequest, PagedAttentionConfig, Request, + RequestMessage, Response, SamplingParams, SchedulerConfig, TokenSource, Usage, }; use std::fmt::Display; use std::sync::Arc; @@ -292,6 +291,12 @@ struct Args { #[arg(long = "pa-gpu-mem-usage")] paged_attn_gpu_mem_usage: Option, + /// Total context length to allocate the KV cache for (total number of tokens which the KV cache can hold) + /// when using PagedAttention, which is only supported on CUDA and is always automatically activated. + /// The priority is as follows: `pa-gpu-mem-usage` (default = 0.9) > `pa-ctxt-len` > `pa-gpu-mem`. + #[arg(long = "pa-ctxt-len")] + paged_ctxt_len: Option, + /// Block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32. /// PagedAttention is only supported on CUDA and is always automatically activated. 
#[arg(long = "pa-blk-size")] @@ -383,31 +388,55 @@ fn main() -> anyhow::Result<()> { args.paged_attn_block_size, args.paged_attn_gpu_mem, args.paged_attn_gpu_mem_usage, + args.paged_ctxt_len, paged_attn_supported(), args.no_paged_attn, ) { - (block_size, None, None, true, false) => Some(PagedAttentionConfig::new( + (block_size, None, None, None, true, false) => Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(0.9), // NOTE(EricLBuehler): default is to use 90% of memory + MemoryGpuConfig::Utilization(0.9), // NOTE(EricLBuehler): default is to use 90% of memory )?), - (block_size, Some(m), None, true, false) => { - Some(PagedAttentionConfig::new(block_size, 512, Either::Left(m))?) - } - (block_size, None, Some(f), true, false) => Some(PagedAttentionConfig::new( + (block_size, None, None, Some(ctxt), true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::ContextSize(ctxt), + )?), + (block_size, None, Some(f), None, true, false) => Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(f), + MemoryGpuConfig::Utilization(f), )?), - (block_size, Some(_m), Some(f), true, false) => { - info!("Both memory size and usage were specified, defaulting to the usage value."); + (block_size, Some(m), None, None, true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Amount(m), + )?), + (block_size, Some(_m), Some(f), None, true, false) => { + info!("Both memory size, and usage were specified, defaulting to the usage value."); + Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Utilization(f), + )?) + } + (block_size, Some(_m), None, Some(ctxt), true, false) => { + info!("All memory size and ctxt len, defaulting to the context len value."); + Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::ContextSize(ctxt), + )?) 
+ } + (block_size, None, Some(f), Some(_ctxt), true, false) => { + info!("Both ctxt len and usage were specified, defaulting to the usage value."); Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(f), + MemoryGpuConfig::Utilization(f), )?) } - (_, _, _, _, _) => None, + (_, _, _, _, _, _) => None, }; let pipeline = loader.load_model_from_hf( diff --git a/mistralrs-core/src/dummy_paged_attention/mod.rs b/mistralrs-core/src/dummy_paged_attention/mod.rs index 6b8380fdb2..07064bd3cd 100644 --- a/mistralrs-core/src/dummy_paged_attention/mod.rs +++ b/mistralrs-core/src/dummy_paged_attention/mod.rs @@ -16,7 +16,6 @@ pub use block_engine_sequence::BlockEngineSequence; pub use cache_engine::{CacheConfig, CacheEngine}; use candle_core::{DType, Device}; pub use config::{ModelConfigLike, ModelConfigMetadata}; -use either::Either; pub use layers::PagedAttention; pub use scheduler::{ PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput, @@ -30,14 +29,14 @@ use tracing::info; pub struct PagedAttentionConfig { pub(crate) block_size: Option, pub(crate) mem_cpu: usize, - pub(crate) mem_gpu: Either, + pub(crate) mem_gpu: MemoryGpuConfig, } impl PagedAttentionConfig { pub fn new( _block_size: Option, _mem_cpu: usize, - _mem_gpu: Either, + _mem_gpu: MemoryGpuConfig, ) -> anyhow::Result { anyhow::bail!("PagedAttention is only supported for CUDA, compile with feature `cuda`.") } @@ -48,6 +47,14 @@ pub enum AttentionImplementation { PagedAttention, } +#[derive(Clone, Copy)] +#[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)] +pub enum MemoryGpuConfig { + Amount(usize), + Utilization(f32), + ContextSize(usize), +} + // See `pagedattention.cu` CALL_V1_LAUNCHER_BLOCK_SIZE const SUPPORTED_BLOCK_SIZE: &[usize] = &[8, 16, 32]; @@ -65,9 +72,20 @@ macro_rules! mb_to_blocks { }; } +macro_rules! 
ctxt_to_blocks { + ($context_len:expr, $dtype_size:expr, $block_size:expr, $config:expr) => { + $context_len + * $dtype_size + * $config.num_kv_heads() + * ($config.hidden_size() / $config.num_attn_heads()) + * $config.num_layers() + * 2 + }; +} + /// Memory values are in MBs or a percentage in [0,1]. Specify block size or the default is 32. pub fn calculate_cache_config( - mem_gpu: Either, + mem_gpu: MemoryGpuConfig, mem_cpu: usize, block_size: Option, dtype: DType, @@ -82,16 +100,18 @@ pub fn calculate_cache_config( #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)] let mem_gpu = match mem_gpu { - Either::Left(v) => v, - Either::Right(f) => { + MemoryGpuConfig::Amount(v) => v, + MemoryGpuConfig::Utilization(f) => { let free = MemoryUsage.get_memory_available(device)? as f32 / SIZE_IN_MB as f32; let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32; let used = total - free; - let size = (total * f - used) as usize; - info!("Allocating {size} MB for PagedAttention KV cache"); - size + (total * f - used) as usize + } + MemoryGpuConfig::ContextSize(toks) => { + ctxt_to_blocks!(toks, dtype_size, block_size, config) / SIZE_IN_MB } }; + info!("Allocating {mem_gpu} MB for PagedAttention KV cache"); let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config); let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config); diff --git a/mistralrs-core/src/lib.rs b/mistralrs-core/src/lib.rs index 31ebaec582..274237e085 100644 --- a/mistralrs-core/src/lib.rs +++ b/mistralrs-core/src/lib.rs @@ -59,7 +59,7 @@ mod xlora_models; pub use amoe::{AnyMoeConfig, AnyMoeExpertType}; pub use device_map::{DeviceLayerMapMetadata, DeviceMapMetadata, LayerDeviceMapper}; -pub use paged_attention::PagedAttentionConfig; +pub use paged_attention::{MemoryGpuConfig, PagedAttentionConfig}; pub use pipeline::{ chat_template::ChatTemplate, AnyMoeLoader, AnyMoePipeline, GGMLLoader, GGMLLoaderBuilder, 
GGMLSpecificConfig, GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder, GemmaLoader, diff --git a/mistralrs-core/src/paged_attention/mod.rs b/mistralrs-core/src/paged_attention/mod.rs index 7ebabd2324..6b354fdfb5 100644 --- a/mistralrs-core/src/paged_attention/mod.rs +++ b/mistralrs-core/src/paged_attention/mod.rs @@ -16,7 +16,6 @@ pub use block_engine_sequence::BlockEngineSequence; pub use cache_engine::{CacheConfig, CacheEngine}; use candle_core::{DType, Device}; pub use config::{ModelConfigLike, ModelConfigMetadata}; -use either::Either; pub use layers::PagedAttention; pub use scheduler::{ PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput, @@ -30,14 +29,14 @@ use tracing::info; pub struct PagedAttentionConfig { pub(crate) block_size: Option, pub(crate) mem_cpu: usize, - pub(crate) mem_gpu: Either, + pub(crate) mem_gpu: MemoryGpuConfig, } impl PagedAttentionConfig { pub fn new( block_size: Option, mem_cpu: usize, - mem_gpu: Either, + mem_gpu: MemoryGpuConfig, ) -> anyhow::Result { Ok(Self { block_size, @@ -52,6 +51,14 @@ pub enum AttentionImplementation { PagedAttention, } +#[derive(Clone, Copy)] +#[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)] +pub enum MemoryGpuConfig { + Amount(usize), + Utilization(f32), + ContextSize(usize), +} + // See `pagedattention.cu` CALL_V1_LAUNCHER_BLOCK_SIZE const SUPPORTED_BLOCK_SIZE: &[usize] = &[8, 16, 32]; @@ -69,9 +76,20 @@ macro_rules! mb_to_blocks { }; } +macro_rules! ctxt_to_blocks { + ($context_len:expr, $dtype_size:expr, $block_size:expr, $config:expr) => { + $context_len + * $dtype_size + * $config.num_kv_heads() + * ($config.hidden_size() / $config.num_attn_heads()) + * $config.num_layers() + * 2 + }; +} + /// Memory values are in MBs or a percentage in [0,1]. Specify block size or the default is 32. 
pub fn calculate_cache_config( - mem_gpu: Either, + mem_gpu: MemoryGpuConfig, mem_cpu: usize, block_size: Option, dtype: DType, @@ -86,16 +104,18 @@ pub fn calculate_cache_config( #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)] let mem_gpu = match mem_gpu { - Either::Left(v) => v, - Either::Right(f) => { + MemoryGpuConfig::Amount(v) => v, + MemoryGpuConfig::Utilization(f) => { let free = MemoryUsage.get_memory_available(device)? as f32 / SIZE_IN_MB as f32; let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32; let used = total - free; - let size = (total * f - used) as usize; - info!("Allocating {size} MB for PagedAttention KV cache"); - size + (total * f - used) as usize + } + MemoryGpuConfig::ContextSize(toks) => { + ctxt_to_blocks!(toks, dtype_size, block_size, config) / SIZE_IN_MB } }; + info!("Allocating {mem_gpu} MB for PagedAttention KV cache"); let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config); let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config); diff --git a/mistralrs-paged-attn/build.rs b/mistralrs-paged-attn/build.rs index e8c6a4f3f8..e1e6ee5442 100644 --- a/mistralrs-paged-attn/build.rs +++ b/mistralrs-paged-attn/build.rs @@ -5,32 +5,29 @@ const CUDA_NVCC_FLAGS: Option<&'static str> = option_env!("CUDA_NVCC_FLAGS"); #[cfg(all(feature = "cuda", target_family = "unix"))] fn main() -> Result<()> { - use std::fs; - use std::fs::read_to_string; use std::fs::OpenOptions; use std::io::prelude::*; use std::path::PathBuf; const OTHER_CONTENT: &str = r#" #[cfg(all(feature = "cuda", target_family = "unix"))] -mod ffi; +pub const COPY_BLOCKS_KERNEL: &str = + include_str!(concat!(env!("OUT_DIR"), "/copy_blocks_kernel.ptx")); +#[cfg(all(feature = "cuda", target_family = "unix"))] +pub const PAGEDATTENTION: &str = include_str!(concat!(env!("OUT_DIR"), "/pagedattention.ptx")); +#[cfg(all(feature = "cuda", target_family = "unix"))] +pub const 
RESHAPE_AND_CACHE_KERNEL: &str = + include_str!(concat!(env!("OUT_DIR"), "/reshape_and_cache_kernel.ptx")); + #[cfg(all(feature = "cuda", target_family = "unix"))] mod backend; +#[cfg(all(feature = "cuda", target_family = "unix"))] +mod ffi; #[cfg(all(feature = "cuda", target_family = "unix"))] -pub use backend::{{copy_blocks, paged_attention, reshape_and_cache, swap_blocks}}; +pub use backend::{copy_blocks, paged_attention, reshape_and_cache, swap_blocks}; "#; - fn read_lines(filename: &str) -> Vec { - let mut result = Vec::new(); - - for line in read_to_string(filename).unwrap().lines() { - result.push(line.to_string()) - } - - result - } - println!("cargo:rerun-if-changed=build.rs"); println!("cargo:rerun-if-changed=src/pagedattention.cu"); println!("cargo:rerun-if-changed=src/copy_blocks_kernel.cu"); @@ -57,20 +54,11 @@ pub use backend::{{copy_blocks, paged_attention, reshape_and_cache, swap_blocks} println!("cargo:rustc-link-lib=pagedattention"); println!("cargo:rustc-link-lib=dylib=cudart"); - let contents = read_lines("src/lib.rs"); - for line in contents { - if line == "pub mod ffi;" { - return Ok(()); - } - } - let ct = fs::read_to_string("src/lib.rs")?; - if !ct.contains(OTHER_CONTENT) { - let mut file = OpenOptions::new().append(true).open("src/lib.rs").unwrap(); + let mut file = OpenOptions::new().write(true).open("src/lib.rs").unwrap(); - // Add the other stuff back - if let Err(e) = writeln!(file, "{OTHER_CONTENT}") { - anyhow::bail!("Error while building dependencies: {:?}\n", e) - } + // Add the other stuff back + if let Err(e) = writeln!(file, "{OTHER_CONTENT}") { + anyhow::bail!("Error while building dependencies: {:?}\n", e) } Ok(()) } diff --git a/mistralrs-paged-attn/src/backend/paged_attention.rs b/mistralrs-paged-attn/src/backend/paged_attention.rs index d3a3ed812d..032bf31b41 100644 --- a/mistralrs-paged-attn/src/backend/paged_attention.rs +++ b/mistralrs-paged-attn/src/backend/paged_attention.rs @@ -260,7 +260,7 @@ impl candle::CustomOp1 
for PagedAttention { /// /// * `q` - Query tensor with shape `(num_sequences, num_heads_q, head_size)`. /// * `key_cache` - Key cache paged tensor of shape `(num_blocks, num_heads_kv, head_size / x, block_size, x)` -/// with `x` being the size of an element in bytes. +/// with `x` being the size of an element in bytes. /// * `value_cache` - Value cache paged tensor of shape `(num_blocks, num_heads_kv, head_size, block_size)`. /// * `block_tables` - Padded table associating blocks to each sequence of shape `(num_sequences, max_context_len // block_size)` /// * `context_lens` - Tensor associating lengths to each sequence of shape `(num_sequences)` @@ -439,7 +439,7 @@ fn update_cache< /// * `key` - Key tensor of shape `(num_tokens, num_heads, head_size)`. /// * `value` - Value tensor of shape `(num_tokens, num_heads, head_size)`. /// * `key_cache` - Key cache paged tensor of shape `(num_blocks, num_heads, head_size / x, block_size, x)` -/// with `x` being the size of an element in bytes. +/// with `x` being the size of an element in bytes. /// * `value_cache` - Value cache paged tensor of shape `(num_blocks, num_heads, head_size, block_size)`. /// * `slot_mapping` - Mapping associating a slot to each token of shape `(num_tokens)`. pub fn reshape_and_cache( diff --git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi index 7d392c4330..edecc7a922 100644 --- a/mistralrs-pyo3/mistralrs.pyi +++ b/mistralrs-pyo3/mistralrs.pyi @@ -203,8 +203,16 @@ class Runner: the corresponding number of layers. - `in_situ_quant` sets the optional in-situ quantization for models that are not quantized (not GGUF or GGML). - `anymoe_config` specifies the AnyMoE config. If this is set, then the model will be loaded as an AnyMoE model. - - `pa_gpu_mem` sets GPU memory to allocate for KV cache with PagedAttention in MBs *OR* the percentage utilization, from 0 to 1. 
If this is not set and the device is - CUDA, it will default to using 90% of the total memory after allocation of the KV cache. PagedAttention is only supported on CUDA and is always automatically activated. + - `pa_gpu_mem`: GPU memory to allocate for KV cache with PagedAttention in MBs. + PagedAttention is only supported on CUDA and is always automatically activated. + The priority is as follows: `pa_gpu_mem_usage` (default = 0.9) > `pa_ctxt_len` > `pa_gpu_mem`. + - `pa_gpu_mem_usage`: Percentage of GPU memory to utilize after allocation of KV cache with PagedAttention, from 0 to 1. + If this is not set and the device is CUDA, it will default to `0.9`. + PagedAttention is only supported on CUDA and is always automatically activated. + The priority is as follows: `pa_gpu_mem_usage` (default = 0.9) > `pa_ctxt_len` > `pa_gpu_mem`. + - `pa_ctxt_len`: Total context length to allocate the KV cache for (total number of tokens which the KV cache can hold) + when using PagedAttention, which is only supported on CUDA and is always automatically activated. + The priority is as follows: `pa_gpu_mem_usage` (default = 0.9) > `pa_ctxt_len` > `pa_gpu_mem`. - `pa_blk_size` sets the block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32. PagedAttention is only supported on CUDA and is always automatically activated. 
- `no_paged_attn` disables PagedAttention on CUDA diff --git a/mistralrs-pyo3/src/lib.rs b/mistralrs-pyo3/src/lib.rs index 8ff8ad95a9..c008c306d1 100644 --- a/mistralrs-pyo3/src/lib.rs +++ b/mistralrs-pyo3/src/lib.rs @@ -21,10 +21,10 @@ use candle_core::Device; use mistralrs_core::{ initialize_logging, paged_attn_supported, AnyMoeLoader, ChatCompletionResponse, CompletionResponse, Constraint, DefaultSchedulerMethod, DeviceLayerMapMetadata, - DeviceMapMetadata, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, Loader, MistralRs, - MistralRsBuilder, ModelDType, NormalLoaderBuilder, NormalRequest, NormalSpecificConfig, - PagedAttentionConfig, Request as _Request, RequestMessage, Response, SamplingParams, - SchedulerConfig, SpeculativeConfig, SpeculativeLoader, StopTokens, TokenSource, + DeviceMapMetadata, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, Loader, + MemoryGpuConfig, MistralRs, MistralRsBuilder, ModelDType, NormalLoaderBuilder, NormalRequest, + NormalSpecificConfig, PagedAttentionConfig, Request as _Request, RequestMessage, Response, + SamplingParams, SchedulerConfig, SpeculativeConfig, SpeculativeLoader, StopTokens, TokenSource, VisionLoaderBuilder, VisionSpecificConfig, }; use pyo3::{ @@ -300,6 +300,8 @@ impl Runner { in_situ_quant = None, anymoe_config = None, pa_gpu_mem = None, + pa_gpu_mem_usage = None, + pa_ctxt_len = None, pa_blk_size = None, no_paged_attn = false, ))] @@ -315,7 +317,9 @@ impl Runner { num_device_layers: Option>, in_situ_quant: Option, anymoe_config: Option, - pa_gpu_mem: Option>, + pa_gpu_mem: Option, + pa_gpu_mem_usage: Option, + pa_ctxt_len: Option, pa_blk_size: Option, no_paged_attn: bool, ) -> PyResult { @@ -428,22 +432,44 @@ impl Runner { // Allocate 0.5 GB of CPU memory just as a placeholder. // Nothing happens here as we have no `swap_out`, see `_preempt_by_swap`. 
- let cache_config = match ( - pa_blk_size, - pa_gpu_mem, - paged_attn_supported(), - no_paged_attn, - ) { - (block_size, None, true, false) => Some(PagedAttentionConfig::new( - block_size, - 512, - Either::Right(0.9), // NOTE(EricLBuehler): default is to use 90% of memory - )?), - (block_size, Some(either), true, false) => { - Some(PagedAttentionConfig::new(block_size, 512, either)?) - } - (_, _, _, _) => None, - }; + let cache_config = + match ( + pa_blk_size, + pa_gpu_mem, + pa_gpu_mem_usage, + pa_ctxt_len, + paged_attn_supported(), + no_paged_attn, + ) { + (block_size, None, None, None, true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Utilization(0.9), // NOTE(EricLBuehler): default is to use 90% of memory + )?), + (block_size, None, None, Some(ctxt), true, false) => Some( + PagedAttentionConfig::new(block_size, 512, MemoryGpuConfig::ContextSize(ctxt))?, + ), + (block_size, None, Some(f), None, true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Utilization(f), + )?), + (block_size, Some(m), None, None, true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Amount(m), + )?), + (block_size, Some(_m), Some(f), None, true, false) => Some( + PagedAttentionConfig::new(block_size, 512, MemoryGpuConfig::Utilization(f))?, + ), + (block_size, Some(_m), None, Some(ctxt), true, false) => Some( + PagedAttentionConfig::new(block_size, 512, MemoryGpuConfig::ContextSize(ctxt))?, + ), + (block_size, None, Some(f), Some(_ctxt), true, false) => Some( + PagedAttentionConfig::new(block_size, 512, MemoryGpuConfig::Utilization(f))?, + ), + (_, _, _, _, _, _) => None, + }; let pipeline = loader .load_model_from_hf( diff --git a/mistralrs-server/src/main.rs b/mistralrs-server/src/main.rs index 0354ed58a6..f17bd3ab5e 100644 --- a/mistralrs-server/src/main.rs +++ b/mistralrs-server/src/main.rs @@ -7,12 +7,11 @@ use axum::{ }; use candle_core::{quantized::GgmlDType, Device}; use 
clap::Parser; -use either::Either; use mistralrs_core::{ get_model_dtype, get_tgt_non_granular_index, initialize_logging, paged_attn_supported, DefaultSchedulerMethod, DeviceLayerMapMetadata, DeviceMapMetadata, Loader, LoaderBuilder, - MistralRs, MistralRsBuilder, ModelSelected, PagedAttentionConfig, Request, SchedulerConfig, - TokenSource, + MemoryGpuConfig, MistralRs, MistralRsBuilder, ModelSelected, PagedAttentionConfig, Request, + SchedulerConfig, TokenSource, }; use openai::{ChatCompletionRequest, Message, ModelObjects, StopTokens}; use serde::{Deserialize, Serialize}; @@ -123,17 +122,25 @@ struct Args { #[arg(long = "isq", value_parser = parse_isq)] in_situ_quant: Option, - /// GPU memory to allocate for KV cache with PagedAttention in MBs. If this is not set and the device is CUDA, it will default to - /// using `pa-gpu-mem-usage` set to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated. + /// GPU memory to allocate for KV cache with PagedAttention in MBs. + /// PagedAttention is only supported on CUDA and is always automatically activated. + /// The priority is as follows: `pa-gpu-mem-usage` (default = 0.9) > `pa-ctxt-len` > `pa-gpu-mem`. #[arg(long = "pa-gpu-mem")] paged_attn_gpu_mem: Option, /// Percentage of GPU memory to utilize after allocation of KV cache with PagedAttention, from 0 to 1. - /// If this is not set and the device is CUDA, it will default to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated. - /// This is always used over `pa-gpu-mem` if both are specified. + /// If this is not set and the device is CUDA, it will default to `0.9`. + /// PagedAttention is only supported on CUDA and is always automatically activated. + /// The priority is as follows: `pa-gpu-mem-usage` (default = 0.9) > `pa-ctxt-len` > `pa-gpu-mem`. 
#[arg(long = "pa-gpu-mem-usage")] paged_attn_gpu_mem_usage: Option, + /// Total context length to allocate the KV cache for (total number of tokens which the KV cache can hold) + /// when using PagedAttention, which is only supported on CUDA and is always automatically activated. + /// The priority is as follows: `pa-gpu-mem-usage` (default = 0.9) > `pa-ctxt-len` > `pa-gpu-mem`. + #[arg(long = "pa-ctxt-len")] + paged_ctxt_len: Option, + /// Block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32. /// PagedAttention is only supported on CUDA and is always automatically activated. #[arg(long = "pa-blk-size")] @@ -351,31 +358,55 @@ async fn main() -> Result<()> { args.paged_attn_block_size, args.paged_attn_gpu_mem, args.paged_attn_gpu_mem_usage, + args.paged_ctxt_len, paged_attn_supported(), args.no_paged_attn, ) { - (block_size, None, None, true, false) => Some(PagedAttentionConfig::new( + (block_size, None, None, None, true, false) => Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(0.9), // NOTE(EricLBuehler): default is to use 90% of memory + MemoryGpuConfig::Utilization(0.9), // NOTE(EricLBuehler): default is to use 90% of memory )?), - (block_size, Some(m), None, true, false) => { - Some(PagedAttentionConfig::new(block_size, 512, Either::Left(m))?) 
- } - (block_size, None, Some(f), true, false) => Some(PagedAttentionConfig::new( + (block_size, None, None, Some(ctxt), true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::ContextSize(ctxt), + )?), + (block_size, None, Some(f), None, true, false) => Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Utilization(f), + )?), + (block_size, Some(m), None, None, true, false) => Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(f), + MemoryGpuConfig::Amount(m), )?), - (block_size, Some(_m), Some(f), true, false) => { - info!("Both memory size and usage were specified, defaulting to the usage value."); + (block_size, Some(_m), Some(f), None, true, false) => { + info!("Both memory size, and usage were specified, defaulting to the usage value."); + Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::Utilization(f), + )?) + } + (block_size, Some(_m), None, Some(ctxt), true, false) => { + info!("All memory size and ctxt len, defaulting to the context len value."); + Some(PagedAttentionConfig::new( + block_size, + 512, + MemoryGpuConfig::ContextSize(ctxt), + )?) + } + (block_size, None, Some(f), Some(_ctxt), true, false) => { + info!("Both ctxt len and usage were specified, defaulting to the usage value."); Some(PagedAttentionConfig::new( block_size, 512, - Either::Right(f), + MemoryGpuConfig::Utilization(f), )?) 
} - (_, _, _, _, _) => None, + (_, _, _, _, _, _) => None, }; let pipeline = loader.load_model_from_hf( diff --git a/mistralrs/examples/paged_attn/main.rs b/mistralrs/examples/paged_attn/main.rs index a6fb014a19..5e8bdbc52a 100644 --- a/mistralrs/examples/paged_attn/main.rs +++ b/mistralrs/examples/paged_attn/main.rs @@ -10,8 +10,8 @@ use std::sync::Arc; use tokio::sync::mpsc::channel; use mistralrs::{ - Constraint, Device, DeviceMapMetadata, MistralRs, MistralRsBuilder, ModelDType, - NormalLoaderBuilder, NormalLoaderType, NormalRequest, NormalSpecificConfig, + Constraint, Device, DeviceMapMetadata, MemoryGpuConfig, MistralRs, MistralRsBuilder, + ModelDType, NormalLoaderBuilder, NormalLoaderType, NormalRequest, NormalSpecificConfig, PagedAttentionConfig, Request, RequestMessage, Response, Result, SamplingParams, SchedulerConfig, TokenSource, }; @@ -51,7 +51,7 @@ fn setup() -> anyhow::Result> { Some(PagedAttentionConfig::new( Some(32), 1024, - Either::Right(0.9), + MemoryGpuConfig::Utilization(0.9), )?), // Automatically determine memory usage )?; let config = pipeline