Skip to content

Commit

Permalink
refactor(config): increase rate limit to 1200, set 6000 for llama-cpp-server based config (#3482)

Browse files Browse the repository at this point in the history
  • Loading branch information
wsxiaoys committed Nov 28, 2024
1 parent 7ee0092 commit 05b31d2
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 16 deletions.
27 changes: 14 additions & 13 deletions crates/http-api-bindings/src/rate_limit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,23 @@ use futures::stream::BoxStream;
use ratelimit::Ratelimiter;
use tabby_inference::{ChatCompletionStream, CompletionOptions, CompletionStream, Embedding};

fn new_rate_limiter(rpm: u64) -> anyhow::Result<Ratelimiter> {
fn new_rate_limiter(rpm: u64) -> Ratelimiter {
Ratelimiter::builder(rpm, Duration::from_secs(60))
.max_tokens(rpm)
.initial_available(rpm)
.build()
.map_err(|e| {
anyhow::anyhow!(
"Failed to create ratelimiter, please check the rate limit configuration: {}",
e,
)
})
.expect("Failed to create RateLimiter, please check the HttpModelConfig.rate_limit configuration")
}

pub struct RateLimitedEmbedding {
embedding: Box<dyn Embedding>,
rate_limiter: Ratelimiter,
}

pub fn new_embedding(embedding: Box<dyn Embedding>, rpm: u64) -> impl Embedding {
pub fn new_embedding(embedding: Box<dyn Embedding>, request_per_minute: u64) -> impl Embedding {
RateLimitedEmbedding {
embedding,
rate_limiter: new_rate_limiter(rpm).unwrap(),
rate_limiter: new_rate_limiter(request_per_minute),
}
}

Expand All @@ -57,10 +52,13 @@ pub struct RateLimitedCompletion {
rate_limiter: Ratelimiter,
}

pub fn new_completion(completion: Box<dyn CompletionStream>, rpm: u64) -> impl CompletionStream {
pub fn new_completion(
completion: Box<dyn CompletionStream>,
request_per_minute: u64,
) -> impl CompletionStream {
RateLimitedCompletion {
completion,
rate_limiter: new_rate_limiter(rpm).unwrap(),
rate_limiter: new_rate_limiter(request_per_minute),
}
}

Expand All @@ -86,10 +84,13 @@ pub struct RateLimitedChatStream {
rate_limiter: Ratelimiter,
}

pub fn new_chat(completion: Box<dyn ChatCompletionStream>, rpm: u64) -> impl ChatCompletionStream {
pub fn new_chat(
completion: Box<dyn ChatCompletionStream>,
request_per_minute: u64,
) -> impl ChatCompletionStream {
RateLimitedChatStream {
completion,
rate_limiter: new_rate_limiter(rpm).unwrap(),
rate_limiter: new_rate_limiter(request_per_minute),
}
}

Expand Down
12 changes: 11 additions & 1 deletion crates/llama-cpp-server/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use futures::stream::BoxStream;
use serde::Deserialize;
use supervisor::LlamaCppSupervisor;
use tabby_common::{
config::{HttpModelConfigBuilder, LocalModelConfig, ModelConfig},
config::{HttpModelConfigBuilder, LocalModelConfig, ModelConfig, RateLimit, RateLimitBuilder},
registry::{parse_model_id, ModelRegistry, GGML_MODEL_PARTITIONED_PREFIX},
};
use tabby_inference::{ChatCompletionStream, CompletionOptions, CompletionStream, Embedding};
Expand Down Expand Up @@ -46,6 +46,7 @@ impl EmbeddingServer {

let config = HttpModelConfigBuilder::default()
.api_endpoint(Some(api_endpoint(server.port())))
.rate_limit(build_rate_limit_config())
.kind("llama.cpp/embedding".to_string())
.build()
.expect("Failed to create HttpModelConfig");
Expand Down Expand Up @@ -95,6 +96,7 @@ impl CompletionServer {
async fn new_with_supervisor(server: Arc<LlamaCppSupervisor>) -> Self {
let config = HttpModelConfigBuilder::default()
.api_endpoint(Some(api_endpoint(server.port())))
.rate_limit(build_rate_limit_config())
.kind("llama.cpp/completion".to_string())
.build()
.expect("Failed to create HttpModelConfig");
Expand Down Expand Up @@ -142,6 +144,7 @@ impl ChatCompletionServer {
async fn new_with_supervisor(server: Arc<LlamaCppSupervisor>) -> Self {
let config = HttpModelConfigBuilder::default()
.api_endpoint(Some(api_endpoint(server.port())))
.rate_limit(build_rate_limit_config())
.kind("openai/chat".to_string())
.model_name(Some("local".into()))
.build()
Expand Down Expand Up @@ -320,3 +323,10 @@ async fn resolve_prompt_info(model_id: &str) -> PromptInfo {
}
}
}

/// Rate-limit configuration shared by the llama.cpp-server based model
/// configs in this crate: 6000 requests per minute.
fn build_rate_limit_config() -> RateLimit {
    let limit = RateLimitBuilder::default()
        .request_per_minute(6000)
        .build();
    limit.expect("Failed to create RateLimit")
}
3 changes: 1 addition & 2 deletions crates/tabby-common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ pub struct HttpModelConfig {
#[builder(default)]
pub api_key: Option<String>,

#[builder(default)]
#[serde(default)]
pub rate_limit: RateLimit,

Expand Down Expand Up @@ -354,7 +353,7 @@ pub struct RateLimit {
impl Default for RateLimit {
    /// Default budget used when no `rate_limit` is configured: 1200
    /// requests per minute (raised from 600 in this commit).
    fn default() -> Self {
        Self {
            request_per_minute: 1200,
        }
    }
}
Expand Down

0 comments on commit 05b31d2

Please sign in to comment.