diff --git a/crates/http-api-bindings/src/rate_limit.rs b/crates/http-api-bindings/src/rate_limit.rs index c9cc0e269adc..53f5bea8cc62 100644 --- a/crates/http-api-bindings/src/rate_limit.rs +++ b/crates/http-api-bindings/src/rate_limit.rs @@ -11,17 +11,12 @@ use futures::stream::BoxStream; use ratelimit::Ratelimiter; use tabby_inference::{ChatCompletionStream, CompletionOptions, CompletionStream, Embedding}; -fn new_rate_limiter(rpm: u64) -> anyhow::Result { +fn new_rate_limiter(rpm: u64) -> Ratelimiter { Ratelimiter::builder(rpm, Duration::from_secs(60)) .max_tokens(rpm) .initial_available(rpm) .build() - .map_err(|e| { - anyhow::anyhow!( - "Failed to create ratelimiter, please check the rate limit configuration: {}", - e, - ) - }) + .expect("Failed to create RateLimiter, please check the HttpModelConfig.rate_limit configuration") } pub struct RateLimitedEmbedding { @@ -29,10 +24,10 @@ pub struct RateLimitedEmbedding { rate_limiter: Ratelimiter, } -pub fn new_embedding(embedding: Box, rpm: u64) -> impl Embedding { +pub fn new_embedding(embedding: Box, request_per_minute: u64) -> impl Embedding { RateLimitedEmbedding { embedding, - rate_limiter: new_rate_limiter(rpm).unwrap(), + rate_limiter: new_rate_limiter(request_per_minute), } } @@ -57,10 +52,13 @@ pub struct RateLimitedCompletion { rate_limiter: Ratelimiter, } -pub fn new_completion(completion: Box, rpm: u64) -> impl CompletionStream { +pub fn new_completion( + completion: Box, + request_per_minute: u64, +) -> impl CompletionStream { RateLimitedCompletion { completion, - rate_limiter: new_rate_limiter(rpm).unwrap(), + rate_limiter: new_rate_limiter(request_per_minute), } } @@ -86,10 +84,13 @@ pub struct RateLimitedChatStream { rate_limiter: Ratelimiter, } -pub fn new_chat(completion: Box, rpm: u64) -> impl ChatCompletionStream { +pub fn new_chat( + completion: Box, + request_per_minute: u64, +) -> impl ChatCompletionStream { RateLimitedChatStream { completion, - rate_limiter: new_rate_limiter(rpm).unwrap(), + rate_limiter: new_rate_limiter(request_per_minute), } } diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs index ef1ba84804d4..a7c0a30b71cb 100644 --- a/crates/llama-cpp-server/src/lib.rs +++ b/crates/llama-cpp-server/src/lib.rs @@ -9,7 +9,7 @@ use futures::stream::BoxStream; use serde::Deserialize; use supervisor::LlamaCppSupervisor; use tabby_common::{ - config::{HttpModelConfigBuilder, LocalModelConfig, ModelConfig}, + config::{HttpModelConfigBuilder, LocalModelConfig, ModelConfig, RateLimit, RateLimitBuilder}, registry::{parse_model_id, ModelRegistry, GGML_MODEL_PARTITIONED_PREFIX}, }; use tabby_inference::{ChatCompletionStream, CompletionOptions, CompletionStream, Embedding}; @@ -46,6 +46,7 @@ impl EmbeddingServer { let config = HttpModelConfigBuilder::default() .api_endpoint(Some(api_endpoint(server.port()))) + .rate_limit(build_rate_limit_config()) .kind("llama.cpp/embedding".to_string()) .build() .expect("Failed to create HttpModelConfig"); @@ -95,6 +96,7 @@ impl CompletionServer { async fn new_with_supervisor(server: Arc) -> Self { let config = HttpModelConfigBuilder::default() .api_endpoint(Some(api_endpoint(server.port()))) + .rate_limit(build_rate_limit_config()) .kind("llama.cpp/completion".to_string()) .build() .expect("Failed to create HttpModelConfig"); @@ -142,6 +144,7 @@ impl ChatCompletionServer { async fn new_with_supervisor(server: Arc) -> Self { let config = HttpModelConfigBuilder::default() .api_endpoint(Some(api_endpoint(server.port()))) + .rate_limit(build_rate_limit_config()) .kind("openai/chat".to_string()) .model_name(Some("local".into())) .build() @@ -320,3 +323,10 @@ async fn resolve_prompt_info(model_id: &str) -> PromptInfo { } } } + +fn build_rate_limit_config() -> RateLimit { + RateLimitBuilder::default() + .request_per_minute(6000) + .build() + .expect("Failed to create RateLimit") +} diff --git a/crates/tabby-common/src/config.rs b/crates/tabby-common/src/config.rs index 8c1f1897b6cd..019e9233ffab 100644 --- a/crates/tabby-common/src/config.rs +++ b/crates/tabby-common/src/config.rs @@ -289,7 +289,6 @@ pub struct HttpModelConfig { #[builder(default)] pub api_key: Option, - #[builder(default)] #[serde(default)] pub rate_limit: RateLimit, @@ -354,7 +353,7 @@ pub struct RateLimit { impl Default for RateLimit { fn default() -> Self { Self { - request_per_minute: 600, + request_per_minute: 1200, } } }