feat(models-http-api): add rate limit support for completion and chat #3468

Merged: 4 commits, Nov 26, 2024
Changes from 2 commits
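
The change puts a token-bucket style limiter from the `ratelimit` crate in front of every HTTP chat and completion engine: each engine is boxed, wrapped in a RateLimited* adapter, and every request must first take a token, retrying a bounded number of times before giving up. The sketch below is not part of the diff; it only illustrates the builder call and the try_wait/sleep retry loop the wrappers rely on, with `request_per_minute` as a hypothetical stand-in for the value read from HttpModelConfig.

use std::time::Duration;

use ratelimit::Ratelimiter;

#[tokio::main]
async fn main() {
    // Hypothetical value; in Tabby it comes from model.rate_limit.request_per_minute.
    let request_per_minute = 120;

    // Refill `request_per_minute` tokens every 60 seconds, capped at the same amount.
    let limiter = Ratelimiter::builder(request_per_minute, Duration::from_secs(60))
        .max_tokens(request_per_minute)
        .build()
        .expect("Failed to create ratelimiter, please check the rate limit configuration");

    for i in 0..3 {
        // Same bounded retry pattern as the wrappers: sleep for the suggested
        // duration and try again, at most five times per request.
        for _ in 0..5 {
            if let Err(sleep) = limiter.try_wait() {
                tokio::time::sleep(sleep).await;
                continue;
            }
            println!("request {i} allowed");
            break;
        }
    }
}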
17 changes: 14 additions & 3 deletions crates/http-api-bindings/src/chat/mod.rs
@@ -1,6 +1,9 @@
-use std::sync::Arc;
+mod rate_limit;
+
+use std::{sync::Arc, time::Duration};

 use async_openai::config::OpenAIConfig;
+use ratelimit::Ratelimiter;
 use tabby_common::config::HttpModelConfig;
 use tabby_inference::{ChatCompletionStream, ExtendedOpenAIConfig};
@@ -31,8 +34,16 @@

     let config = builder.build().expect("Failed to build config");

-    Arc::new(
+    let engine = Box::new(
         async_openai::Client::with_config(config)
             .with_http_client(create_reqwest_client(api_endpoint)),
-    )
+    );
+
+    let ratelimiter =
+        Ratelimiter::builder(model.rate_limit.request_per_minute, Duration::from_secs(60))
+            .max_tokens(model.rate_limit.request_per_minute)
+            .build()
+            .expect("Failed to create ratelimiter, please check the rate limit configuration");
+
+    Arc::new(rate_limit::RateLimitedChatStream::new(engine, ratelimiter))
 }
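
The wrapper reads its budget from `model.rate_limit.request_per_minute`. The `rate_limit` field of `HttpModelConfig` is defined in `tabby_common::config` and is not shown in this diff; a plausible shape, sketched here purely for orientation, would be:

// Hypothetical sketch of the configuration shape the code above assumes;
// the real definition lives in tabby_common::config and may differ.
#[derive(Debug, Clone)]
pub struct RateLimit {
    pub request_per_minute: u64,
}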
64 changes: 64 additions & 0 deletions crates/http-api-bindings/src/chat/rate_limit.rs
@@ -0,0 +1,64 @@
use async_openai::{
    error::{ApiError, OpenAIError},
    types::{
        ChatCompletionResponseStream, CreateChatCompletionRequest, CreateChatCompletionResponse,
    },
};
use async_trait::async_trait;
use ratelimit::Ratelimiter;
use tabby_inference::ChatCompletionStream;

pub struct RateLimitedChatStream {
    completion: Box<dyn ChatCompletionStream>,
    rate_limiter: Ratelimiter,
}

impl RateLimitedChatStream {
    pub fn new(completion: Box<dyn ChatCompletionStream>, rate_limiter: Ratelimiter) -> Self {
        Self {
            completion,
            rate_limiter,
        }
    }
}

#[async_trait]
impl ChatCompletionStream for RateLimitedChatStream {
    async fn chat(
        &self,
        request: CreateChatCompletionRequest,
    ) -> Result<CreateChatCompletionResponse, OpenAIError> {
        for _ in 0..5 {
            if let Err(sleep) = self.rate_limiter.try_wait() {
                tokio::time::sleep(sleep).await;
                continue;
            }

            return self.completion.chat(request).await;
        }

        Err(OpenAIError::ApiError(ApiError {
            message: "Rate limit exceeded for chat completion".to_owned(),
            r#type: None,
            param: None,
            code: None,
        }))
    }

    async fn chat_stream(
        &self,
        request: CreateChatCompletionRequest,
    ) -> Result<ChatCompletionResponseStream, OpenAIError> {
        for _ in 0..5 {
            if let Err(sleep) = self.rate_limiter.try_wait() {
                tokio::time::sleep(sleep).await;
                continue;
            }

            return self.completion.chat_stream(request).await;
        }

        // Return an empty stream if the rate limit is exceeded
        Ok(Box::pin(futures::stream::empty()))
    }
}
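
Note the asymmetry in the fallback paths: `chat` surfaces an `OpenAIError::ApiError` after five failed token acquisitions, while `chat_stream` (and the completion wrapper later in this diff) falls back to an empty stream, so callers simply see a stream that ends immediately. A minimal, standalone illustration of that fallback, not taken from the diff:

use futures::StreamExt;

#[tokio::main]
async fn main() {
    // What a consumer observes when the wrapper returns futures::stream::empty():
    // the stream completes immediately with no items and no error.
    let mut fallback = futures::stream::empty::<String>();
    assert!(fallback.next().await.is_none());
    println!("empty fallback stream yielded nothing");
}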
6 changes: 3 additions & 3 deletions crates/http-api-bindings/src/completion/llama.rs
@@ -14,14 +14,14 @@
 }

 impl LlamaCppEngine {
-    pub fn create(api_endpoint: &str, api_key: Option<String>) -> Self {
+    pub fn create(api_endpoint: &str, api_key: Option<String>) -> Box<dyn CompletionStream> {
         let client = create_reqwest_client(api_endpoint);

-        Self {
+        Box::new(Self {
             client,
             api_endpoint: format!("{}/completion", api_endpoint),
             api_key,
-        }
+        })
     }
 }
6 changes: 3 additions & 3 deletions crates/http-api-bindings/src/completion/mistral.rs
@@ -21,20 +21,20 @@
         api_endpoint: Option<&str>,
         api_key: Option<String>,
         model_name: Option<String>,
-    ) -> Self {
+    ) -> Box<dyn CompletionStream> {
         let client = reqwest::Client::new();
         let model_name = model_name.unwrap_or("codestral-latest".into());
         let api_key = api_key.expect("API key is required for mistral/completion");

-        Self {
+        Box::new(Self {
             client,
             model_name,
             api_endpoint: format!(
                 "{}/v1/fim/completions",
                 api_endpoint.unwrap_or(DEFAULT_API_ENDPOINT)
             ),
             api_key,
-        }
+        })
     }
 }
88 changes: 43 additions & 45 deletions crates/http-api-bindings/src/completion/mod.rs
@@ -1,65 +1,63 @@
 mod llama;
 mod mistral;
 mod openai;
+mod rate_limit;

-use std::sync::Arc;
+use std::{sync::Arc, time::Duration};

 use llama::LlamaCppEngine;
 use mistral::MistralFIMEngine;
 use openai::OpenAICompletionEngine;
+use ratelimit::Ratelimiter;
 use tabby_common::config::HttpModelConfig;
 use tabby_inference::CompletionStream;

 pub async fn create(model: &HttpModelConfig) -> Arc<dyn CompletionStream> {
-    match model.kind.as_str() {
-        "llama.cpp/completion" => {
-            let engine = LlamaCppEngine::create(
-                model
-                    .api_endpoint
-                    .as_deref()
-                    .expect("api_endpoint is required"),
-                model.api_key.clone(),
-            );
-            Arc::new(engine)
-        }
+    let engine = match model.kind.as_str() {
+        "llama.cpp/completion" => LlamaCppEngine::create(
+            model
+                .api_endpoint
+                .as_deref()
+                .expect("api_endpoint is required"),
+            model.api_key.clone(),
+        ),
         "ollama/completion" => ollama_api_bindings::create_completion(model).await,
-        "mistral/completion" => {
-            let engine = MistralFIMEngine::create(
-                model.api_endpoint.as_deref(),
-                model.api_key.clone(),
-                model.model_name.clone(),
-            );
-            Arc::new(engine)
-        }
-        x if OPENAI_LEGACY_COMPLETION_FIM_ALIASES.contains(&x) => {
-            let engine = OpenAICompletionEngine::create(
-                model.model_name.clone(),
-                model
-                    .api_endpoint
-                    .as_deref()
-                    .expect("api_endpoint is required"),
-                model.api_key.clone(),
-                true,
-            );
-            Arc::new(engine)
-        }
-        "openai/legacy_completion_no_fim" | "vllm/completion" => {
-            let engine = OpenAICompletionEngine::create(
-                model.model_name.clone(),
-                model
-                    .api_endpoint
-                    .as_deref()
-                    .expect("api_endpoint is required"),
-                model.api_key.clone(),
-                false,
-            );
-            Arc::new(engine)
-        }
+        "mistral/completion" => MistralFIMEngine::create(
+            model.api_endpoint.as_deref(),
+            model.api_key.clone(),
+            model.model_name.clone(),
+        ),
+        x if OPENAI_LEGACY_COMPLETION_FIM_ALIASES.contains(&x) => OpenAICompletionEngine::create(
+            model.model_name.clone(),
+            model
+                .api_endpoint
+                .as_deref()
+                .expect("api_endpoint is required"),
+            model.api_key.clone(),
+            true,
+        ),
+        "openai/legacy_completion_no_fim" | "vllm/completion" => OpenAICompletionEngine::create(
+            model.model_name.clone(),
+            model
+                .api_endpoint
+                .as_deref()
+                .expect("api_endpoint is required"),
+            model.api_key.clone(),
+            false,
+        ),
         unsupported_kind => panic!(
             "Unsupported model kind for http completion: {}",
             unsupported_kind
         ),
-    }
+    };
+
+    let ratelimiter =
+        Ratelimiter::builder(model.rate_limit.request_per_minute, Duration::from_secs(60))
+            .max_tokens(model.rate_limit.request_per_minute)
+            .build()
+            .expect("Failed to create ratelimiter, please check the rate limit configuration");
+
+    Arc::new(rate_limit::RateLimitedCompletion::new(engine, ratelimiter))
 }

 const FIM_TOKEN: &str = "<|FIM|>";
6 changes: 3 additions & 3 deletions crates/http-api-bindings/src/completion/openai.rs
@@ -25,17 +25,17 @@
         api_endpoint: &str,
         api_key: Option<String>,
         support_fim: bool,
-    ) -> Self {
+    ) -> Box<dyn CompletionStream> {
         let model_name = model_name.expect("model_name is required for openai/completion");
         let client = reqwest::Client::new();

-        Self {
+        Box::new(Self {
             client,
             model_name,
             api_endpoint: format!("{}/completions", api_endpoint),
             api_key,
             support_fim,
-        }
+        })
     }
 }
35 changes: 35 additions & 0 deletions crates/http-api-bindings/src/completion/rate_limit.rs
@@ -0,0 +1,35 @@
use async_trait::async_trait;
use futures::stream::BoxStream;
use ratelimit::Ratelimiter;
use tabby_inference::{CompletionOptions, CompletionStream};

pub struct RateLimitedCompletion {
    completion: Box<dyn CompletionStream>,
    rate_limiter: Ratelimiter,
}

impl RateLimitedCompletion {
    pub fn new(completion: Box<dyn CompletionStream>, rate_limiter: Ratelimiter) -> Self {
        Self {
            completion,
            rate_limiter,
        }
    }
}

#[async_trait]
impl CompletionStream for RateLimitedCompletion {
    async fn generate(&self, prompt: &str, options: CompletionOptions) -> BoxStream<String> {
        for _ in 0..5 {
            if let Err(sleep) = self.rate_limiter.try_wait() {
                tokio::time::sleep(sleep).await;
                continue;
            }

            return self.completion.generate(prompt, options).await;
        }

        // Return an empty stream if the rate limit is exceeded
        Box::pin(futures::stream::empty())
    }
}
6 changes: 2 additions & 4 deletions crates/ollama-api-bindings/src/completion.rs
@@ -1,5 +1,3 @@
-use std::sync::Arc;
-
 use async_stream::stream;
 use async_trait::async_trait;
 use futures::{stream::BoxStream, StreamExt};

@@ -58,11 +56,11 @@
     }
 }

-pub async fn create(config: &HttpModelConfig) -> Arc<dyn CompletionStream> {
+pub async fn create(config: &HttpModelConfig) -> Box<dyn CompletionStream> {
     let connection = Ollama::try_new(config.api_endpoint.as_deref().unwrap().to_owned())
         .expect("Failed to create connection to Ollama, URL invalid");

     let model = connection.select_model_or_default(config).await.unwrap();

-    Arc::new(OllamaCompletion { connection, model })
+    Box::new(OllamaCompletion { connection, model })
 }