Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(models-http-api): add rate limit support for completion and chat #3468

Merged
Merged 4 commits on Nov 26, 2024 (source and target branch names were not captured)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions crates/http-api-bindings/src/chat/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
use tabby_common::config::HttpModelConfig;
use tabby_inference::{ChatCompletionStream, ExtendedOpenAIConfig};

use super::rate_limit;
use crate::create_reqwest_client;

pub async fn create(model: &HttpModelConfig) -> Arc<dyn ChatCompletionStream> {
Expand All @@ -16,6 +17,7 @@
.with_api_key(model.api_key.clone().unwrap_or_default());

let mut builder = ExtendedOpenAIConfig::builder();

Check warning on line 20 in crates/http-api-bindings/src/chat/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/chat/mod.rs#L20

Added line #L20 was not covered by tests
builder
.base(config)
.supported_models(model.supported_models.clone())
Expand All @@ -31,8 +33,13 @@

let config = builder.build().expect("Failed to build config");

Arc::new(
let engine = Box::new(

Check warning on line 36 in crates/http-api-bindings/src/chat/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/chat/mod.rs#L36

Added line #L36 was not covered by tests
async_openai::Client::with_config(config)
.with_http_client(create_reqwest_client(api_endpoint)),
)
);

Arc::new(rate_limit::new_chat(
engine,
model.rate_limit.request_per_minute,
))

Check warning on line 44 in crates/http-api-bindings/src/chat/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/chat/mod.rs#L39-L44

Added lines #L39 - L44 were not covered by tests
}
6 changes: 3 additions & 3 deletions crates/http-api-bindings/src/completion/llama.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@
}

impl LlamaCppEngine {
pub fn create(api_endpoint: &str, api_key: Option<String>) -> Self {
pub fn create(api_endpoint: &str, api_key: Option<String>) -> Box<dyn CompletionStream> {

Check warning on line 17 in crates/http-api-bindings/src/completion/llama.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/llama.rs#L17

Added line #L17 was not covered by tests
let client = create_reqwest_client(api_endpoint);

Self {
Box::new(Self {

Check warning on line 20 in crates/http-api-bindings/src/completion/llama.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/llama.rs#L20

Added line #L20 was not covered by tests
client,
api_endpoint: format!("{}/completion", api_endpoint),
api_key,
}
})

Check warning on line 24 in crates/http-api-bindings/src/completion/llama.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/llama.rs#L24

Added line #L24 was not covered by tests
}
}

Expand Down
6 changes: 3 additions & 3 deletions crates/http-api-bindings/src/completion/mistral.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,20 @@
api_endpoint: Option<&str>,
api_key: Option<String>,
model_name: Option<String>,
) -> Self {
) -> Box<dyn CompletionStream> {

Check warning on line 24 in crates/http-api-bindings/src/completion/mistral.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/mistral.rs#L24

Added line #L24 was not covered by tests
let client = reqwest::Client::new();
let model_name = model_name.unwrap_or("codestral-latest".into());
let api_key = api_key.expect("API key is required for mistral/completion");

Self {
Box::new(Self {

Check warning on line 29 in crates/http-api-bindings/src/completion/mistral.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/mistral.rs#L29

Added line #L29 was not covered by tests
client,
model_name,
api_endpoint: format!(
"{}/v1/fim/completions",
api_endpoint.unwrap_or(DEFAULT_API_ENDPOINT)
),
api_key,
}
})

Check warning on line 37 in crates/http-api-bindings/src/completion/mistral.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/mistral.rs#L37

Added line #L37 was not covered by tests
}
}

Expand Down
83 changes: 39 additions & 44 deletions crates/http-api-bindings/src/completion/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,56 +10,51 @@
use tabby_common::config::HttpModelConfig;
use tabby_inference::CompletionStream;

use super::rate_limit;

pub async fn create(model: &HttpModelConfig) -> Arc<dyn CompletionStream> {
match model.kind.as_str() {
"llama.cpp/completion" => {
let engine = LlamaCppEngine::create(
model
.api_endpoint
.as_deref()
.expect("api_endpoint is required"),
model.api_key.clone(),
);
Arc::new(engine)
}
let engine = match model.kind.as_str() {
"llama.cpp/completion" => LlamaCppEngine::create(
model
.api_endpoint
.as_deref()
.expect("api_endpoint is required"),
model.api_key.clone(),
),

Check warning on line 23 in crates/http-api-bindings/src/completion/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/mod.rs#L16-L23

Added lines #L16 - L23 were not covered by tests
"ollama/completion" => ollama_api_bindings::create_completion(model).await,
"mistral/completion" => {
let engine = MistralFIMEngine::create(
model.api_endpoint.as_deref(),
model.api_key.clone(),
model.model_name.clone(),
);
Arc::new(engine)
}
x if OPENAI_LEGACY_COMPLETION_FIM_ALIASES.contains(&x) => {
let engine = OpenAICompletionEngine::create(
model.model_name.clone(),
model
.api_endpoint
.as_deref()
.expect("api_endpoint is required"),
model.api_key.clone(),
true,
);
Arc::new(engine)
}
"openai/legacy_completion_no_fim" | "vllm/completion" => {
let engine = OpenAICompletionEngine::create(
model.model_name.clone(),
model
.api_endpoint
.as_deref()
.expect("api_endpoint is required"),
model.api_key.clone(),
false,
);
Arc::new(engine)
}
"mistral/completion" => MistralFIMEngine::create(
model.api_endpoint.as_deref(),
model.api_key.clone(),
model.model_name.clone(),
),
x if OPENAI_LEGACY_COMPLETION_FIM_ALIASES.contains(&x) => OpenAICompletionEngine::create(
model.model_name.clone(),
model
.api_endpoint
.as_deref()
.expect("api_endpoint is required"),
model.api_key.clone(),
true,
),
"openai/legacy_completion_no_fim" | "vllm/completion" => OpenAICompletionEngine::create(
model.model_name.clone(),
model
.api_endpoint
.as_deref()
.expect("api_endpoint is required"),
model.api_key.clone(),
false,
),

Check warning on line 47 in crates/http-api-bindings/src/completion/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/mod.rs#L25-L47

Added lines #L25 - L47 were not covered by tests
unsupported_kind => panic!(
"Unsupported model kind for http completion: {}",
unsupported_kind
),
}
};

Arc::new(rate_limit::new_completion(
engine,
model.rate_limit.request_per_minute,
))

Check warning on line 57 in crates/http-api-bindings/src/completion/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/mod.rs#L54-L57

Added lines #L54 - L57 were not covered by tests
}

const FIM_TOKEN: &str = "<|FIM|>";
Expand Down
6 changes: 3 additions & 3 deletions crates/http-api-bindings/src/completion/openai.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,17 @@
api_endpoint: &str,
api_key: Option<String>,
support_fim: bool,
) -> Self {
) -> Box<dyn CompletionStream> {

Check warning on line 28 in crates/http-api-bindings/src/completion/openai.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/openai.rs#L28

Added line #L28 was not covered by tests
let model_name = model_name.expect("model_name is required for openai/completion");
let client = reqwest::Client::new();

Self {
Box::new(Self {

Check warning on line 32 in crates/http-api-bindings/src/completion/openai.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/openai.rs#L32

Added line #L32 was not covered by tests
client,
model_name,
api_endpoint: format!("{}/completions", api_endpoint),
api_key,
support_fim,
}
})

Check warning on line 38 in crates/http-api-bindings/src/completion/openai.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/completion/openai.rs#L38

Added line #L38 was not covered by tests
}
}

Expand Down
16 changes: 5 additions & 11 deletions crates/http-api-bindings/src/embedding/mod.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
mod llama;
mod openai;
mod rate_limit;
mod voyage;

use core::panic;
use std::{sync::Arc, time::Duration};
use std::sync::Arc;

use llama::LlamaCppEngine;
use ratelimit::Ratelimiter;
use tabby_common::config::HttpModelConfig;
use tabby_inference::Embedding;

use self::{openai::OpenAIEmbeddingEngine, voyage::VoyageEmbeddingEngine};
use super::rate_limit;

pub async fn create(config: &HttpModelConfig) -> Arc<dyn Embedding> {
let engine = match config.kind.as_str() {
Expand Down Expand Up @@ -48,13 +47,8 @@
),
};

let ratelimiter = Ratelimiter::builder(
Arc::new(rate_limit::new_embedding(
engine,

Check warning on line 51 in crates/http-api-bindings/src/embedding/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/embedding/mod.rs#L50-L51

Added lines #L50 - L51 were not covered by tests
config.rate_limit.request_per_minute,
Duration::from_secs(60),
)
.max_tokens(config.rate_limit.request_per_minute)
.build()
.expect("Failed to create ratelimiter, please check the rate limit configuration");

Arc::new(rate_limit::RateLimitedEmbedding::new(engine, ratelimiter))
))

Check warning on line 53 in crates/http-api-bindings/src/embedding/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/http-api-bindings/src/embedding/mod.rs#L53

Added line #L53 was not covered by tests
}
33 changes: 0 additions & 33 deletions crates/http-api-bindings/src/embedding/rate_limit.rs

This file was deleted.

1 change: 1 addition & 0 deletions crates/http-api-bindings/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod chat;
mod completion;
mod embedding;
mod rate_limit;

pub use chat::create as create_chat;
pub use completion::{build_completion_prompt, create};
Expand Down
Loading
Loading