
Configurable kvcache & fix repeat chat history #41

Merged
merged 8 commits on Jun 20, 2024
41 changes: 30 additions & 11 deletions src/main.rs
@@ -56,6 +56,14 @@ struct Args {

#[arg(long, default_value_t = false)]
cpu: bool,

/// Available GPU memory for kvcache (MB)
#[arg(long, default_value_t = 4096)]
kvcache_mem: usize,

/// Record conversation on the server (default false; when disabled, the client needs to keep the chat history)
#[arg(long)]
record_conversation: bool,
}

#[actix_web::main]
@@ -72,33 +80,44 @@ async fn main() -> Result<(), APIError> {
_ => loader.download_model(model_id, None, args.hf_token, args.hf_token_path)?,
};

let dtype = match args.dtype.as_deref() {
Some("f16") => DType::F16,
Some("bf16") => DType::BF16,
Some("f32") => DType::F32,
let (dtype, dsize) = match args.dtype.as_deref() {
Some("f16") => (DType::F16, 2),
Some("bf16") => (DType::BF16, 2),
Some("f32") => (DType::F32, 4),
Some(dtype) => panic!("Unsupported dtype {dtype}"),
None => DType::BF16,
None => (DType::BF16, 2),
};

let device = candle_examples::device(args.cpu).unwrap();
let model = loader.load_model(paths, dtype, device)?;
let config = model.0.get_model_config();
let num_gpu_blocks = args.kvcache_mem * 1024 * 1024
/ dsize
/ args.block_size
/ config.get_num_kv_heads()
/ config.get_head_size()
/ config.get_num_hidden_layers()
/ 2;
let cache_config = CacheConfig {
block_size: args.block_size,
num_gpu_blocks: Some(num_gpu_blocks),
num_cpu_blocks: Some(32),
fully_init: true,
};
println!("Cache config {:?}", cache_config);

let llm_engine = LLMEngine::new(
model.0,
SchedulerConfig {
max_num_seqs: args.max_num_seqs,
},
CacheConfig {
block_size: args.block_size,
num_gpu_blocks: Some(64),
num_cpu_blocks: Some(64),
fully_init: true,
},
cache_config,
)?;

let server_data = OpenAIServerData {
pipeline_config: model.1,
model: Arc::new(Mutex::new(llm_engine)),
record_conversation: args.record_conversation,
device: Device::Cpu,
};

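The new `kvcache_mem` argument replaces the previously hard-coded `num_gpu_blocks: Some(64)` with a value derived from the memory budget, the dtype width (`dsize`), and the model shape. Below is a minimal standalone sketch of that formula; the concrete numbers (4096 MB budget, f16 at 2 bytes per element, block size 16, 32 KV heads, head size 128, 32 layers) are illustrative assumptions standing in for the real `args` and `config` accessors.

```rust
/// Sketch of the block-count formula from this PR, with the model shape
/// passed in explicitly instead of read from `ConfigLike`.
fn num_gpu_blocks(
    kvcache_mem_mb: usize,
    dsize: usize,       // bytes per element: 2 for f16/bf16, 4 for f32
    block_size: usize,  // tokens per cache block
    num_kv_heads: usize,
    head_size: usize,
    num_layers: usize,
) -> usize {
    // Each block stores `block_size` tokens; every token needs
    // num_kv_heads * head_size elements of `dsize` bytes per layer,
    // and the trailing factor of 2 accounts for separate K and V caches.
    kvcache_mem_mb * 1024 * 1024
        / dsize
        / block_size
        / num_kv_heads
        / head_size
        / num_layers
        / 2
}

fn main() {
    // Assumed shape: 4096 MB, f16, block size 16, 32 KV heads,
    // head size 128, 32 layers -> 512 blocks = 8192 cacheable token slots.
    let blocks = num_gpu_blocks(4096, 2, 16, 32, 128, 32);
    println!("num_gpu_blocks = {blocks}");
}
```

Since every step is integer division, the computed count can only round down, so it never exceeds the requested memory budget.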
3 changes: 3 additions & 0 deletions src/openai/conversation/default_conversation.rs
@@ -112,6 +112,9 @@ impl Conversation for DefaultConversation {
&self.roles
}

fn clear_message(&mut self) {
self.messages.clear()
}
/// Convert this conversation to a String prompt
fn get_prompt(&mut self) -> String {
let system_prompt = self.system_template.format(&[self.system_message.clone()]);
2 changes: 2 additions & 0 deletions src/openai/conversation/mod.rs
@@ -13,4 +13,6 @@ pub trait Conversation {
fn get_roles(&self) -> &(String, String);

fn get_prompt(&mut self) -> String;

fn clear_message(&mut self);
}
1 change: 1 addition & 0 deletions src/openai/mod.rs
@@ -40,6 +40,7 @@ pub struct PipelineConfig {
pub struct OpenAIServerData<'s> {
pub model: Arc<Mutex<LLMEngine<'s>>>,
pub pipeline_config: PipelineConfig,
pub record_conversation: bool,
pub device: Device,
}

5 changes: 4 additions & 1 deletion src/openai/openai_server.rs
@@ -32,7 +32,9 @@ async fn get_gen_prompt(
request: &web::Json<ChatCompletionRequest>,
) -> Result<String, APIError> {
let mut model = data.model.lock().unwrap();
let conversation = model.get_mut_pipeline().get_conversation();
let conversation = model
.get_mut_pipeline()
.get_conversation(data.record_conversation);

match &request.messages {
Messages::Literal(msg) => {
@@ -129,6 +131,7 @@
return Either::Left(Err(prompt.err().unwrap()));
}
let prompt = prompt.unwrap();
println!("\n\n\nPrompt {:?}", prompt);

let token_ids = check_length(&request, prompt, &data);
if token_ids.is_err() {
5 changes: 4 additions & 1 deletion src/openai/pipelines/llama.rs
@@ -300,7 +300,10 @@ impl<'s> ModulePipeline<'s> for LlamaPipeline {
&self.tokenizer
}

fn get_conversation(&mut self) -> &mut dyn Conversation {
fn get_conversation(&mut self, with_history: bool) -> &mut dyn Conversation {
if !with_history {
self.conversation.clear_message();
}
&mut self.conversation
}

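The repeated-history fix hinges on the new `with_history` parameter: when the server runs without conversation recording, the pipeline clears its stored messages before the next prompt is built, so the prompt contains only what the client just sent. The sketch below illustrates that pattern with toy types; `ToyConversation` and `ToyPipeline` are illustrative stand-ins, not the crate's real structs.

```rust
trait Conversation {
    fn append_message(&mut self, role: String, content: String);
    fn clear_message(&mut self);
    fn get_prompt(&mut self) -> String;
}

struct ToyConversation {
    messages: Vec<(String, String)>,
}

impl Conversation for ToyConversation {
    fn append_message(&mut self, role: String, content: String) {
        self.messages.push((role, content));
    }
    fn clear_message(&mut self) {
        self.messages.clear();
    }
    fn get_prompt(&mut self) -> String {
        self.messages
            .iter()
            .map(|(role, content)| format!("{role}: {content}\n"))
            .collect()
    }
}

struct ToyPipeline {
    conversation: ToyConversation,
}

impl ToyPipeline {
    // Mirrors the PR: without history recording, drop whatever the server
    // accumulated so the prompt is rebuilt solely from the incoming request.
    fn get_conversation(&mut self, with_history: bool) -> &mut dyn Conversation {
        if !with_history {
            self.conversation.clear_message();
        }
        &mut self.conversation
    }
}

fn main() {
    let mut pipeline = ToyPipeline {
        conversation: ToyConversation { messages: Vec::new() },
    };

    // First request.
    let conv = pipeline.get_conversation(false);
    conv.append_message("user".into(), "Hello".into());
    println!("--- prompt 1 ---\n{}", conv.get_prompt());

    // Second request: the client resends the full history, and the server's
    // old copy has been cleared, so nothing is duplicated in the prompt.
    let conv = pipeline.get_conversation(false);
    conv.append_message("user".into(), "Hello".into());
    conv.append_message("assistant".into(), "Hi!".into());
    conv.append_message("user".into(), "How are you?".into());
    println!("--- prompt 2 ---\n{}", conv.get_prompt());
}
```

Note that `llm_engine.rs` passes `true` when it calls `get_conversation` only to read the role names, presumably so that fetching metadata does not wipe the stored messages when recording is enabled.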
2 changes: 1 addition & 1 deletion src/openai/pipelines/llm_engine.rs
@@ -181,7 +181,7 @@ impl<'a> LLMEngine<'a> {
.map_err(APIError::from)?;
let choice = ChatChoice {
message: ChatChoiceData {
role: self.pipeline.get_conversation().get_roles().0.clone(),
role: self.pipeline.get_conversation(true).get_roles().0.clone(),
content: Some(data),
},
finish_reason: Some(seq.deref_mut().get_finish_reason().clone()),
2 changes: 1 addition & 1 deletion src/openai/pipelines/mod.rs
@@ -40,7 +40,7 @@ pub trait ModulePipeline<'s>: Send + Sync {

fn tokenizer(&self) -> &TokenOutputStream;

fn get_conversation(&mut self) -> &mut dyn Conversation;
fn get_conversation(&mut self, with_history: bool) -> &mut dyn Conversation;

fn get_model_config(&self) -> Box<dyn ConfigLike>;

2 changes: 1 addition & 1 deletion src/scheduler/cache_engine.rs
@@ -11,7 +11,7 @@ use crate::{
try_api,
};

#[derive(Clone)]
#[derive(Clone, Debug)]
pub struct CacheConfig {
pub block_size: usize,
pub num_gpu_blocks: Option<usize>, // Set after profiling init