Support stream chat completion & optimization for decoding stage #42

Merged · 19 commits · Jun 29, 2024
2 changes: 2 additions & 0 deletions .cargo/config.toml
@@ -0,0 +1,2 @@
[build]
rustflags = ["-C", "target-cpu=native"]
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
actix-web = "4.4.0"
actix-web = "4.8.0"
anyhow = "1.0.75"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.6.0" }
candle-examples = { git = "https://github.com/huggingface/candle.git", version = "0.6.0" }
@@ -30,7 +30,7 @@ candle-flash-attn = { git = "https://github.com/huggingface/candle.git", version
clap = { version = "4.4.7", features = ["derive"] }
#candle-sampling = { git = "https://github.com/EricLBuehler/candle-sampling.git", version = "0.2.0" }
futures = "0.3.29"
tokio = { version = "1.33.0", features = ["sync"] }
tokio = { version = "1.38.0", features = ["sync"] }
env_logger = "0.10.1"
tracing = "0.1.40"
range-checked = { git = "https://github.com/EricLBuehler/range-checked.git", version = "0.1.0" }
53 changes: 29 additions & 24 deletions README.md
@@ -19,12 +19,13 @@ Efficient, easy-to-use platform for inference and serving local LLMs including a
- 13b
- 70b

## Examples
See [this folder](examples/) for some examples.
## Demo Chat with candle-vllm (71 tokens/s, LLaMa2 7B, bf16, on A100)
<img src="./res/candle-vllm-demo.gif" width="90%" height="90%" >

### Example with Llama 7b
## Usage
See [this folder](examples/) for some examples.

#### Step 1: Run the Candle-VLLM service (assuming the llama2-7b model weights are downloaded)
### Step 1: Run the Candle-VLLM service (assuming the llama2-7b model weights are downloaded)

```
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
@@ -35,9 +36,27 @@ cd candle-vllm
cargo run --release -- --port 2000 --weight-path /home/llama2_7b/ llama7b --repeat-last-n 64
```

#### Step 2:
### Step 2:

#### Option 1: Chat with ChatUI (recommended)
Install ChatUI and its dependencies:

#### Option 1: Chat completion request with HTTP post
```
git clone git@github.com:guoqingbao/candle-vllm-demo.git
cd candle-vllm-demo
apt install npm #install npm if needed
npm install n -g #update node js if needed
n stable #update node js if needed
npm i -g pnpm #install pnpm manager
pnpm install #install ChatUI dependencies
```

Launching the ChatUI:
```
pnpm run dev # run the ChatUI
```

#### Option 2: Chat completion request with HTTP post

``` shell
curl -X POST "http://127.0.0.1:2000/v1/chat/completions" \
@@ -59,7 +78,7 @@ Sample response:
{"id":"cmpl-53092967-c9cf-40e0-ae26-d7ac786d59e8","choices":[{"message":{"content":" Learning any programming language requires a combination of theory, practice, and dedication. Here are some steps and resources to help you learn Rust effectively:\n\n1. Start with the basics:\n\t* Understand the syntax and basic structure of Rust programs.\n\t* Learn about variables, data types, loops, and control structures.\n\t* Familiarize yourself with Rust's ownership system and borrowing mechanism.\n2. Read the Rust book:\n\t* The Rust book is an official resource that provides a comprehensive introduction to the language.\n\t* It covers topics such","role":"[INST]"},"finish_reason":"length","index":0,"logprobs":null}],"created":1718784498,"model":"llama7b","object":"chat.completion","usage":{"completion_tokens":129,"prompt_tokens":29,"total_tokens":158}}
```
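
The full body of the `curl` request above is collapsed in this diff view. As a rough sketch, an equivalent request can be sent from Python; the endpoint, the `llama7b` model name, and the response layout come from the lines above, while the extra sampling field (`max_tokens`) is an assumption based on the OpenAI chat-completion schema.

```python
# Minimal sketch of a non-streaming chat completion request (assumes the OpenAI-compatible schema).
import requests

resp = requests.post(
    "http://127.0.0.1:2000/v1/chat/completions",
    json={
        "model": "llama7b",
        "messages": [{"role": "user", "content": "Explain how to best learn Rust."}],
        "max_tokens": 128,  # assumption: standard OpenAI sampling fields are accepted
    },
    timeout=120,
)
print(resp.json()["choices"][0]["message"]["content"])
```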

#### Option 2: Chat completion with the openai package
#### Option 3: Chat completion with the openai package

In your terminal, install the `openai` Python package by running `pip install openai`. I use version `1.3.5`.

@@ -85,21 +104,8 @@ print(completion.choices[0].message.content)
```
After the `candle-vllm` service is running, run the Python script and enjoy efficient inference with an OpenAI compatible API server!
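
The body of that script is collapsed in this diff (only the final `print` line is visible). A minimal sketch using the `openai` 1.x client might look like the following; the base URL, the placeholder API key, and the message content are assumptions, not the script's exact contents.

```python
# Sketch of the collapsed example script, assuming an OpenAI-compatible /v1 base URL.
import openai

client = openai.OpenAI(
    base_url="http://127.0.0.1:2000/v1",  # assumption: the service exposes the /v1 prefix seen in the curl example
    api_key="EMPTY",                      # assumption: any placeholder key is accepted locally
)

completion = client.chat.completions.create(
    model="llama7b",
    messages=[{"role": "user", "content": "Explain how to best learn Rust."}],
)
print(completion.choices[0].message.content)
```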

#### Option 3: Chat with ChatUI (recommended)
Install the required packages from [ChatUI for Candle-VLLM](https://github.com/guoqingbao/ChattierGPT-UI):

```
git clone git@github.com:guoqingbao/ChattierGPT-UI.git
pip install -r requirements.txt
```

Launching the ChatUI:
```
python -m streamlit run src/main.py
```

## Demo Chat with candle-vllm
<img src="./res/candle-vllm-demo.gif" width="95%" height="95%" >

## Usage Help
For general configuration help, run `cargo run -- --help`.
@@ -112,14 +118,13 @@ For kvcache configuration, set `kvcache_mem_cpu` and `kvcache_mem_gpu`, default

For chat history settings, set `record_conversation` to `true` to let candle-vllm remember chat history. By `default`, candle-vllm `does not` record chat history; instead, the client sends both the messages and the contextual history to candle-vllm. If record_conversation is set to `true`, the client sends only new chat messages to candle-vllm, and candle-vllm is responsible for recording the previous chat messages. However, this approach requires per-session chat recording, which is not yet implemented, so the default approach `record_conversation=false` is recommended.
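
For illustration, with the default `record_conversation=false` the client keeps the history itself and resends it on every turn; a minimal sketch (the message contents here are made up):

```python
# Client-side history under the default record_conversation=false:
# each request carries all previous turns plus the new user message.
history = [
    {"role": "user", "content": "What is ownership in Rust?"},
    {"role": "assistant", "content": "Ownership is Rust's compile-time memory-management model..."},
]
history.append({"role": "user", "content": "And how does borrowing relate to it?"})  # new turn

payload = {"model": "llama7b", "messages": history}  # the whole history goes in `messages`
```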

For chat streaming, the `stream` flag in the chat request needs to be set to `True`.
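
As a sketch of the streaming path added by this PR, the `openai` client can pass `stream=True` and iterate over the returned chunks; that every chunk carries a `delta.content` field is an assumption based on the OpenAI streaming format.

```python
# Streaming sketch: requires the candle-vllm service from Step 1 and `pip install openai`.
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:2000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="llama7b",
    messages=[{"role": "user", "content": "Explain how to best learn Rust."}],
    stream=True,  # ask the server to stream chunks instead of one final response
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="", flush=True)
print()
```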


## Installation
## Report issue
Installing `candle-vllm` is as simple as the following steps. If you have any problems, please create an
[issue](https://github.com/EricLBuehler/candle-lora/issues).

0) Be sure to install Rust here: https://www.rust-lang.org/tools/install
1) Run `sudo apt install libssl-dev` or equivalent install command
2) Run `sudo apt install pkg-config` or equivalent install command

## Contributing
The following features are planned to be implemented, but contributions are especially welcome:
Binary file modified res/candle-vllm-demo.gif
4 changes: 2 additions & 2 deletions src/main.rs
@@ -1,5 +1,3 @@
use std::sync::{Arc, Mutex};

use actix_web::middleware::Logger;
use actix_web::web::Data;
use actix_web::{App, HttpServer};
@@ -14,6 +12,8 @@ use candle_vllm::scheduler::cache_engine::CacheConfig;
use candle_vllm::scheduler::SchedulerConfig;
use candle_vllm::{get_model_loader, hub_load_local_safetensors, ModelSelected};
use clap::Parser;
use futures::lock::Mutex;
use std::sync::Arc;
const SIZE_IN_MB: usize = 1024 * 1024;

#[derive(Parser, Debug)]
28 changes: 13 additions & 15 deletions src/openai/conversation/default_conversation.rs
@@ -220,25 +220,23 @@ impl Conversation for DefaultConversation {
}

SeparatorStyle::Llama2 => {
let seps = [&self.sep, &self.sep2.clone().unwrap_or("".to_string())];
let mut accum = if !system_prompt.is_empty() {
system_prompt.clone()
} else {
"[INST] ".to_string()
};
let mut accum = "".to_string();
for (i, message) in self.messages.iter().enumerate() {
let Message((_role, message)) = message;

let tag = &[self.roles.0.clone(), self.roles.1.clone()][i % 2];

if let Some(message) = message {
if i == 0 {
accum += &format!("{message} ");
if _role.clone() == self.roles.0 {
//user message
if let Some(message) = message {
accum += &format!("[INST] {message} [/INST]");
} else {
accum += &format!("{tag} {message}{}", seps[i % 2]);
accum += &format!("[INST] [/INST]");
}
} else {
accum += tag;
} else if _role.clone() == self.roles.1 {
//assistant message
if let Some(message) = message {
accum += &format!("{message} \n");
}
} else if i == 0 && !system_prompt.is_empty() {
accum += &system_prompt;
}
}
accum
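To make the new Llama2 arm concrete: for a hypothetical two-turn exchange (assuming `roles.0` is the user role, `roles.1` is the assistant role, and the system prompt is empty), the accumulated prompt would look roughly like this; the message texts are invented for illustration.

```
[INST] What is ownership in Rust? [/INST]Ownership is Rust's memory-management model. 
[INST] And how does borrowing relate? [/INST]
```
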
6 changes: 3 additions & 3 deletions src/openai/mod.rs
@@ -1,14 +1,14 @@
use std::sync::{Arc, Mutex};

use candle_core::Device;
use futures::lock::Mutex;
use std::sync::Arc;
use tokenizers::{EncodeInput, Encoding, Tokenizer};

use self::{pipelines::llm_engine::LLMEngine, responses::APIError};

pub mod requests;
pub mod responses;
pub mod sampling_params;
mod streaming;
pub mod streaming;

pub trait TokenizerWrapper<'s, E>
where
33 changes: 21 additions & 12 deletions src/openai/models/llama.rs
@@ -228,18 +228,27 @@ impl CausalSelfAttention {
let k = self.k_proj.forward(x)?;
let v = self.v_proj.forward(x)?;

let q = q
.reshape((b_sz, seq_len, self.num_attention_heads, self.head_dim))?
.transpose(1, 2)?
.contiguous()?;
let k = k
.reshape((b_sz, seq_len, self.num_key_value_heads, self.head_dim))?
.transpose(1, 2)?
.contiguous()?;
let v = v
.reshape((b_sz, seq_len, self.num_key_value_heads, self.head_dim))?
.transpose(1, 2)?
.contiguous()?;
let (q, k, v) = if seq_len == 1 {
// No transpose needed when seq_len == 1 (decoding stage): reshape directly into the transposed layout
let q = q.reshape((b_sz, self.num_attention_heads, seq_len, self.head_dim))?;
let k = k.reshape((b_sz, self.num_key_value_heads, seq_len, self.head_dim))?;
let v = v.reshape((b_sz, self.num_key_value_heads, seq_len, self.head_dim))?;
(q, k, v)
} else {
let q = q
.reshape((b_sz, seq_len, self.num_attention_heads, self.head_dim))?
.transpose(1, 2)?
.contiguous()?;
let k = k
.reshape((b_sz, seq_len, self.num_key_value_heads, self.head_dim))?
.transpose(1, 2)?
.contiguous()?;
let v = v
.reshape((b_sz, seq_len, self.num_key_value_heads, self.head_dim))?
.transpose(1, 2)?
.contiguous()?;
(q, k, v)
};

let q = self.apply_rotary_emb(&q, index_pos)?;
let k = self.apply_rotary_emb(&k, index_pos)?;
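
The reasoning behind this decoding-stage change: when `seq_len == 1`, reshaping straight to `(b, heads, 1, head_dim)` produces exactly the same element order as reshape-then-transpose, so the transpose and `contiguous` copies can be skipped. A small sketch illustrating the equivalence, with numpy standing in for candle tensors:

```python
# Demonstrates that for seq_len == 1 the direct reshape matches reshape + transpose.
import numpy as np

b, heads, head_dim = 2, 4, 8
x = np.random.rand(b, 1, heads * head_dim)   # projection output for a single decode token

old_path = x.reshape(b, 1, heads, head_dim).transpose(0, 2, 1, 3)  # reshape, then swap seq/head axes
new_path = x.reshape(b, heads, 1, head_dim)                        # direct reshape (this PR)

assert np.array_equal(old_path, new_path)  # identical contents, so no transpose or copy is required
```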