EricLBuehler · EricLBuehler · Aug 21, 2024 · Aug 21, 2024 · Aug 21, 2024 · Aug 21, 2024
diff --git a/README.md b/README.md
@@ -30,6 +30,7 @@ Currently, candle-vllm supports chat serving for the following models.
 | #8 | ChatGLM |TBD|TBD|TBD |-|
 | #9 | **QWen2 (1.8B, 7B)** |✅|148 tks/s (1.8B)|784 tks/s (1.8B) |-|
 | #10 | **Google Gemma** |✅|130 tks/s (2B)|TBD |-|
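+| #11 | **Google Gemma 2** |✅|TBD|TBD |-|
-| #11 | Blip-large (Multimodal) |TBD|TBD|TBD |-|
-| #12 | Moondream-2 (Multimodal LLM) |TBD|TBD|TBD |-|
+| #12 | Blip-large (Multimodal) |TBD|TBD|TBD |-|
+| #13 | Moondream-2 (Multimodal LLM) |TBD|TBD|TBD |-|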
 

diff --git a/src/lib.rs b/src/lib.rs
@@ -134,6 +134,25 @@ pub enum ModelSelected {
         quant: Option<String>,
     },
 
+    /// Select the gemma 2 model (default 2b).
+    Gemma2 {
+        /// Control the application of repeat penalty for the last n tokens
+        #[arg(long)]
+        repeat_last_n: Option<usize>,
+
+        #[arg(long)]
+        temperature: Option<f32>,
+
+        #[arg(long)]
+        penalty: Option<f32>,
+
+        #[arg(long)]
+        max_gen_tokens: Option<usize>,
+
+        #[arg(long)]
+        quant: Option<String>,
+    },
+
     /// Select the mistral model (default 7b).
     Mistral {
         /// Control the application of repeat penalty for the last n tokens
@@ -217,6 +236,7 @@ impl Display for ModelSelected {
                 quant: _,
             } => write!(f, "qwen2"),
             ModelSelected::Gemma { .. } => write!(f, "gemma"),
+            ModelSelected::Gemma2 { .. } => write!(f, "gemma2"),
             ModelSelected::Mistral { .. } => write!(f, "mistral"),
             ModelSelected::Yi { .. } => write!(f, "yi"),
             ModelSelected::StableLM { .. } => write!(f, "stablelm"),
@@ -416,6 +436,31 @@ pub fn get_model_loader(
                 "google/gemma-2b-it".to_string()
             },
         ),
+        ModelSelected::Gemma2 {
+            repeat_last_n,
+            temperature,
+            penalty,
+            max_gen_tokens,
+            quant,
+        } => (
+            Box::new(DefaultLoader::new(
+                SpecificConfig::new(
+                    repeat_last_n,
+                    temperature,
+                    None,
+                    None,
+                    penalty,
+                    max_gen_tokens,
+                    quant,
+                ),
+                "gemma2".to_string(),
+            )),
+            if let Some(model_id) = model_id {
+                model_id
+            } else {
+                "google/gemma-2-2b-it".to_string()
+            },
+        ),
         ModelSelected::Mistral {
             repeat_last_n,
             temperature,

diff --git a/src/openai/models/gemma.rs b/src/openai/models/gemma.rs
@@ -72,6 +72,10 @@ impl GemmaConfig {
             use_qkv_bias: None,
             custom_stop_tokens: None,
             specific_config: scfg.clone(),
+            query_pre_attn_scalar: None,
+            head_dim: None,
+            attn_logit_softcapping: None,
+            final_logit_softcapping: None,
         }
     }
 }