Enable vllm multimodal minicpm-v-2-6 (#12074)
* enable minicpm-v-2-6

* add image_url readme
hzjane authored Sep 13, 2024
1 parent a767438 commit d703e4f
Showing 2 changed files with 35 additions and 0 deletions.
29 changes: 29 additions & 0 deletions python/llm/example/GPU/vLLM-Serving/README.md
@@ -128,6 +128,35 @@ curl http://localhost:8000/v1/completions \
}' &
```

##### Image input

Image input is currently supported only for [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6).
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "MiniCPM-V-2_6",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
          "text": "What is in the image?"
},
{
"type": "image_url",
"image_url": {
"url": "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"
}
}
]
}
],
"max_tokens": 128
}'
```
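The same request can be issued from Python. The sketch below builds the identical multimodal chat payload as a plain dictionary, ready to pass to any HTTP client; `build_image_chat_request` is an illustrative helper name, not part of the project's API.

```python
import json

# Hypothetical helper (illustrative name): builds the same multimodal
# chat payload as the curl example above.
def build_image_chat_request(model, text, image_url, max_tokens=128):
    return {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
        "max_tokens": max_tokens,
    }

payload = build_image_chat_request(
    "MiniCPM-V-2_6",
    "What is in the image?",
    "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg",
)
print(json.dumps(payload, indent=2))
```

Posting this payload to `http://localhost:8000/v1/chat/completions` with a `Content-Type: application/json` header reproduces the curl call above.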

#### Tensor parallel

> Note: We recommend using Docker for tensor parallel deployment.
6 changes: 6 additions & 0 deletions python/llm/src/ipex_llm/vllm/xpu/model_convert.py
@@ -102,6 +102,12 @@ def _ipex_llm_load_model(self) -> None:
modules = ["35.mlp", "36.mlp", "37.mlp", "38.mlp", "39.mlp"]
else:
modules = None
if "minicpm" in self.model_config.model.lower():
modules = ["vpm", "resampler"]
    # merge_qkv is only needed for MiniCPM-V-2_6
if "minicpm-v" in self.model_config.model.lower():
from ipex_llm.transformers.models.minicpmv import merge_qkv
self.model.vpm.apply(merge_qkv)
optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype,
modules_to_not_convert=modules)
self.model = self.model.to(device=self.device_config.device,
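The branch added by this diff keys off a substring of the model path: any MiniCPM model keeps its vision tower (`vpm`) and `resampler` out of low-bit conversion. A minimal standalone sketch of that selection logic (function name is illustrative, not the project's API):

```python
# Sketch of the model-name check added in the diff: decide which modules
# to exclude from low-bit conversion based on the model path.
def modules_to_not_convert(model_path):
    name = model_path.lower()
    if "minicpm" in name:
        # Keep the vision encoder and resampler in full precision.
        return ["vpm", "resampler"]
    return None

print(modules_to_not_convert("openbmb/MiniCPM-V-2_6"))
print(modules_to_not_convert("meta-llama/Llama-2-7b"))
```

The returned list is what the real code passes to `optimize_model(..., modules_to_not_convert=modules)`; for MiniCPM-V models specifically, the diff additionally applies `merge_qkv` to every submodule of the vision tower via `self.model.vpm.apply(merge_qkv)`.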
