From d703e4f127c23ea9066e5bf7807c73beb2674fa6 Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:28:35 +0800
Subject: [PATCH] Enable vllm multimodal minicpm-v-2-6 (#12074)

* enable minicpm-v-2-6

* add image_url readme
---
 python/llm/example/GPU/vLLM-Serving/README.md | 29 +++++++++++++++++++
 .../src/ipex_llm/vllm/xpu/model_convert.py    |  6 ++++
 2 files changed, 35 insertions(+)

diff --git a/python/llm/example/GPU/vLLM-Serving/README.md b/python/llm/example/GPU/vLLM-Serving/README.md
index 3d213de3e4a..e192bc1ecb6 100644
--- a/python/llm/example/GPU/vLLM-Serving/README.md
+++ b/python/llm/example/GPU/vLLM-Serving/README.md
@@ -128,6 +128,35 @@ curl http://localhost:8000/v1/completions \
   }' &
 ```
 
+##### Image input
+
+Image input is currently supported only for [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6).
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "MiniCPM-V-2_6",
+    "messages": [
+      {
+        "role": "user",
+        "content": [
+          {
+            "type": "text",
+            "text": "What is in the image?"
+          },
+          {
+            "type": "image_url",
+            "image_url": {
+              "url": "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"
+            }
+          }
+        ]
+      }
+    ],
+    "max_tokens": 128
+  }'
+```
+
 #### Tensor parallel
 
 > Note: We recommend to use docker for tensor parallel deployment.
diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
index d8ece35863b..355f3dc2578 100644
--- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
+++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
@@ -102,6 +102,12 @@ def _ipex_llm_load_model(self) -> None:
         modules = ["35.mlp", "36.mlp", "37.mlp", "38.mlp", "39.mlp"]
     else:
         modules = None
+    if "minicpm" in self.model_config.model.lower():
+        modules = ["vpm", "resampler"]
+    # merge_qkv below is only needed for MiniCPM-V models (e.g. MiniCPM-V-2_6)
+    if "minicpm-v" in self.model_config.model.lower():
+        from ipex_llm.transformers.models.minicpmv import merge_qkv
+        self.model.vpm.apply(merge_qkv)
     optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype,
                    modules_to_not_convert=modules)
     self.model = self.model.to(device=self.device_config.device,
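
The README addition above demonstrates the image request with curl; as a minimal sketch, the same `image_url` request can also be sent from Python. The sketch below assumes the `openai` Python client (v1+) is installed, that the vLLM server from this README is reachable at http://localhost:8000, and that the model is served under the name `MiniCPM-V-2_6`; none of this client code is part of the patch itself.

```python
# Sketch: send the same image_url chat request with the `openai` Python client.
# Assumes an OpenAI-compatible ipex-llm vLLM server at http://localhost:8000;
# the api_key can be any placeholder string for a local server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="MiniCPM-V-2_6",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in the image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"
                    },
                },
            ],
        }
    ],
    max_tokens=128,
)
print(response.choices[0].message.content)
```

Because the server exposes the standard OpenAI-compatible `/v1/chat/completions` route, any OpenAI-compatible client should behave the same as the curl call in the README.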
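For context on the model_convert.py change: the patch keeps `vpm` (the vision tower) and `resampler` out of low-bit conversion via `modules_to_not_convert`, and applies `merge_qkv` to fuse the separate q/k/v projections of the vision attention blocks into a single linear layer. The sketch below is a generic illustration of that fusion technique, not the actual `ipex_llm.transformers.models.minicpmv.merge_qkv` implementation; attribute names such as `q_proj`/`k_proj`/`v_proj` are assumptions.

```python
import torch
import torch.nn as nn


def merge_qkv_sketch(module: nn.Module) -> None:
    # Illustrative only: fuse separate q/k/v nn.Linear projections into one
    # linear layer, the general idea behind merge_qkv-style rewrites. The
    # attribute names q_proj/k_proj/v_proj are assumptions for this sketch.
    if all(hasattr(module, name) for name in ("q_proj", "k_proj", "v_proj")):
        q, k, v = module.q_proj, module.k_proj, module.v_proj
        fused = nn.Linear(
            q.in_features,
            q.out_features + k.out_features + v.out_features,
            bias=q.bias is not None,
        )
        with torch.no_grad():
            # Stack the three weight matrices row-wise so one matmul now
            # produces the concatenated [q; k; v] output.
            fused.weight.copy_(torch.cat([q.weight, k.weight, v.weight], dim=0))
            if q.bias is not None:
                fused.bias.copy_(torch.cat([q.bias, k.bias, v.bias], dim=0))
        module.qkv_proj = fused
        del module.q_proj, module.k_proj, module.v_proj


# nn.Module.apply() visits every submodule recursively, which is why the patch
# can enable the rewrite for the whole vision tower with one call:
#     self.model.vpm.apply(merge_qkv)
```

Fusing q/k/v typically reduces kernel launches (one matmul instead of three) and presents a single larger weight to downstream optimization passes, while the excluded `vpm` and `resampler` modules stay in the original dtype.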