From d703e4f127c23ea9066e5bf7807c73beb2674fa6 Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:28:35 +0800
Subject: [PATCH] Enable vllm multimodal minicpm-v-2-6 (#12074)

* enable minicpm-v-2-6

* add image_url readme
---
 python/llm/example/GPU/vLLM-Serving/README.md | 29 +++++++++++++++++++
 .../src/ipex_llm/vllm/xpu/model_convert.py    |  6 ++++
 2 files changed, 35 insertions(+)

diff --git a/python/llm/example/GPU/vLLM-Serving/README.md b/python/llm/example/GPU/vLLM-Serving/README.md
index 3d213de3e4a..e192bc1ecb6 100644
--- a/python/llm/example/GPU/vLLM-Serving/README.md
+++ b/python/llm/example/GPU/vLLM-Serving/README.md
@@ -128,6 +128,35 @@ curl http://localhost:8000/v1/completions \
   }' &
 ```
 
+##### Image input
+
+Image input is currently supported only for [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6).
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "MiniCPM-V-2_6",
+    "messages": [
+      {
+        "role": "user",
+        "content": [
+          {
+            "type": "text",
+            "text": "What is in the image?"
+          },
+          {
+            "type": "image_url",
+            "image_url": {
+              "url": "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"
+            }
+          }
+        ]
+      }
+    ],
+    "max_tokens": 128
+  }'
+```
+
 #### Tensor parallel
 
 > Note: We recommend to use docker for tensor parallel deployment.
diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
index d8ece35863b..355f3dc2578 100644
--- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
+++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
@@ -102,6 +102,12 @@ def _ipex_llm_load_model(self) -> None:
         modules = ["35.mlp", "36.mlp", "37.mlp", "38.mlp", "39.mlp"]
     else:
         modules = None
+    if "minicpm" in self.model_config.model.lower():
+        modules = ["vpm", "resampler"]
+    # merge_qkv below is only needed for MiniCPM-V models (e.g. MiniCPM-V-2_6)
+    if "minicpm-v" in self.model_config.model.lower():
+        from ipex_llm.transformers.models.minicpmv import merge_qkv
+        self.model.vpm.apply(merge_qkv)
     optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype,
                    modules_to_not_convert=modules)
     self.model = self.model.to(device=self.device_config.device,
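
The README addition above demonstrates the image request with curl; as a minimal sketch, the same `image_url` request can also be sent from Python. The sketch below assumes the `openai` Python client (v1+) is installed, that the vLLM server from this README is reachable at http://localhost:8000, and that the model is served under the name `MiniCPM-V-2_6`; none of this client code is part of the patch itself.

```python
# Sketch: send the same image_url chat request with the `openai` Python client.
# Assumes an OpenAI-compatible ipex-llm vLLM server at http://localhost:8000;
# the api_key can be any placeholder string for a local server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="MiniCPM-V-2_6",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in the image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg"
                    },
                },
            ],
        }
    ],
    max_tokens=128,
)
print(response.choices[0].message.content)
```

Because the server exposes the standard OpenAI-compatible `/v1/chat/completions` route, any OpenAI-compatible client should behave the same as the curl call in the README.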
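For context on the model_convert.py change: the patch keeps `vpm` (the vision tower) and `resampler` out of low-bit conversion via `modules_to_not_convert`, and applies `merge_qkv` to fuse the separate q/k/v projections of the vision attention blocks into a single linear layer. The sketch below is a generic illustration of that fusion technique, not the actual `ipex_llm.transformers.models.minicpmv.merge_qkv` implementation; attribute names such as `q_proj`/`k_proj`/`v_proj` are assumptions.

```python
import torch
import torch.nn as nn


def merge_qkv_sketch(module: nn.Module) -> None:
    # Illustrative only: fuse separate q/k/v nn.Linear projections into one
    # linear layer, the general idea behind merge_qkv-style rewrites. The
    # attribute names q_proj/k_proj/v_proj are assumptions for this sketch.
    if all(hasattr(module, name) for name in ("q_proj", "k_proj", "v_proj")):
        q, k, v = module.q_proj, module.k_proj, module.v_proj
        fused = nn.Linear(
            q.in_features,
            q.out_features + k.out_features + v.out_features,
            bias=q.bias is not None,
        )
        with torch.no_grad():
            # Stack the three weight matrices row-wise so one matmul now
            # produces the concatenated [q; k; v] output.
            fused.weight.copy_(torch.cat([q.weight, k.weight, v.weight], dim=0))
            if q.bias is not None:
                fused.bias.copy_(torch.cat([q.bias, k.bias, v.bias], dim=0))
        module.qkv_proj = fused
        del module.q_proj, module.k_proj, module.v_proj


# nn.Module.apply() visits every submodule recursively, which is why the patch
# can enable the rewrite for the whole vision tower with one call:
#     self.model.vpm.apply(merge_qkv)
```

Fusing q/k/v typically reduces kernel launches (one matmul instead of three) and presents a single larger weight to downstream optimization passes, while the excluded `vpm` and `resampler` modules stay in the original dtype.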