diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 31a327ecd8a7..b1ce42b95d94 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -18,6 +18,7 @@ from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image +from vllm.assets.video import VideoAsset _ONE_DAY_IN_SECONDS = 60 * 60 * 24 @@ -202,19 +203,27 @@ async def _predict(self, request, context, streaming=False): # Extract image paths and process images prompt = request.Prompt + image_paths = request.Images image_data = [self.load_image(img_path) for img_path in image_paths] + videos_path = request.Videos + video_data = [self.load_video(video_path) for video_path in videos_path] + # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template if not request.Prompt and request.UseTokenizerTemplate and request.Messages: prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) # Generate text using the LLM engine request_id = random_uuid() + print(f"Generating text with request_id: {request_id}", file=sys.stderr) outputs = self.llm.generate( { "prompt": prompt, - "multi_modal_data": {"image": image_data} if image_data else None, + "multi_modal_data": { + "image": image_data if image_data else None, + "video": video_data if video_data else None, + } if image_data or video_data else None, }, sampling_params=sampling_params, request_id=request_id, @@ -251,7 +260,7 @@ async def _predict(self, request, context, streaming=False): # Sending the final generated text yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) - def load_image(self, image_path: str) -> Image: + def load_image(self, image_path: str): """ Load an image from the given file path. @@ -265,6 +274,23 @@ def load_image(self, image_path: str) -> Image: return Image.open(image_path) except Exception as e: print(f"Error loading image {image_path}: {e}", file=sys.stderr) + return self.load_video(image_path) + + def load_video(self, video_path: str): + """ + Load a video from the given file path. + + Args: + video_path (str): The path to the image file. + + Returns: + Image: The loaded image. + """ + try: + video = VideoAsset(name=video_path).np_ndarrays + return video + except Exception as e: + print(f"Error loading video {image_path}: {e}", file=sys.stderr) return None async def serve(address):