Chat template: update for processor #35953

Open · wants to merge 6 commits into base: main
85 changes: 73 additions & 12 deletions src/transformers/image_utils.py
@@ -530,21 +530,33 @@ def get_uniform_frame_indices(total_num_frames: int, num_frames: Optional[int] =
return indices


def read_video_opencv(video_path: str, num_frames: Optional[int] = None):
def read_video_opencv(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with the OpenCV decoder.

Args:
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps=None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
Comment on lines +541 to +544

Member: Is it worth adding a check that just one of the args is provided?

@zucchini-nlp (Member Author, Jan 31, 2025): I think neither of them has a special priority, and users should choose only one sampling method to sample frames. Otherwise we assume it was user error, as we can't infer their actual intentions.

Right, that was a question hehe; agreed and added, yes. I'll look again where exactly it fits best :)

If not specified and `num_frames=None`, all frames are sampled.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
video = cv2.VideoCapture(video_path)
total_num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
video_fps = video.get(cv2.CAP_PROP_FPS)
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)
indices = get_uniform_frame_indices(total_num_frames, num_frames=num_frames)

index = 0
@@ -563,42 +575,66 @@ def read_video_opencv(video_path: str, num_frames: Optional[int] = None):
return np.stack(frames)
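Each backend below repeats the same fps-to-frame-count conversion added above. A minimal standalone sketch of that shared logic; the helper name `resolve_num_frames` is illustrative and not part of the PR:

from typing import Optional


def resolve_num_frames(
    total_num_frames: int,
    video_fps: float,
    num_frames: Optional[int] = None,
    fps: Optional[int] = None,
) -> Optional[int]:
    # Turn a requested sampling rate into an absolute frame count and check
    # that the video is long enough, mirroring the per-decoder logic above.
    if num_frames is None and fps is not None:
        num_frames = int(total_num_frames / video_fps * fps)
        if num_frames > total_num_frames:
            raise ValueError(
                f"Requested fps={fps} implies {num_frames} frames, but the video "
                f"only has {total_num_frames} (video_fps={video_fps})."
            )
    return num_frames


# A 10-second clip at 30 fps, sampled at 2 fps, yields 20 frames.
assert resolve_num_frames(300, 30.0, fps=2) == 20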


def read_video_decord(video_path: str, num_frames: Optional[int] = None):
def read_video_decord(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with Decord decoder.

Args:
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps=None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames=None`, all frames are sampled.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
vr = VideoReader(uri=video_path, ctx=cpu(0)) # decord has problems with gpu
indices = get_uniform_frame_indices(total_num_frames=len(vr), num_frames=num_frames)
video_fps = vr.get_avg_fps()
total_num_frames = len(vr)
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)
indices = get_uniform_frame_indices(total_num_frames=total_num_frames, num_frames=num_frames)
frames = vr.get_batch(indices).asnumpy()
return frames


def read_video_pyav(video_path: str, num_frames: Optional[int] = None):
def read_video_pyav(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with PyAV decoder.

Args:
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps=None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames=None`, all frames are sampled.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
container = av.open(video_path)

# sample uniformly "num_frames" frames from the video
total_num_frames = container.streams.video[0].frames
video_fps = container.streams.video[0].average_rate # should we better use `av_guess_frame_rate`?
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)
indices = get_uniform_frame_indices(total_num_frames, num_frames=num_frames)

frames = []
@@ -612,15 +648,19 @@ def read_video_pyav(video_path: str, num_frames: Optional[int] = None):
return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def read_video_torchvision(video_path: str, num_frames: Optional[int] = None):
def read_video_torchvision(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with torchvision decoder.

Args:
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps=None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames=None`, all frames are sampled.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
@@ -632,6 +672,15 @@ def read_video_torchvision(video_path: str, num_frames: Optional[int] = None):
pts_unit="sec",
output_format="TCHW",
)
video_fps = info["video_fps"]
total_num_frames = video.size(0) - 1
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)

if num_frames is not None:
idx = torch.linspace(0, video.size(0) - 1, num_frames, dtype=torch.int64)
@@ -648,7 +697,12 @@ def read_video_torchvision(video_path: str, num_frames: Optional[int] = None):
}


def load_video(video: Union[str, "VideoInput"], num_frames: Optional[int] = None, backend: str = "opencv") -> np.array:
def load_video(
video: Union[str, "VideoInput"],
num_frames: Optional[int] = None,
fps: Optional[int] = None,
backend: str = "opencv",
) -> np.array:
"""
Loads `video` to a numpy array.

@@ -657,12 +711,19 @@ def load_video(video: Union[str, "VideoInput"], num_frames: Optional[int] = None
The video to convert to the numpy array format. Can be a link to video or local path.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not passed, the whole video is loaded.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames=None`, all frames are sampled.
backend (`str`, *optional*, defaults to `"opencv"`):
The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv".

Returns:
`np.array`: A numpy array of shape (num_frames, channels, height, width).
"""

if fps is not None and num_frames is not None:
raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

Comment on lines +724 to +726

Member: ahh, I see you are doing a check here... probably better to delegate it to the function itself, but it's up to you

if video.startswith("https://www.youtube.com") or video.startswith("http://www.youtube.com"):
if not is_yt_dlp_available():
raise ImportError("To load a video from YouTube url you have to install `yt_dlp` first.")
@@ -703,7 +764,7 @@ def load_video(video: Union[str, "VideoInput"], num_frames: Optional[int] = None
)

video_decoder = VIDEO_DECODERS[backend]
video = video_decoder(file_obj)
video = video_decoder(file_obj, num_frames=num_frames, fps=fps)
return video
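For context, a usage sketch of the updated entry point; the file name is a placeholder, and the chosen backend must be installed:

import numpy as np
from transformers.image_utils import load_video

# Sample 8 frames uniformly across the whole clip.
frames = load_video("video.mp4", num_frames=8)
assert isinstance(frames, np.ndarray)

# Or sample at a fixed rate of 2 frames per second instead.
frames = load_video("video.mp4", fps=2, backend="decord")

# Passing both is now rejected up front.
try:
    load_video("video.mp4", num_frames=8, fps=2)
except ValueError as err:
    print(err)  # `num_frames` and `fps` are mutually exclusive arguments, please use only one!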


2 changes: 2 additions & 0 deletions src/transformers/models/aria/image_processing_aria.py
@@ -110,6 +110,8 @@ class AriaImageProcessor(BaseImageProcessor):
The resampling filter to use if resizing the image.
"""

model_input_names = ["pixel_values", "pixel_mask", "num_crops"]

def __init__(
self,
image_mean: List[float] = None,
6 changes: 6 additions & 0 deletions src/transformers/models/aria/modular_aria.py
@@ -499,6 +499,8 @@ class AriaImageProcessor(BaseImageProcessor):
The resampling filter to use if resizing the image.
"""

model_input_names = ["pixel_values", "pixel_mask", "num_crops"]

def __init__(
self,
image_mean: List[float] = None,
@@ -997,6 +999,10 @@ def decode(self, *args, **kwargs):
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names

# Remove `num_crops`: it is popped and used only during processing. Make a copy of the list when removing,
# otherwise `self.image_processor.model_input_names` is also modified.
image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
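The copy matters because an in-place `remove()` would mutate the image processor's own list through the shared reference; a minimal sketch of the pitfall the comment warns about:

# In-place removal mutates the shared list...
names = ["pixel_values", "pixel_mask", "num_crops"]
alias = names
alias.remove("num_crops")
print(names)  # ['pixel_values', 'pixel_mask'] -- the original was modified too

# ...while a list comprehension builds a fresh list and leaves the original intact.
names = ["pixel_values", "pixel_mask", "num_crops"]
filtered = [n for n in names if n != "num_crops"]
print(names)  # ['pixel_values', 'pixel_mask', 'num_crops'] -- unchanged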


4 changes: 4 additions & 0 deletions src/transformers/models/aria/processing_aria.py
@@ -158,6 +158,10 @@ def decode(self, *args, **kwargs):
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names

# Remove `num_crops`: it is popped and used only during processing. Make a copy of the list when removing,
# otherwise `self.image_processor.model_input_names` is also modified.
image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


2 changes: 1 addition & 1 deletion src/transformers/models/emu3/image_processing_emu3.py
@@ -132,7 +132,7 @@ class Emu3ImageProcessor(BaseImageProcessor):
The spatial downsample factor by which the image will be downsampled in the feature extraction phase.
"""

model_input_names = ["pixel_values"]
model_input_names = ["pixel_values", "image_sizes"]

def __init__(
self,
3 changes: 2 additions & 1 deletion src/transformers/models/emu3/processing_emu3.py
@@ -63,6 +63,7 @@ class Emu3Processor(ProcessorMixin):
"""

attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template"]
tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast")
image_processor_class = "Emu3ImageProcessor"

@@ -179,7 +180,7 @@ def __call__(
data = self.tokenizer(text, **output_kwargs["text_kwargs"])
data.update(**image_features)

return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].pop("return_tensors", None))
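Switching from indexing to `.pop("return_tensors", None)` means a missing key falls back to `None` instead of raising, and the key is consumed so it is not forwarded again downstream. A small sketch of the difference on a plain dict:

common_kwargs = {}

# common_kwargs["return_tensors"]  # would raise KeyError when the key is absent
tensor_type = common_kwargs.pop("return_tensors", None)
print(tensor_type)  # None, so BatchFeature skips tensor conversion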

def calculate_generate_size(self, ratio, image_area, spatial_factor):
width, height = map(int, ratio.split(":"))
@@ -184,7 +184,7 @@ class Idefics2ImageProcessor(BaseImageProcessor):
strategy was first introduced in https://arxiv.org/abs/2311.06607.
"""

model_input_names = ["pixel_values"]
model_input_names = ["pixel_values", "pixel_attention_mask"]

def __init__(
self,
@@ -289,7 +289,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width).
"""

model_input_names = ["pixel_values"]
model_input_names = ["pixel_values", "pixel_attention_mask"]

def __init__(
self,
3 changes: 2 additions & 1 deletion src/transformers/models/idefics3/processing_idefics3.py
@@ -129,6 +129,7 @@ class Idefics3Processor(ProcessorMixin):
"""

attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["image_seq_len", "chat_template"]
image_processor_class = "Idefics3ImageProcessor"
tokenizer_class = "AutoTokenizer"

@@ -354,7 +355,7 @@ def decode(self, *args, **kwargs):
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
return list(dict.fromkeys(image_processor_input_names + tokenizer_input_names))
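The reorder only changes which side leads after deduplication; a small sketch of how `dict.fromkeys` merges the two lists while preserving first-seen order:

tokenizer_input_names = ["input_ids", "attention_mask"]
image_processor_input_names = ["pixel_values", "pixel_attention_mask"]

# dict.fromkeys drops duplicates while keeping insertion order, so the image
# processor names now come first in the merged list.
merged = list(dict.fromkeys(image_processor_input_names + tokenizer_input_names))
print(merged)  # ['pixel_values', 'pixel_attention_mask', 'input_ids', 'attention_mask']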


__all__ = ["Idefics3Processor"]
@@ -163,7 +163,7 @@ class LlavaNextImageProcessor(BaseImageProcessor):
Whether to convert the image to RGB.
"""

model_input_names = ["pixel_values"]
model_input_names = ["pixel_values", "image_sizes"]

def __init__(
self,
12 changes: 9 additions & 3 deletions src/transformers/models/mllama/processing_mllama.py
@@ -202,14 +202,17 @@ class MllamaProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
The tokenizer is a required input.
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
in a chat into a tokenizable string.

"""

attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template"]
image_processor_class = "MllamaImageProcessor"
tokenizer_class = "PreTrainedTokenizerFast"

def __init__(self, image_processor, tokenizer):
def __init__(self, image_processor, tokenizer, chat_template=None):
if not hasattr(tokenizer, "image_token"):
self.image_token = "<|image|>"
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
@@ -220,8 +223,7 @@ def __init__(self, image_processor, tokenizer):
self.python_token = "<|python_tag|>"
self.python_token_id = tokenizer.convert_tokens_to_ids(self.python_token)
self.bos_token = tokenizer.bos_token
self.chat_template = tokenizer.chat_template
super().__init__(image_processor, tokenizer)
super().__init__(image_processor, tokenizer, chat_template=chat_template)
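With `chat_template` now routed through `ProcessorMixin.__init__` instead of being copied off the tokenizer, it can be set explicitly at construction time. A sketch under stated assumptions: the checkpoint name is a placeholder (and may require authentication), and the template string is a toy example:

from transformers import AutoTokenizer, MllamaImageProcessor, MllamaProcessor

ckpt = "meta-llama/Llama-3.2-11B-Vision"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(ckpt)
image_processor = MllamaImageProcessor.from_pretrained(ckpt)

processor = MllamaProcessor(
    image_processor,
    tokenizer,
    chat_template="{% for m in messages %}{{ m['content'] }}{% endfor %}",  # toy template
)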

def __call__(
self,
@@ -364,6 +366,10 @@ def post_process_image_text_to_text(self, generated_outputs):
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names

# Remove `num_tiles`: it is popped and used only during processing. Make a copy of the list when removing,
# otherwise `self.image_processor.model_input_names` is also modified.
image_processor_input_names = [name for name in image_processor_input_names if name != "num_tiles"]
return list(tokenizer_input_names + image_processor_input_names + ["cross_attention_mask"])

