Chat template: update for processor #35953

Open · wants to merge 6 commits into base: main
85 changes: 73 additions & 12 deletions src/transformers/image_utils.py
@@ -530,21 +530,33 @@ def get_uniform_frame_indices(total_num_frames: int, num_frames: Optional[int] =
return indices


def read_video_opencv(video_path: str, num_frames: Optional[int] = None):
def read_video_opencv(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with the OpenCV decoder.

Args:
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps=None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
Comment on lines +541 to +544

Member: Is it worth adding a check that just one of the args is provided?

@zucchini-nlp (Member Author, Jan 31, 2025): I think neither of them has a special priority, and users should choose only one sampling method to sample frames. Otherwise we assume it was user error, as we can't infer their actual intentions.

Right, that was a question hehe; agreed and added, yes. I'll look again where exactly it fits best :)

If not specified and `num_frames=None`, all frames are sampled.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
video = cv2.VideoCapture(video_path)
total_num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
video_fps = video.get(cv2.CAP_PROP_FPS)
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)
indices = get_uniform_frame_indices(total_num_frames, num_frames=num_frames)

index = 0
@@ -563,42 +575,66 @@ def read_video_opencv(video_path: str, num_frames: Optional[int] = None):
return np.stack(frames)
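Each backend below repeats the same fps-to-frame-count conversion added above. A minimal standalone sketch of that shared logic; the helper name `resolve_num_frames` is illustrative and not part of the PR:

from typing import Optional


def resolve_num_frames(
    total_num_frames: int,
    video_fps: float,
    num_frames: Optional[int] = None,
    fps: Optional[int] = None,
) -> Optional[int]:
    # Turn a requested sampling rate into an absolute frame count and check
    # that the video is long enough, mirroring the per-decoder logic above.
    if num_frames is None and fps is not None:
        num_frames = int(total_num_frames / video_fps * fps)
        if num_frames > total_num_frames:
            raise ValueError(
                f"Requested fps={fps} implies {num_frames} frames, but the video "
                f"only has {total_num_frames} (video_fps={video_fps})."
            )
    return num_frames


# A 10-second clip at 30 fps, sampled at 2 fps, yields 20 frames.
assert resolve_num_frames(300, 30.0, fps=2) == 20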


def read_video_decord(video_path: str, num_frames: Optional[int] = None):
def read_video_decord(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with Decord decoder.

Args:
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps=None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames=None`, all frames are sampled.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
vr = VideoReader(uri=video_path, ctx=cpu(0)) # decord has problems with gpu
indices = get_uniform_frame_indices(total_num_frames=len(vr), num_frames=num_frames)
video_fps = vr.get_avg_fps()
total_num_frames = len(vr)
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)
indices = get_uniform_frame_indices(total_num_frames=total_num_frames, num_frames=num_frames)
frames = vr.get_batch(indices).asnumpy()
return frames


def read_video_pyav(video_path: str, num_frames: Optional[int] = None):
def read_video_pyav(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with PyAV decoder.

Args:
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps=None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames=None`, all frames are sampled.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
container = av.open(video_path)

# sample uniformly "num_frames" frames from the video
total_num_frames = container.streams.video[0].frames
video_fps = container.streams.video[0].average_rate # should we better use `av_guess_frame_rate`?
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)
indices = get_uniform_frame_indices(total_num_frames, num_frames=num_frames)

frames = []
@@ -612,15 +648,19 @@ def read_video_pyav(video_path: str, num_frames: Optional[int] = None):
return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def read_video_torchvision(video_path: str, num_frames: Optional[int] = None):
def read_video_torchvision(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with torchvision decoder.

Args:
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps=None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames=None`, all frames are sampled.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
@@ -632,6 +672,15 @@ def read_video_torchvision(video_path: str, num_frames: Optional[int] = None):
pts_unit="sec",
output_format="TCHW",
)
video_fps = info["video_fps"]
total_num_frames = video.size(0) - 1
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)

if num_frames is not None:
idx = torch.linspace(0, video.size(0) - 1, num_frames, dtype=torch.int64)
@@ -648,7 +697,12 @@ def read_video_torchvision(video_path: str, num_frames: Optional[int] = None):
}


def load_video(video: Union[str, "VideoInput"], num_frames: Optional[int] = None, backend: str = "opencv") -> np.array:
def load_video(
video: Union[str, "VideoInput"],
num_frames: Optional[int] = None,
fps: Optional[int] = None,
backend: str = "opencv",
) -> np.array:
"""
Loads `video` to a numpy array.

@@ -657,12 +711,19 @@ def load_video(video: Union[str, "VideoInput"], num_frames: Optional[int] = None
The video to convert to the numpy array format. Can be a link to video or local path.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not passed, the whole video is loaded.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames=None`, all frames are sampled.
backend (`str`, *optional*, defaults to `"opencv"`):
The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv".

Returns:
`np.array`: A numpy array of shape (num_frames, channels, height, width).
"""

if fps is not None and num_frames is not None:
raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

Comment on lines +724 to +726

Member: ahh, I see you are doing a check here... probably better to delegate it to the function itself, but it's up to you

if video.startswith("https://www.youtube.com") or video.startswith("http://www.youtube.com"):
if not is_yt_dlp_available():
raise ImportError("To load a video from YouTube url you have to install `yt_dlp` first.")
@@ -703,7 +764,7 @@ def load_video(video: Union[str, "VideoInput"], num_frames: Optional[int] = None
)

video_decoder = VIDEO_DECODERS[backend]
video = video_decoder(file_obj)
video = video_decoder(file_obj, num_frames=num_frames, fps=fps)
return video
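For context, a usage sketch of the updated entry point; the file name is a placeholder, and the chosen backend must be installed:

import numpy as np
from transformers.image_utils import load_video

# Sample 8 frames uniformly across the whole clip.
frames = load_video("video.mp4", num_frames=8)
assert isinstance(frames, np.ndarray)

# Or sample at a fixed rate of 2 frames per second instead.
frames = load_video("video.mp4", fps=2, backend="decord")

# Passing both is now rejected up front.
try:
    load_video("video.mp4", num_frames=8, fps=2)
except ValueError as err:
    print(err)  # `num_frames` and `fps` are mutually exclusive arguments, please use only one!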


2 changes: 2 additions & 0 deletions src/transformers/models/aria/image_processing_aria.py
@@ -110,6 +110,8 @@ class AriaImageProcessor(BaseImageProcessor):
The resampling filter to use if resizing the image.
"""

model_input_names = ["pixel_values", "pixel_mask", "num_crops"]

def __init__(
self,
image_mean: List[float] = None,
6 changes: 6 additions & 0 deletions src/transformers/models/aria/modular_aria.py
@@ -499,6 +499,8 @@ class AriaImageProcessor(BaseImageProcessor):
The resampling filter to use if resizing the image.
"""

model_input_names = ["pixel_values", "pixel_mask", "num_crops"]

def __init__(
self,
image_mean: List[float] = None,
@@ -997,6 +999,10 @@ def decode(self, *args, **kwargs):
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names

# Remove `num_crops`: it is popped and used only during processing. Make a copy of the list when removing,
# otherwise `self.image_processor.model_input_names` is also modified.
image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
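The copy matters because an in-place `remove()` would mutate the image processor's own list through the shared reference; a minimal sketch of the pitfall the comment warns about:

# In-place removal mutates the shared list...
names = ["pixel_values", "pixel_mask", "num_crops"]
alias = names
alias.remove("num_crops")
print(names)  # ['pixel_values', 'pixel_mask'] -- the original was modified too

# ...while a list comprehension builds a fresh list and leaves the original intact.
names = ["pixel_values", "pixel_mask", "num_crops"]
filtered = [n for n in names if n != "num_crops"]
print(names)  # ['pixel_values', 'pixel_mask', 'num_crops'] -- unchanged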


4 changes: 4 additions & 0 deletions src/transformers/models/aria/processing_aria.py
@@ -158,6 +158,10 @@ def decode(self, *args, **kwargs):
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names

# Remove `num_crops`: it is popped and used only during processing. Make a copy of the list when removing,
# otherwise `self.image_processor.model_input_names` is also modified.
image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


2 changes: 1 addition & 1 deletion src/transformers/models/emu3/image_processing_emu3.py
@@ -132,7 +132,7 @@ class Emu3ImageProcessor(BaseImageProcessor):
The spatial downsample factor by which the image will be downsampled in the feature extraction phase.
"""

model_input_names = ["pixel_values"]
model_input_names = ["pixel_values", "image_sizes"]

def __init__(
self,
3 changes: 2 additions & 1 deletion src/transformers/models/emu3/processing_emu3.py
@@ -63,6 +63,7 @@ class Emu3Processor(ProcessorMixin):
"""

attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template"]
tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast")
image_processor_class = "Emu3ImageProcessor"

@@ -179,7 +180,7 @@ def __call__(
data = self.tokenizer(text, **output_kwargs["text_kwargs"])
data.update(**image_features)

return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].pop("return_tensors", None))
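Switching from indexing to `.pop("return_tensors", None)` means a missing key falls back to `None` instead of raising, and the key is consumed so it is not forwarded again downstream. A small sketch of the difference on a plain dict:

common_kwargs = {}

# common_kwargs["return_tensors"]  # would raise KeyError when the key is absent
tensor_type = common_kwargs.pop("return_tensors", None)
print(tensor_type)  # None, so BatchFeature skips tensor conversion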

def calculate_generate_size(self, ratio, image_area, spatial_factor):
width, height = map(int, ratio.split(":"))
@@ -184,7 +184,7 @@ class Idefics2ImageProcessor(BaseImageProcessor):
strategy was first introduced in https://arxiv.org/abs/2311.06607.
"""

model_input_names = ["pixel_values"]
model_input_names = ["pixel_values", "pixel_attention_mask"]

def __init__(
self,
@@ -289,7 +289,7 @@ class Idefics3ImageProcessor(BaseImageProcessor):
sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width).
"""

model_input_names = ["pixel_values"]
model_input_names = ["pixel_values", "pixel_attention_mask"]

def __init__(
self,
3 changes: 2 additions & 1 deletion src/transformers/models/idefics3/processing_idefics3.py
@@ -129,6 +129,7 @@ class Idefics3Processor(ProcessorMixin):
"""

attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["image_seq_len", "chat_template"]
image_processor_class = "Idefics3ImageProcessor"
tokenizer_class = "AutoTokenizer"

@@ -354,7 +355,7 @@ def decode(self, *args, **kwargs):
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
return list(dict.fromkeys(image_processor_input_names + tokenizer_input_names))
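The reorder only changes which side leads after deduplication; a small sketch of how `dict.fromkeys` merges the two lists while preserving first-seen order:

tokenizer_input_names = ["input_ids", "attention_mask"]
image_processor_input_names = ["pixel_values", "pixel_attention_mask"]

# dict.fromkeys drops duplicates while keeping insertion order, so the image
# processor names now come first in the merged list.
merged = list(dict.fromkeys(image_processor_input_names + tokenizer_input_names))
print(merged)  # ['pixel_values', 'pixel_attention_mask', 'input_ids', 'attention_mask']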


__all__ = ["Idefics3Processor"]
@@ -163,7 +163,7 @@ class LlavaNextImageProcessor(BaseImageProcessor):
Whether to convert the image to RGB.
"""

model_input_names = ["pixel_values"]
model_input_names = ["pixel_values", "image_sizes"]

def __init__(
self,
12 changes: 9 additions & 3 deletions src/transformers/models/mllama/processing_mllama.py
@@ -202,14 +202,17 @@ class MllamaProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
The tokenizer is a required input.
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
in a chat into a tokenizable string.

"""

attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template"]
image_processor_class = "MllamaImageProcessor"
tokenizer_class = "PreTrainedTokenizerFast"

def __init__(self, image_processor, tokenizer):
def __init__(self, image_processor, tokenizer, chat_template=None):
if not hasattr(tokenizer, "image_token"):
self.image_token = "<|image|>"
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
@@ -220,8 +223,7 @@ def __init__(self, image_processor, tokenizer):
self.python_token = "<|python_tag|>"
self.python_token_id = tokenizer.convert_tokens_to_ids(self.python_token)
self.bos_token = tokenizer.bos_token
self.chat_template = tokenizer.chat_template
super().__init__(image_processor, tokenizer)
super().__init__(image_processor, tokenizer, chat_template=chat_template)
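With `chat_template` now routed through `ProcessorMixin.__init__` instead of being copied off the tokenizer, it can be set explicitly at construction time. A sketch under stated assumptions: the checkpoint name is a placeholder (and may require authentication), and the template string is a toy example:

from transformers import AutoTokenizer, MllamaImageProcessor, MllamaProcessor

ckpt = "meta-llama/Llama-3.2-11B-Vision"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(ckpt)
image_processor = MllamaImageProcessor.from_pretrained(ckpt)

processor = MllamaProcessor(
    image_processor,
    tokenizer,
    chat_template="{% for m in messages %}{{ m['content'] }}{% endfor %}",  # toy template
)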

def __call__(
self,
@@ -364,6 +366,10 @@ def post_process_image_text_to_text(self, generated_outputs):
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names

# Remove `num_tiles`: it is popped and used only during processing. Make a copy of the list when removing,
# otherwise `self.image_processor.model_input_names` is also modified.
image_processor_input_names = [name for name in image_processor_input_names if name != "num_tiles"]
return list(tokenizer_input_names + image_processor_input_names + ["cross_attention_mask"])

