forked from huggingface/transformers
Commit 3a8eb74
Fix support for image processors modifications in modular (huggingface#34866)

* add fix and examples
* fix camel case naming
1 parent 54be2d7

Showing 3 changed files with 297 additions and 1 deletion.
287 changes: 287 additions & 0 deletions
examples/modular-transformers/image_processing_new_imgproc_model.py
@@ -0,0 +1,287 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from examples/modular-transformers/modular_new_imgproc_model.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_new_imgproc_model.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from typing import Dict, List, Optional, Union

import numpy as np
import torch

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
from ...image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
)
from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging


if is_vision_available():
    import PIL


logger = logging.get_logger(__name__)


class ImgprocModelImageProcessor(BaseImageProcessor):
    r"""
    Constructs a NEW_IMGPROC_MODEL image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
            method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        size = size if size is not None else {"height": 384, "width": 384}
        size = get_size_dict(size, default_to_square=True)

        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_convert_rgb = do_convert_rgb

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        """
        size = get_size_dict(size)
        if "height" not in size or "width" not in size:
            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
        output_size = (size["height"], size["width"])
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    @filter_out_non_signature_kwargs()
    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> BatchFeature:
""" | ||
Preprocess an image or batch of images. | ||
Args: | ||
images (`ImageInput`): | ||
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If | ||
passing in images with pixel values between 0 and 1, set `do_rescale=False`. | ||
do_resize (`bool`, *optional*, defaults to `self.do_resize`): | ||
Whether to resize the image. | ||
size (`Dict[str, int]`, *optional*, defaults to `self.size`): | ||
Controls the size of the image after `resize`. The shortest edge of the image is resized to | ||
`size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image | ||
is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest | ||
edge equal to `int(size["shortest_edge"] * (1333 / 800))`. | ||
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values to the range [0, 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to normalize the image by if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        size = size if size is not None else self.size
        size = get_size_dict(size, default_to_square=False)

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )
        # PIL RGBA images are converted to RGB
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        if do_resize:
            images = [
                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
                for image in images
            ]

        if do_rescale:
            images = [
                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
                for image in images
            ]

        if do_normalize:
            images = [
                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
                for image in images
            ]

        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
        ]

        encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)

        return encoded_outputs

    def new_image_processing_method(self, pixel_values: torch.FloatTensor):
        return pixel_values / 2
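Since the generated class is functionally BlipImageProcessor plus one extra method (see the modular file below), its preprocessing pipeline can be sanity-checked against the real BLIP processor that ships with transformers. A minimal sketch, assuming a transformers install with torch and Pillow available:

import numpy as np
from transformers.models.blip.image_processing_blip import BlipImageProcessor

# Instantiate with defaults: resize to 384x384, rescale by 1/255, normalize with CLIP mean/std
processor = BlipImageProcessor()

# Dummy RGB image with pixel values in [0, 255], channels-last layout
image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)

batch = processor.preprocess(image, return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 384, 384])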
9 changes: 9 additions & 0 deletions
examples/modular-transformers/modular_new_imgproc_model.py

@@ -0,0 +1,9 @@
import torch
import torch.utils.checkpoint

from transformers.models.blip.image_processing_blip import BlipImageProcessor


class ImgprocModelImageProcessor(BlipImageProcessor):
    def new_image_processing_method(self, pixel_values: torch.FloatTensor):
        return pixel_values / 2
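These nine lines are the entire source of truth: the modular converter inlines everything inherited from BlipImageProcessor into the generated file shown above and appends new_image_processing_method. A minimal sketch of the added method's behavior, with the class reproduced so the snippet runs standalone; the regeneration command in the comment is an assumption about the converter CLI in the transformers repo root:

# Regeneration sketch (assumed CLI):
#   python utils/modular_model_converter.py --files_to_parse examples/modular-transformers/modular_new_imgproc_model.py
import torch
from transformers.models.blip.image_processing_blip import BlipImageProcessor


class ImgprocModelImageProcessor(BlipImageProcessor):
    # The single addition the modular file makes on top of BlipImageProcessor
    def new_image_processing_method(self, pixel_values: torch.FloatTensor):
        return pixel_values / 2


processor = ImgprocModelImageProcessor()  # inherits all BLIP defaults
halved = processor.new_image_processing_method(torch.ones(1, 3, 384, 384))
assert torch.allclose(halved, torch.full((1, 3, 384, 384), 0.5))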