From bfd616694598fb85ec6f87d46a05165ab5a7371b Mon Sep 17 00:00:00 2001
From: Alex-Brooks
Date: Wed, 22 Jan 2025 23:34:38 +0000
Subject: [PATCH] Add granite vision docs

Replace multimodal granite refs with granite vision
Add granite vision / llava next alias

Signed-off-by: Alex-Brooks
---
 docs/source/en/_toctree.yml                    |  2 +
 docs/source/en/model_doc/granitevision.md      | 85 +++++++++++++++++++
 .../models/auto/configuration_auto.py          |  3 +
 .../models/vipllava/test_modeling_vipllava.py  |  2 +-
 4 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/en/model_doc/granitevision.md

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 40780d24d51c..b7e5402374a7 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -452,6 +452,8 @@
       title: Granite
     - local: model_doc/granitemoe
       title: GraniteMoe
+    - local: model_doc/granitevision
+      title: GraniteVision
     - local: model_doc/helium
       title: Helium
     - local: model_doc/herbert
diff --git a/docs/source/en/model_doc/granitevision.md b/docs/source/en/model_doc/granitevision.md
new file mode 100644
index 000000000000..069999f6d894
--- /dev/null
+++ b/docs/source/en/model_doc/granitevision.md
@@ -0,0 +1,85 @@

# Granite Vision

## Overview

The Granite Vision model is a variant of [LLaVA-NeXT](llava_next), leveraging a [Granite](granite) language model alongside a [SigLIP](siglip) visual encoder. It uses multiple concatenated vision hidden states as its image features, similar to [VipLlava](vipllava). It also uses a larger set of image grid pinpoints than the original LLaVA-NeXT models to support additional aspect ratios.

Tips:
- This model is loaded into Transformers as an instance of LLaVA-NeXT. The usage and tips from [LLaVA-NeXT](llava_next) apply to this model as well.

- You can apply the chat template with the tokenizer / processor in the same way. Example chat format:
```bash
"<|user|>\nWhat’s shown in this image?\n<|assistant|>\nThis image shows a red stop sign.<|end_of_text|><|user|>\nDescribe the image in more detail.\n<|assistant|>\n"
```

Sample inference:
```python
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from PIL import Image
import requests

# Note: These docs were written prior to the public model release,
# and this path is subject to change.
# Please see https://huggingface.co/ibm-granite for the current model list.
model_path = "ibm-granite/granite-3.1-2b-instruct-vision"
processor = LlavaNextProcessor.from_pretrained(model_path)

model = LlavaNextForConditionalGeneration.from_pretrained(model_path).to("cuda")

# prepare the image and text prompt, using the appropriate prompt template
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
image = Image.open(requests.get(url, stream=True).raw)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")

# autoregressively complete the prompt
output = model.generate(**inputs, max_new_tokens=100)

print(processor.decode(output[0], skip_special_tokens=True))
```

This model was contributed by [Alexander Brooks](https://huggingface.co/abrooks9944).
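
Because Granite Vision is loaded through the regular LLaVA-NeXT classes, the settings called out in the overview live on `LlavaNextConfig`. Below is a minimal sketch for inspecting them; the layer indices mentioned in the comments are illustrative placeholders, and the model path is the same pre-release path used above and may change.

```python
from transformers import LlavaNextConfig

# Pre-release path reused from the example above; subject to change.
model_path = "ibm-granite/granite-3.1-2b-instruct-vision"
config = LlavaNextConfig.from_pretrained(model_path)

# For Granite Vision this is expected to be a list of layer indices whose hidden
# states are concatenated into the image features (most other LLaVA-NeXT
# checkpoints use a single int here), e.g. something like [-24, -20, -12, -1].
print(config.vision_feature_layer)

# The grid pinpoints define which aspect ratios / resolutions the image processor
# can tile into patches; Granite Vision ships a larger list than the original
# LLaVA-NeXT models.
print(config.image_grid_pinpoints)
```
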
## LlavaNextConfig

[[autodoc]] LlavaNextConfig

## LlavaNextImageProcessor

[[autodoc]] LlavaNextImageProcessor
    - preprocess

## LlavaNextProcessor

[[autodoc]] LlavaNextProcessor

## LlavaNextForConditionalGeneration

[[autodoc]] LlavaNextForConditionalGeneration
    - forward
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 659fc4415121..fff95788367e 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -134,6 +134,7 @@
        ("gptsan-japanese", "GPTSanJapaneseConfig"),
        ("granite", "GraniteConfig"),
        ("granitemoe", "GraniteMoeConfig"),
+       ("granitevision", "LlavaNextConfig"),
        ("graphormer", "GraphormerConfig"),
        ("grounding-dino", "GroundingDinoConfig"),
        ("groupvit", "GroupViTConfig"),
@@ -456,6 +457,7 @@
        ("gptsan-japanese", "GPTSAN-japanese"),
        ("granite", "Granite"),
        ("granitemoe", "GraniteMoeMoe"),
+       ("granitevision", "LLaVA-NeXT"),
        ("graphormer", "Graphormer"),
        ("grounding-dino", "Grounding DINO"),
        ("groupvit", "GroupViT"),
@@ -725,6 +727,7 @@
        ("siglip_vision_model", "siglip"),
        ("chinese_clip_vision_model", "chinese_clip"),
        ("rt_detr_resnet", "rt_detr"),
+       ("granitevision", "llava_next"),
    ]
)
diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py
index 61c4d31abcdc..816189e15cbd 100644
--- a/tests/models/vipllava/test_modeling_vipllava.py
+++ b/tests/models/vipllava/test_modeling_vipllava.py
@@ -273,7 +273,7 @@ def test_vision_feature_layers(self, vision_feature_layers):
        """
        # NOTE: vipllava uses vision_feature_layers instead of vision_feature_layer as the
        # config key. The reason is that other llava classes supported one vision feature layer
-       # and added support for a list of layers with multimodal granite support, while vipllava
+       # and added support for a list of layers with granite vision support, while vipllava
        # originally supported multiple feature layers, and added support for a single layer
        # for compatibility reasons.
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()