huggingface · Spycsh · Nov 4, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024
@@ -35,6 +35,8 @@ Models that have been validated:
   - [meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct)
   - [tiiuae/falcon-11B-vlm](https://huggingface.co/tiiuae/falcon-11B-vlm)
   - [google/paligemma-3b-mix-224](https://huggingface.co/google/paligemma-3b-mix-224)
+  - [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)
+
 
 ### Inference with BF16
 
@@ -122,6 +124,24 @@ python3 run_pipeline.py \
     --sdp_on_bf16
 ```
 
+To run Qwen/Qwen2-VL-2B-Instruct inference, use the following command:
+
+```bash
+python3 run_pipeline.py \
+    --model_name_or_path Qwen/Qwen2-VL-2B-Instruct \
+    --use_hpu_graphs \
+    --bf16
+```
+
+To run Qwen/Qwen2-VL-7B-Instruct inference, use the following command:
+
+```bash
+python3 run_pipeline.py \
+    --model_name_or_path Qwen/Qwen2-VL-7B-Instruct \
+    --use_hpu_graphs \
+    --bf16
+```
+
 ### Inference with FP8
 Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch.
 

@@ -199,7 +199,7 @@ def main():
 
     config = AutoConfig.from_pretrained(args.model_name_or_path)
     model_type = config.model_type
-    if args.image_path is None and model_type in ["llava", "idefics2", "mllama"]:
+    if args.image_path is None and model_type in ["llava", "idefics2", "mllama", "qwen2_vl"]:
         args.image_path = ["https://llava-vl.github.io/static/images/view.jpg"]
     elif args.image_path is None and model_type == "paligemma":
         args.image_path = [
@@ -210,7 +210,7 @@ def main():
             "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
         ]
 
-    if model_type in ["llava", "idefics2", "llava_next", "mllama", "paligemma"]:
+    if model_type in ["llava", "idefics2", "llava_next", "mllama", "paligemma", "qwen2_vl"]:
         processor = AutoProcessor.from_pretrained(args.model_name_or_path)
         if args.prompt is None:
             if processor.chat_template is not None:
@@ -289,13 +289,19 @@ def main():
         generator = pipeline(
             "image-to-text",
             model=args.model_name_or_path,
+            config=args.model_name_or_path,
+            tokenizer=args.model_name_or_path,
+            image_processor=args.model_name_or_path,
             torch_dtype=model_dtype,
             device="hpu",
         )
         if args.use_hpu_graphs:
             from habana_frameworks.torch.hpu import wrap_in_hpu_graph
-
-            generator.model = wrap_in_hpu_graph(generator.model)
+            if "Qwen2-VL" in args.model_name_or_path:
+                # only wrap language model part
+                generator.model.model = wrap_in_hpu_graph(generator.model.model)
+            else:
+                generator.model = wrap_in_hpu_graph(generator.model)
 
     if "falcon-11B-vlm" in args.model_name_or_path:
         # Workaround for the falcon vlm issue where image_token_id == embed size.
@@ -310,7 +316,7 @@ def main():
         "limit_hpu_graphs": args.limit_hpu_graphs,
     }
 
-    if args.sdp_on_bf16:
+    if args.sdp_on_bf16 and "Llama-3.2-11B-Vision-Instruct" in args.model_name_or_path:
         torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
 
     if args.use_kv_cache:
@@ -321,7 +327,7 @@ def main():
         htcore.hpu_initialize(generator.model)
 
     # Delete once the pipeline integrates AutoProcessor as its preprocessing engine.
-    if model_type in ["idefics2", "mllama", "paligemma"]:
+    if model_type in ["idefics2", "mllama", "paligemma", "qwen2_vl"]:
         from transformers.image_utils import load_image
 
         def preprocess(self, image, prompt=None, timeout=None):

@@ -109,6 +109,7 @@
     "qwen2_moe",
     "xglm",
     "whisper",
+    "qwen2_vl",
     "paligemma",
     "idefics2",
     "mllama",
@@ -860,7 +861,11 @@ def _prepare_cache_for_generation(
 
         # Use tuples by default (i.e. legacy format).
         else:
-            return
+            model_kwargs[cache_name] = (
+                DynamicCache()
+                if not requires_cross_attention_cache
+                else EncoderDecoderCache(DynamicCache(), DynamicCache())
+            )
 
     @torch.no_grad()
     def generate(

@@ -28,6 +28,7 @@
     gaudi_StoppingCriteriaList_call,
 )
 from .models import (
+    GAUDI_QWEN2_VL_ATTENTION_CLASSES,
     GAUDI_WHISPER_ATTENTION_CLASSES,
     BaichuanConfig,
     BaichuanForCausalLM,
@@ -140,6 +141,10 @@
     GaudiQwen2MoeForCausalLM,
     GaudiQwen2MoeMLP,
     GaudiQwen2MoeModel,
+    GaudiQwen2VLDecoderLayer,
+    GaudiQwen2VLForConditionalGeneration,
+    GaudiQwen2VLModel,
+    GaudiQwen2VLSdpaAttention,
     GaudiStableLmAttention,
     GaudiStableLmDecoderLayer,
     GaudiStableLmForCausalLM,
@@ -684,6 +689,15 @@ def adapt_transformers_to_gaudi():
     transformers.models.whisper.modeling_whisper.WhisperForConditionalGeneration = GaudiWhisperForConditionalGeneration
     transformers.models.whisper.modeling_whisper.WHISPER_ATTENTION_CLASSES = GAUDI_WHISPER_ATTENTION_CLASSES
 
+    # Optimization for Qwen2-VL on Gaudi
+    transformers.models.qwen2_vl.modeling_qwen2_vl.QWEN2_VL_ATTENTION_CLASSES = GAUDI_QWEN2_VL_ATTENTION_CLASSES
+    transformers.models.qwen2_vl.modeling_qwen2_vl.Qwen2VLDecoderLayer = GaudiQwen2VLDecoderLayer
+    transformers.models.qwen2_vl.modeling_qwen2_vl.Qwen2VLSdpaAttention = GaudiQwen2VLSdpaAttention
+    transformers.models.qwen2_vl.modeling_qwen2_vl.Qwen2VLModel = GaudiQwen2VLModel
+    transformers.models.qwen2_vl.modeling_qwen2_vl.Qwen2VLForConditionalGeneration = (
+        GaudiQwen2VLForConditionalGeneration
+    )
+
     # Optimization for mllama on Gaudi
     transformers.models.mllama.modeling_mllama.MllamaSelfAttentionDecoderLayer = GaudiMllamaSelfAttentionDecoderLayer
     transformers.models.mllama.modeling_mllama.MllamaCrossAttentionDecoderLayer = GaudiMllamaCrossAttentionDecoderLayer

@@ -247,6 +247,13 @@
     gaudi_qwen2moe_block_sparse_moe_forward,
     gaudi_qwen2moe_rmsnorm_forward,
 )
+from .qwen2_vl import (
+    GAUDI_QWEN2_VL_ATTENTION_CLASSES,
+    GaudiQwen2VLDecoderLayer,
+    GaudiQwen2VLForConditionalGeneration,
+    GaudiQwen2VLModel,
+    GaudiQwen2VLSdpaAttention,
+)
 from .seamless_m4t import (
     gaudi_SeamlessM4TAttention_forward,
     gaudi_SeamlessM4TCodeHifiGan_get_output_hifigan_lengths,

@@ -0,0 +1,7 @@
+from .modeling_qwen2_vl import (
+    GAUDI_QWEN2_VL_ATTENTION_CLASSES,
+    GaudiQwen2VLDecoderLayer,
+    GaudiQwen2VLForConditionalGeneration,
+    GaudiQwen2VLModel,
+    GaudiQwen2VLSdpaAttention,
+)