Fix: subtract 1 instead of num_additional_image_tokens from num_image_tokens for the "default" strategy in VideoLlavaProcessor

huggingface · zucchini-nlp · Jan 8, 2025 · Dec 3, 2024 · Dec 3, 2024 · Dec 3, 2024
commit 114283b8109644a7395f889f3ac4887c12522986
diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
@@ -187,7 +187,7 @@ def __call__(
             ) + self.num_additional_image_tokens
             num_video_tokens = num_image_tokens * num_frames
             if self.vision_feature_select_strategy == "default":
-                num_image_tokens -= self.num_additional_image_tokens
+                num_image_tokens -= 1
 
             prompt_strings = []
             for sample in text: