huggingface · zucchini-nlp · Jan 29, 2025
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
@@ -198,6 +198,11 @@ def __call__(
             elif not isinstance(text, list) and not isinstance(text[0], str):
                 raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
+            # The first prompt already carrying BOS means special tokens must not be added again; in every
+            # other case the caller's `add_special_tokens` value is kept exactly as it was passed in.
+            if self.bos_token is not None and text[0].startswith(self.bos_token):
+                output_kwargs["text_kwargs"]["add_special_tokens"] = False
+
             # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
             fake_image_token = self.fake_image_token.content
             image_token = self.image_token.content

diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
@@ -156,6 +156,7 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, ch
             ]
         }
         tokenizer.add_special_tokens(tokens_to_add)
+        self.bos_token = tokenizer.bos_token  # use the argument: `self.tokenizer` is not set until `super().__init__()`
 
         super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs)
 
@@ -249,6 +250,11 @@ def __call__(
                 raise ValueError("Invalid input text. Please provide a string, or a list of strings")
             n_images_in_text = [sample.count(self.image_token.content) for sample in text]
 
+        # This check sits outside the text-validation block above, so guard against `text` being None before
+        # indexing it. Only ever switch `add_special_tokens` off; never force the caller's value back to True.
+        if text is not None and self.bos_token is not None and text[0].startswith(self.bos_token):
+            output_kwargs["text_kwargs"]["add_special_tokens"] = False
+
         if images is not None:
             if is_image_or_image_url(images):
                 images = [[images]]