From 3cc90510d25e2fff9a26ff9998a6a20a378893b8 Mon Sep 17 00:00:00 2001 From: matatonic Date: Fri, 6 Dec 2024 23:05:16 -0500 Subject: [PATCH] 0.40.0 +smolvlm, +paligemma2, -deprec, updates --- Dockerfile | 7 +- README.md | 42 +++++-- backend/llavanextgit.py | 2 + backend/ovis16.py | 2 + backend/paligemma.py | 61 ++++++++++ backend/qwen2-vl.py | 18 ++- backend/smolvlm.py | 47 ++++++++ model_conf_tests.json | 18 ++- requirements.txt | 3 +- vision.sample.env | 240 ++++++++++++++++++++-------------------- vision_qna.py | 12 +- 11 files changed, 293 insertions(+), 159 deletions(-) create mode 100644 backend/paligemma.py create mode 100644 backend/smolvlm.py diff --git a/Dockerfile b/Dockerfile index f6e40e1..bd731ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,20 +6,16 @@ RUN --mount=type=cache,target=/root/.cache/pip pip install --upgrade pip WORKDIR /app RUN git clone https://github.com/TIGER-AI-Lab/Mantis.git --single-branch /app/Mantis && \ - git clone https://github.com/togethercomputer/Dragonfly --single-branch /app/Dragonfly && \ git clone https://github.com/baaivision/Emu3 --single-branch /app/Emu3 COPY requirements.txt . ARG VERSION=latest -RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "transformers>=4.45.2" >> requirements.txt ; fi +RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "transformers>=4.47.0" >> requirements.txt ; fi RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt WORKDIR /app/Mantis RUN --mount=type=cache,target=/root/.cache/pip pip install --no-deps -e . -WORKDIR /app/Dragonfly -RUN --mount=type=cache,target=/root/.cache/pip pip install --no-deps -e . - WORKDIR /app COPY *.py model_conf_tests.json README.md LICENSE /app/ @@ -31,6 +27,7 @@ ARG GROUP_ID=1000 ENV GROUP_ID=${GROUP_ID} RUN groupadd -g ${GROUP_ID} openedai && \ useradd -r -u ${USER_ID} -g ${GROUP_ID} -M -d /app openedai +RUN chown openedai:openedai /app # for .triton, .config/matplotlib USER openedai ENV CLI_COMMAND="python vision.py" diff --git a/README.md b/README.md index 6941e9c..f9ddb79 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,9 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/ Full list of supported models - [X] [AIDC-AI](https://huggingface.co/AIDC-AI) +- - [X] [Ovis1.6-Llama3.2-3B](https://huggingface.co/AIDC-AI/Ovis1.6-Llama3.2-3B) - - [X] [Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B) +- - [X] [Ovis1.6-Gemma2-27B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-27B) - - [X] [Ovis1.5-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.5-Gemma2-9B) - - [X] [Ovis1.5-Llama3-8B](https://huggingface.co/AIDC-AI/Ovis1.5-Llama3-8B) - [X] [Ai2](https://huggingface.co/allenai) @@ -23,6 +25,7 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/ - - [X] [Molmo-7B-D-0924](https://huggingface.co/allenai/Molmo-7B-D-0924) - - [X] [MolmoE-1B-0924](https://huggingface.co/allenai/MolmoE-1B-0924) - [X] [BAAI](https://huggingface.co/BAAI/) +- - [X] [BAAI/Aquila-VL-2B-llava-qwen](https://huggingface.co/BAAI/Aquila-VL-2B-llava-qwen) - - [X] [BAAI/Bunny-v1_0-2B-zh](https://huggingface.co/BAAI/Bunny-v1_0-2B-zh) - - [X] [BAAI/Bunny-v1_0-3B-zh](https://huggingface.co/BAAI/Bunny-v1_0-3B-zh) - - [X] [BAAI/Bunny-v1_0-3B](https://huggingface.co/BAAI/Bunny-v1_0-3B) @@ -45,19 +48,24 @@ Can't decide which to use? 
See the [OpenVLM Leaderboard](https://huggingface.co/ - - [X] [joy-caption-alpha-two](https://huggingface.co/spaces/fancyfeast/joy-caption-alpha-two) (with experimental multi-image support) - - [X] [joy-caption-pre-alpha](https://huggingface.co/spaces/fancyfeast/joy-caption-pre-alpha) (caption only) - [X] [fuyu-8b](https://huggingface.co/adept/fuyu-8b) [pretrain] -- [X] [HuggingFaceM4/idefics2](https://huggingface.co/HuggingFaceM4) +- [X] [Google](https://huggingface.co/google) +- - [X] [paligemma2-3b](https://huggingface.co/google/paligemma2-3b-ft-docci-448) +- - [X] [paligemma2-10b](https://huggingface.co/google/paligemma2-10b-ft-docci-448) +- [X] [HuggingFaceM4](https://huggingface.co/HuggingFaceM4) - - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (wont gpu split, alternate docker only) - - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (wont gpu split, alternate docker only) - - [X] [idefics2-8b-chatty](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty) (wont gpu split, alternate docker only) - - [X] [idefics2-8b-chatty-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty-AWQ) (wont gpu split, alternate docker only) +- [X] [HuggingFaceTB](https://huggingface.co/HuggingFaceTB) +- - [X] [SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) - [X] [InternLM](https://huggingface.co/internlm/) - - [X] [XComposer2-2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b) (wont gpu split) - - [X] [XComposer2-4KHD-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b) (wont gpu split) -- - [X] [XComposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b) [finetune] (wont gpu split) -- - [X] [XComposer2-7b-4bit](https://huggingface.co/internlm/internlm-xcomposer2-7b-4bit) (not recommended) -- - [X] [XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] (wont gpu split) -- - [X] [XComposer2-VL-4bit](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b-4bit) -- - [X] [XComposer2-VL-1.8b](https://huggingface.co/internlm/internlm-xcomposer2-vl-1_8b) +- - [X] [XComposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b) [finetune] (wont gpu split) (0.39.2 only) +- - [X] [XComposer2-7b-4bit](https://huggingface.co/internlm/internlm-xcomposer2-7b-4bit) (not recommended) (0.39.2 only) +- - [X] [XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] (wont gpu split) (0.39.2 only) +- - [X] [XComposer2-VL-4bit](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b-4bit) (0.39.2 only) +- - [X] [XComposer2-VL-1.8b](https://huggingface.co/internlm/internlm-xcomposer2-vl-1_8b) (0.39.2 only) - [X] [LMMs-Lab](https://huggingface.co/lmms-lab) - - [X] [llava-onevision-qwen2-0.5b-ov](https://huggingface.co/lmms-lab/llava-onevision-qwen2-0.5b-ov) - - [X] [llava-onevision-qwen2-7b-ov](https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov) @@ -82,7 +90,7 @@ Can't decide which to use? 
See the [OpenVLM Leaderboard](https://huggingface.co/ - [X] [Mistral AI](https://huggingface.co/mistralai) - - [X] [Pixtral-12B](https://huggingface.co/mistralai/Pixtral-12B-2409) - [X] [mx262/MiniMonkey](https://huggingface.co/mx262/MiniMonkey) -- [X] [nvidia/NVLM-D-72B](https://huggingface.co/nvidia/NVLM-D-72B) +- [X] [nvidia/NVLM-D-72B](https://huggingface.co/nvidia/NVLM-D-72B) (0.39.2 only) - [X] [omlab/omchat-v2.0-13B-single-beta_hf](https://huggingface.co/omlab/omchat-v2.0-13B-single-beta_hf) (alt docker) - [X] [openbmb](https://huggingface.co/openbmb) - - [X] [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) (video not supported yet) @@ -121,8 +129,8 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/ - - [X] [Mantis-8B-clip-llama3](https://huggingface.co/TIGER-Lab/Mantis-8B-clip-llama3) (wont gpu split, alt docker) - - [X] [Mantis-8B-Fuyu](https://huggingface.co/TIGER-Lab/Mantis-8B-Fuyu) (wont gpu split) - [X] [Together.ai](https://huggingface.co/togethercomputer) -- - [X] [Llama-3-8B-Dragonfly-v1](https://huggingface.co/togethercomputer/Llama-3-8B-Dragonfly-v1) -- - [X] [Llama-3-8B-Dragonfly-Med-v1](https://huggingface.co/togethercomputer/Llama-3-8B-Dragonfly-Med-v1) +- - [X] [Llama-3-8B-Dragonfly-v1](https://huggingface.co/togethercomputer/Llama-3-8B-Dragonfly-v1) (0.39.2 only) +- - [X] [Llama-3-8B-Dragonfly-Med-v1](https://huggingface.co/togethercomputer/Llama-3-8B-Dragonfly-Med-v1) (0.39.2 only) - [X] [qihoo360](https://huggingface.co/qihoo360) - - [X] [360VL-8B](https://huggingface.co/qihoo360/360VL-8B) (alt docker) - - [X] [360VL-70B](https://huggingface.co/qihoo360/360VL-70B) (untested) @@ -132,12 +140,16 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/ - [X] [qresearch](https://huggingface.co/qresearch/) - - [X] [llama-3-vision-alpha-hf](https://huggingface.co/qresearch/llama-3-vision-alpha-hf) (wont gpu split) - [X] [Qwen](https://huggingface.co/Qwen/) +- - [X] [Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct) (untested) - - [X] [Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ) +- - [X] [Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4) - - [X] [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) - - [X] [Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ) +- - [X] [Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4) - - [X] [Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) - - [X] [Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ) -- - [X] [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat) +- - [X] [Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4) + - [X] [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat) - [X] [stepfun-ai/GOT-OCR2_0](https://huggingface.co/stepfun-ai/GOT-OCR2_0) (ocr only model) - [X] [vikhyatk](https://huggingface.co/vikhyatk) - - [X] [moondream2](https://huggingface.co/vikhyatk/moondream2) @@ -159,6 +171,16 @@ If you can't find your favorite model, you can [open a new issue](https://github ## Recent updates +Version 0.40.0 + +- new model support: AIDC-AI/Ovis1.6-Llama3.2-3B, AIDC-AI/Ovis1.6-Gemma2-27B +- new model support: BAAI/Aquila-VL-2B-llava-qwen +- new model support: HuggingFaceTB/SmolVLM-Instruct +- new model support: google/paligemma2 family of models (very limited 
instruct/chat training so far) +- Qwen2-VL: unpin Qwen2-VL-7B & remove Qwen hacks, GPTQ-Int4/8 working again (still slow - why?) +- pin bitsandbytes==0.44.1 +- ⚠️ DEPRECATED MODELS (use the `0.39.2` docker image for support of these models): internlm-xcomposer2-7b, internlm-xcomposer2-7b-4bit, internlm-xcomposer2-vl-1_8b, internlm-xcomposer2-vl-7b, internlm-xcomposer2-vl-7b-4bit, nvidia/NVLM-D-72B, Llama-3-8B-Dragonfly-Med-v1, Llama-3-8B-Dragonfly-v1 + Version 0.39.2 - performance: use float16 with Qwen2 AWQ, small performance improvement diff --git a/backend/llavanextgit.py b/backend/llavanextgit.py index e703199..045aa2b 100644 --- a/backend/llavanextgit.py +++ b/backend/llavanextgit.py @@ -9,6 +9,8 @@ # lmms-lab/llava-onevision-qwen2-72b-ov # lmms-lab/llava-onevision-qwen2-72b-si +# BAAI/Aquila-VL-2B-llava-qwen + import warnings warnings.filterwarnings("ignore") diff --git a/backend/ovis16.py b/backend/ovis16.py index 1233869..cbd73f3 100644 --- a/backend/ovis16.py +++ b/backend/ovis16.py @@ -2,7 +2,9 @@ from vision_qna import * +# AIDC-AI/Ovis1.6-Llama3.2-3B # AIDC-AI/Ovis1.6-Gemma2-9B +# AIDC-AI/Ovis1.6-Gemma2-27B IMAGE_TOKEN = "<image>" diff --git a/backend/paligemma.py b/backend/paligemma.py new file mode 100644 index 0000000..8fd8ccc --- /dev/null +++ b/backend/paligemma.py @@ -0,0 +1,61 @@ +# "google/paligemma2-3b-ft-docci-448" +# "google/paligemma2-10b-ft-docci-448" +# "google/paligemma2-28b-pt-896" - pretrain + +from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration +from vision_qna import * + +class VisionQnA(VisionQnABase): + model_name: str = "paligemma2" + format: str = "gemma" # doesn't seem to actually be instruction trained + visual_layers: List[str] = ["vision_tower", "multi_modal_projector"] + + def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): + super().__init__(model_id, device, device_map, extra_params, format) + + if not format: + self.format = guess_model_format(model_id) + + for i in ['trust_remote_code']: + del self.params[i] + + self.model = PaliGemmaForConditionalGeneration.from_pretrained(**self.params).eval() + self.processor = PaliGemmaProcessor.from_pretrained(model_id) + + # bitsandbytes already moves the model to the device, so we don't need to do it again.
+ if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): + self.model = self.model.to(self.device) + + self.loaded_banner() + + async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: + images, prompt = await prompt_from_messages(request.messages, self.format) + + if len(images) < 1: + images = [ await url_to_image(black_pixel_url) ] + prompt = "<image>\n" + prompt + + # Instruct the model to create a caption in English + #prompt = "caption en" + inputs = self.processor(text=prompt, images=images, return_tensors="pt").to(dtype=self.dtype, device=self.device) + + default_params = { + 'do_sample': False, +# 'eos_token_id': self.processor.tokenizer.eos_token_id, +# 'pad_token_id': self.processor.tokenizer.eos_token_id, + } + + params = self.get_generation_params(request, default_params=default_params) + + generation_kwargs = dict( + **inputs, + **params, + ) + + for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): + end = new_text.find(self.processor.tokenizer.eos_token) + if end == -1: + yield new_text + else: + yield new_text[:end] + break diff --git a/backend/qwen2-vl.py b/backend/qwen2-vl.py index 053d78d..def5fca 100644 --- a/backend/qwen2-vl.py +++ b/backend/qwen2-vl.py @@ -9,10 +9,17 @@ # Qwen/Qwen2-VL-7B-Instruct-AWQ # Qwen/Qwen2-VL-7B-Instruct # Qwen/Qwen2-VL-72B-Instruct-AWQ +# Qwen/Qwen2-VL-72B-Instruct +# Not recommended: # X Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4 # X Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8 # X Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4 # X Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8 +# X Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4 +# X Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8 + +# https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4 +# Performance: for A100 80GB, Qwen claims 30-40 T/s; I can't reproduce that with this setup, I see more like 5-10 T/s.
class VisionQnA(VisionQnABase): model_name: str = "qwen2-vl" @@ -22,16 +29,13 @@ class VisionQnA(VisionQnABase): def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): super().__init__(model_id, device, device_map, extra_params, format) - if 'awq' in model_id.lower() and self.dtype == torch.bfloat16: + if ('awq' in model_id.lower() or 'gptq' in model_id.lower()) and self.dtype == torch.bfloat16: self.dtype = self.params['torch_dtype'] = torch.float16 # recommended self.processor = AutoProcessor.from_pretrained(model_id) del self.params['trust_remote_code'] - if model_id == 'Qwen/Qwen2-VL-7B-Instruct-AWQ': - self.params['revision'] = '9d72ae62396aaa1817b006e07ddbbd121024f50d' # re: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ/discussions/4 - self.model = Qwen2VLForConditionalGeneration.from_pretrained(**self.params).eval() self.loaded_banner() @@ -46,12 +50,6 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener msg = { 'role': m.role, 'content': [] } for c in m.content: if c.type == 'image_url': - # hack around https://github.com/QwenLM/Qwen2-VL/issues/202' - if c.image_url.url.startswith('data:image'): - parts = c.image_url.url.split(';') - if parts[1].startswith('charset='): - c.image_url.url = parts[0] + ';' + parts[2] - msg['content'].extend([{'type': c.type, 'image': c.image_url.url}]) elif c.type == 'text': msg['content'].extend([{'type': c.type, 'text': c.text}]) diff --git a/backend/smolvlm.py b/backend/smolvlm.py new file mode 100644 index 0000000..fe894b8 --- /dev/null +++ b/backend/smolvlm.py @@ -0,0 +1,47 @@ +from transformers import AutoProcessor, AutoModelForVision2Seq + +from vision_qna import * + +# HuggingFaceTB/SmolVLM-Instruct + +class VisionQnA(VisionQnABase): + model_name: str = "generic" + format: str = "internal" + visual_layers: List[str] = ["vision_model"] + + def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): + super().__init__(model_id, device, device_map, extra_params, format) + + self.processor = AutoProcessor.from_pretrained(model_id) + self.model = AutoModelForVision2Seq.from_pretrained(**self.params).eval() + + # bitsandbytes already moves the model to the device, so we don't need to do it again. 
+ if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): + self.model = self.model.to(self.device) + + self.loaded_banner() + + async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: + images, messages = await images_hfmessages_from_messages(request.messages) + prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True) + + if len(images) < 1: + images = [ await url_to_image(black_pixel_url) ] + prompt = "<image>\n" + prompt + + inputs = self.processor(text=prompt, images=images, return_tensors="pt").to(self.device) + + params = self.get_generation_params(request) + + generation_kwargs = dict( + **inputs, + **params, + ) + + for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): + end = new_text.find(self.processor.tokenizer.eos_token) + if end == -1: + yield new_text + else: + yield new_text[:end] + break diff --git a/model_conf_tests.json b/model_conf_tests.json index f5d9b4a..0789a2c 100644 --- a/model_conf_tests.json +++ b/model_conf_tests.json @@ -1,7 +1,11 @@ [ + ["AIDC-AI/Ovis1.6-Llama3.2-3B", "-A", "flash_attention_2"], ["AIDC-AI/Ovis1.6-Gemma2-9B", "-A", "flash_attention_2"], + ["AIDC-AI/Ovis1.6-Gemma2-27B", "-A", "flash_attention_2"], ["AIDC-AI/Ovis1.5-Gemma2-9B", "-A", "flash_attention_2"], ["AIDC-AI/Ovis1.5-Llama3-8B", "-A", "flash_attention_2"], + ["BAAI/Aquila-VL-2B-llava-qwen", "-A", "flash_attention_2", "--load-in-4bit"], + ["BAAI/Aquila-VL-2B-llava-qwen", "-A", "flash_attention_2"], ["BAAI/Bunny-v1_0-2B-zh", "--load-in-4bit"], ["BAAI/Bunny-v1_0-2B-zh"], ["BAAI/Bunny-v1_0-3B", "--load-in-4bit"], @@ -15,6 +19,8 @@ ["BAAI/Emu2-Chat", "--max-memory=0:78GiB,1:20GiB"], ["BAAI/Emu3-Chat", "--load-in-4bit", "-A", "flash_attention_2"], ["BAAI/Emu3-Chat", "-A", "flash_attention_2"], + ["HuggingFaceTB/SmolVLM-Instruct", "-A", "flash_attention_2", "--load-in-4bit"], + ["HuggingFaceTB/SmolVLM-Instruct", "-A", "flash_attention_2"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--load-in-4bit"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40", "--load-in-4bit"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40"], @@ -68,14 +74,11 @@ ["fancyfeast/joy-caption-alpha-two", "-A", "flash_attention_2"], ["fancyfeast/joy-caption-pre-alpha", "--load-in-4bit", "-A", "flash_attention_2"], ["fancyfeast/joy-caption-pre-alpha", "-A", "flash_attention_2"], + ["google/paligemma2-3b-ft-docci-448", "-A", "flash_attention_2"], + ["google/paligemma2-10b-ft-docci-448", "-A", "flash_attention_2"], ["internlm/internlm-xcomposer2d5-7b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], ["internlm/internlm-xcomposer2d5-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"], ["internlm/internlm-xcomposer2-4khd-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"], - ["internlm/internlm-xcomposer2-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"], - ["internlm/internlm-xcomposer2-7b-4bit", "-A", "flash_attention_2"], - ["internlm/internlm-xcomposer2-vl-1_8b", "-A", "flash_attention_2", "--device-map", "cuda:0"], - ["internlm/internlm-xcomposer2-vl-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"], - ["internlm/internlm-xcomposer2-vl-7b-4bit", "-A", "flash_attention_2"], ["llava-hf/llava-1.5-13b-hf", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], ["llava-hf/llava-1.5-13b-hf", "-A", 
"flash_attention_2", "--device-map", "cuda:0"], ["llava-hf/llava-1.5-7b-hf", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], @@ -102,7 +105,6 @@ ["mistralai/Pixtral-12B-2409"], ["mx262/MiniMonkey", "-A", "flash_attention_2", "--load-in-4bit"], ["mx262/MiniMonkey", "-A", "flash_attention_2"], - ["nvidia/NVLM-D-72B", "-A", "flash_attention_2", "--load-in-4bit"], ["openbmb/MiniCPM-V-2_6-int4", "-A", "flash_attention_2", "--device-map", "cuda:0"], ["openbmb/MiniCPM-V-2_6", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], ["openbmb/MiniCPM-V-2_6", "-A", "flash_attention_2", "--device-map", "cuda:0"], @@ -115,10 +117,6 @@ ["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0", "--load-in-4bit"], ["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0"], ["rhymes-ai/Aria", "-A", "flash_attention_2"], - ["togethercomputer/Llama-3-8B-Dragonfly-Med-v1", "--load-in-4bit"], - ["togethercomputer/Llama-3-8B-Dragonfly-Med-v1"], - ["togethercomputer/Llama-3-8B-Dragonfly-v1", "--load-in-4bit"], - ["togethercomputer/Llama-3-8B-Dragonfly-v1"], ["vikhyatk/moondream2", "-A", "flash_attention_2", "--load-in-4bit"], ["vikhyatk/moondream2", "-A", "flash_attention_2"] ] diff --git a/requirements.txt b/requirements.txt index 1946ef4..9313d3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,8 @@ accelerate # auto_gptq and autoawq lack pre-compiled support for python 3.12 auto_gptq; python_version != "3.12" autoawq; python_version != "3.12" -bitsandbytes +autoawq_kernels; python_version != "3.12" +bitsandbytes==0.44.1 fastapi # See: https://github.com/bdashore3/flash-attention/releases for other windows flash_attn releases # And: https://github.com/Dao-AILab/flash-attention/releases for linux. diff --git a/vision.sample.env b/vision.sample.env index 208cc1e..6925709 100644 --- a/vision.sample.env +++ b/vision.sample.env @@ -6,125 +6,123 @@ HF_HUB_ENABLE_HF_TRANSFER=1 #CUDA_VISIBLE_DEVICES=1,0 #OPENEDAI_DEVICE_MAP="sequential" -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.6-Gemma2-9B -A flash_attention_2" # test pass✅, time: 32.3s, mem: 22.8GB, 13/13 tests passed, (133/10.2s) 13.1 T/s -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Gemma2-9B -A flash_attention_2" # test pass✅, time: 9.1s, mem: 23.2GB, 13/13 tests passed, (32/2.6s) 12.5 T/s -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Llama3-8B -A flash_attention_2" # test pass✅, time: 5.5s, mem: 19.2GB, 13/13 tests passed, (32/1.4s) 22.8 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.0s, mem: 9.5GB, 13/13 tests passed, (39/1.5s) 25.3 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.9s, mem: 10.8GB, 13/13 tests passed, (38/1.1s) 36.1 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 10.1s, mem: 8.5GB, 13/13 tests passed, (59/2.7s) 21.8 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 8.4s, mem: 12.0GB, 13/13 tests passed, (70/2.2s) 32.5 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.4s, mem: 12.7GB, 13/13 tests passed, (37/1.7s) 21.6 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 8.7s, mem: 5.1GB, 13/13 tests passed, (48/2.5s) 19.4 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.4s, mem: 12.2GB, 13/13 tests passed, (44/1.8s) 24.5 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, 
time: 10.2s, mem: 5.8GB, 13/13 tests passed, (44/2.9s) 15.3 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 7.6s, mem: 13.0GB, 13/13 tests passed, (35/2.2s) 16.0 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 32.2s, mem: 29.4GB, 13/13 tests passed, (83/10.3s) 8.1 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 21.6s, mem: 71.9GB, 13/13 tests passed, (103/8.3s) 12.4 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat --load-in-4bit -A flash_attention_2" # test pass✅, time: 63.3s, mem: 65.8GB, 13/13 tests passed, (137/20.4s) 6.7 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat -A flash_attention_2" # test pass✅, time: 67.3s, mem: 76.1GB, 13/13 tests passed, (159/21.8s) 7.3 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 22.7s, mem: 27.6GB, 13/13 tests passed, (60/7.1s) 8.4 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 29.3s, mem: 30.9GB, 13/13 tests passed, (58/9.3s) 6.2 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 25.0s, mem: 55.8GB, 13/13 tests passed, (45/7.9s) 5.7 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 18.6s, mem: 52.7GB, 13/13 tests passed, (50/5.8s) 8.7 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 30.1s, mem: 2.0GB, 13/13 tests passed, (271/9.8s) 27.5 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 7.6s, mem: 2.7GB, 13/13 tests passed, (77/2.1s) 36.4 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 16.4s, mem: 5.5GB, 13/13 tests passed, (156/5.1s) 30.6 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.1s, mem: 7.3GB, 13/13 tests passed, (90/2.4s) 37.1 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.5s, mem: 9.2GB, 13/13 tests passed, (43/2.7s) 15.9 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 19.0GB, 13/13 tests passed, (43/2.2s) 19.6 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 25.1s, mem: 27.6GB, 13/13 tests passed, (75/8.0s) 9.4 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 19.9s, mem: 52.8GB, 13/13 tests passed, (59/6.3s) 9.4 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 37.9s, mem: 31.6GB, 13/13 tests passed, (82/11.9s) 6.9 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 47.0s, mem: 76.6GB, 13/13 tests passed, (140/15.1s) 9.3 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 40.1s, mem: 51.4GB, 13/13 tests passed, (40/12.6s) 3.2 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.2s, mem: 5.8GB, 13/13 tests passed, (42/1.7s) 24.3 T/s -#CLI_COMMAND="python 
vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 7.1s, mem: 8.1GB, 13/13 tests passed, (42/2.0s) 20.8 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.3s, mem: 10.0GB, 13/13 tests passed, (48/1.8s) 27.4 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.5s, mem: 7.7GB, 13/13 tests passed, (48/1.5s) 32.9 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.1s, mem: 11.2GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.6s, mem: 19.6GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 12.5s, mem: 6.9GB, 13/13 tests passed, (44/3.9s) 11.4 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct -A flash_attention_2" # test pass✅, time: 19.0s, mem: 16.4GB, 13/13 tests passed, (36/5.6s) 6.4 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 22.9s, mem: 18.6GB, 13/13 tests passed, (36/6.9s) 5.2 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct -A flash_attention_2" # test pass✅, time: 19.6s, mem: 27.6GB, 13/13 tests passed, (31/6.1s) 5.1 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-72B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 35.4s, mem: 45.3GB, 13/13 tests passed, (31/11.1s) 2.8 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" # test pass✅, time: 8.1s, mem: 9.4GB, 13/13 tests passed, (68/2.3s) 29.4 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" # test pass✅, time: 3.4s, mem: 9.4GB, 13/13 tests passed, (10/0.7s) 13.5 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5" # test pass✅, time: 8.8s, mem: 9.4GB, 13/13 tests passed, (73/2.4s) 30.3 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 8.8s, mem: 9.9GB, 13/13 tests passed, (74/2.5s) 30.1 T/s -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.1s, mem: 11.6GB, 13/13 tests passed, (14/2.6s) 5.3 T/s -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.6s, mem: 20.8GB, 13/13 tests passed, (22/2.5s) 8.8 T/s -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.6s, mem: 15.9GB, 13/13 tests passed, (92/6.9s) 13.4 T/s -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 16.6s, mem: 26.0GB, 13/13 tests passed, (79/5.1s) 15.6 T/s -#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 123.4s, mem: 6.1GB, 13/13 tests passed, (104/40.5s) 2.6 T/s -#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 91.0s, mem: 6.4GB, 13/13 tests passed, (84/29.6s) 2.8 T/s -#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2" # test pass✅, time: 26.0s, mem: 15.3GB, 13/13 tests passed, (40/8.2s) 4.9 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 47.3s, mem: 7.9GB, 13/13 tests passed, (318/15.4s) 20.6 T/s 
-#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 39.0s, mem: 8.3GB, 13/13 tests passed, (310/12.4s) 24.9 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2" # test pass✅, time: 30.4s, mem: 18.2GB, 13/13 tests passed, (302/9.6s) 31.5 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 39.8s, mem: 8.6GB, 13/13 tests passed, (214/12.7s) 16.9 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 36.4s, mem: 8.9GB, 13/13 tests passed, (214/11.8s) 18.2 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2" # test pass✅, time: 28.6s, mem: 18.6GB, 13/13 tests passed, (208/9.4s) 22.1 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 127.7s, mem: 43.3GB, 13/13 tests passed, (285/41.8s) 6.8 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 110.3s, mem: 48.3GB, 13/13 tests passed, (271/35.8s) 7.6 T/s -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 14.7s, mem: 15.8GB, 13/13 tests passed, (49/4.4s) 11.0 T/s -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.2s, mem: 21.8GB, 13/13 tests passed, (32/3.4s) 9.5 T/s -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2 --load-in-4bit" # test pass✅, time: 10.1s, mem: 7.5GB, 13/13 tests passed, (37/2.9s) 12.6 T/s -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2" # test pass✅, time: 8.6s, mem: 12.5GB, 13/13 tests passed, (37/2.5s) 14.9 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two --load-in-4bit -A flash_attention_2" # test pass✅, time: 58.7s, mem: 9.2GB, 13/13 tests passed, (192/15.8s) 12.1 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two -A flash_attention_2" # test pass✅, time: 26.0s, mem: 18.8GB, 13/13 tests passed, (131/7.3s) 18.1 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha --load-in-4bit -A flash_attention_2" # test pass✅, time: 114.8s, mem: 8.5GB, 13/13 tests passed, (703/39.3s) 17.9 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha -A flash_attention_2" # test pass✅, time: 58.2s, mem: 18.1GB, 13/13 tests passed, (692/19.1s) 36.2 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.6-Llama3.2-3B -A flash_attention_2" # test pass✅, time: 12.9s, mem: 10.6GB, 13/13 tests passed, (125/3.9s) 32.3 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.6-Gemma2-9B -A flash_attention_2" # test pass✅, time: 33.6s, mem: 22.7GB, 13/13 tests passed, (133/10.8s) 12.3 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.6-Gemma2-27B -A flash_attention_2" # test pass✅, time: 29.1s, mem: 59.2GB, 13/13 tests passed, (68/9.2s) 7.4 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Gemma2-9B -A flash_attention_2" # test pass✅, time: 9.3s, mem: 23.1GB, 13/13 tests passed, (32/2.6s) 12.2 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Llama3-8B -A flash_attention_2" # test pass✅, time: 6.0s, mem: 19.1GB, 13/13 tests passed, (32/1.5s) 21.6 T/s +#CLI_COMMAND="python vision.py -m BAAI/Aquila-VL-2B-llava-qwen -A flash_attention_2 --load-in-4bit" # test pass✅, 
time: 26.2s, mem: 9.7GB, 13/13 tests passed, (27/8.0s) 3.4 T/s +#CLI_COMMAND="python vision.py -m BAAI/Aquila-VL-2B-llava-qwen -A flash_attention_2" # test pass✅, time: 9.5s, mem: 11.0GB, 13/13 tests passed, (27/2.7s) 10.1 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 7.4s, mem: 8.3GB, 13/13 tests passed, (39/1.4s) 27.2 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.5s, mem: 10.7GB, 13/13 tests passed, (38/1.1s) 33.1 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 10.0s, mem: 8.1GB, 13/13 tests passed, (59/2.6s) 22.8 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 8.8s, mem: 11.7GB, 13/13 tests passed, (70/2.2s) 31.4 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.7s, mem: 12.2GB, 13/13 tests passed, (37/1.8s) 21.1 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 8.3s, mem: 7.1GB, 13/13 tests passed, (48/2.3s) 20.7 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.8s, mem: 12.0GB, 13/13 tests passed, (44/1.8s) 24.1 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 9.9s, mem: 7.9GB, 13/13 tests passed, (44/2.9s) 15.2 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.4s, mem: 12.9GB, 13/13 tests passed, (35/2.3s) 15.2 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 30.0s, mem: 29.2GB, 13/13 tests passed, (72/8.3s) 8.7 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 19.4s, mem: 71.7GB, 13/13 tests passed, (85/6.6s) 12.8 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat --load-in-4bit -A flash_attention_2" # test pass✅, time: 63.1s, mem: 54.3GB, 13/13 tests passed, (137/20.3s) 6.7 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat -A flash_attention_2" # test pass✅, time: 67.1s, mem: 64.6GB, 13/13 tests passed, (159/21.7s) 7.3 T/s +#CLI_COMMAND="python vision.py -m HuggingFaceTB/SmolVLM-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 9.6s, mem: 4.7GB, 13/13 tests passed, (33/2.7s) 12.4 T/s +#CLI_COMMAND="python vision.py -m HuggingFaceTB/SmolVLM-Instruct -A flash_attention_2" # test pass✅, time: 8.0s, mem: 7.6GB, 13/13 tests passed, (33/2.1s) 15.5 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.8s, mem: 26.7GB, 13/13 tests passed, (60/6.7s) 8.9 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 28.6s, mem: 29.9GB, 13/13 tests passed, (58/9.0s) 6.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 25.2s, mem: 54.5GB, 13/13 tests passed, (45/7.9s) 5.7 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 18.8s, mem: 51.9GB, 13/13 tests passed, (50/5.8s) 8.7 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 25.9s, mem: 1.8GB, 13/13 tests passed, (271/8.4s) 32.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 7.5s, mem: 2.5GB, 13/13 tests passed, (77/2.1s) 37.2 T/s +#CLI_COMMAND="python vision.py -m 
OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.6s, mem: 4.6GB, 13/13 tests passed, (156/4.8s) 32.7 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 9.3s, mem: 6.5GB, 13/13 tests passed, (90/2.4s) 37.3 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.2s, mem: 8.4GB, 13/13 tests passed, (43/2.6s) 16.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 18.1GB, 13/13 tests passed, (43/2.3s) 19.0 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 24.9s, mem: 26.7GB, 13/13 tests passed, (75/7.7s) 9.7 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 20.1s, mem: 51.9GB, 13/13 tests passed, (59/6.3s) 9.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 33.7s, mem: 31.3GB, 13/13 tests passed, (82/10.7s) 7.7 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 44.0s, mem: 76.3GB, 13/13 tests passed, (140/14.2s) 9.9 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 37.9s, mem: 51.2GB, 13/13 tests passed, (40/11.9s) 3.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.3s, mem: 5.0GB, 13/13 tests passed, (42/1.7s) 24.7 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 7.1s, mem: 6.7GB, 13/13 tests passed, (42/2.0s) 21.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.7s, mem: 8.6GB, 13/13 tests passed, (48/1.8s) 26.1 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 6.0s, mem: 6.9GB, 13/13 tests passed, (48/1.5s) 31.5 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.0s, mem: 11.7GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.6s, mem: 19.3GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 14.1s, mem: 6.7GB, 13/13 tests passed, (44/3.7s) 11.8 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct -A flash_attention_2" # test pass✅, time: 14.6s, mem: 16.1GB, 13/13 tests passed, (36/4.3s) 8.3 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 18.9s, mem: 18.3GB, 13/13 tests passed, (36/5.7s) 6.3 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct -A flash_attention_2" # test pass✅, time: 15.7s, mem: 27.4GB, 13/13 tests passed, (31/4.8s) 6.5 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-72B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 40.7s, mem: 44.3GB, 13/13 tests passed, (31/12.7s) 2.4 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" # test pass✅, time: 8.5s, mem: 9.2GB, 13/13 tests passed, (68/2.3s) 29.0 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" # test pass✅, time: 3.8s, mem: 9.2GB, 13/13 tests passed, (10/0.7s) 13.3 T/s +#CLI_COMMAND="python vision.py -m 
Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5" # test pass✅, time: 9.1s, mem: 9.2GB, 13/13 tests passed, (73/2.5s) 28.9 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 9.3s, mem: 9.7GB, 13/13 tests passed, (74/2.6s) 29.0 T/s +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.6s, mem: 11.3GB, 13/13 tests passed, (14/2.7s) 5.1 T/s +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.9s, mem: 20.1GB, 13/13 tests passed, (22/2.7s) 8.3 T/s +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 20.8s, mem: 15.7GB, 13/13 tests passed, (92/6.7s) 13.8 T/s +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 17.0s, mem: 24.8GB, 13/13 tests passed, (79/5.2s) 15.1 T/s +#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 109.5s, mem: 5.7GB, 13/13 tests passed, (104/35.7s) 2.9 T/s +#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 80.0s, mem: 6.0GB, 13/13 tests passed, (84/26.0s) 3.2 T/s +#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2" # test pass✅, time: 26.8s, mem: 15.1GB, 13/13 tests passed, (40/8.3s) 4.8 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 45.2s, mem: 7.7GB, 13/13 tests passed, (318/14.5s) 21.9 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 38.6s, mem: 8.1GB, 13/13 tests passed, (310/12.3s) 25.2 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2" # test pass✅, time: 31.1s, mem: 17.9GB, 13/13 tests passed, (302/10.0s) 30.3 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 38.6s, mem: 8.3GB, 13/13 tests passed, (214/12.4s) 17.3 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 33.8s, mem: 8.6GB, 13/13 tests passed, (214/10.9s) 19.7 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2" # test pass✅, time: 30.0s, mem: 18.3GB, 13/13 tests passed, (208/9.5s) 21.9 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 126.3s, mem: 43.0GB, 13/13 tests passed, (285/41.1s) 6.9 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 110.6s, mem: 48.1GB, 13/13 tests passed, (271/35.8s) 7.6 T/s +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 14.4s, mem: 15.8GB, 13/13 tests passed, (49/4.3s) 11.4 T/s +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.7s, mem: 21.3GB, 13/13 tests passed, (32/3.5s) 9.2 T/s +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2 --load-in-4bit" # test pass✅, time: 10.4s, mem: 7.2GB, 13/13 tests passed, (37/3.0s) 12.5 T/s +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2" # test pass✅, time: 9.2s, mem: 12.1GB, 13/13 tests passed, 
(37/2.6s) 14.1 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two --load-in-4bit -A flash_attention_2" # test pass✅, time: 54.3s, mem: 9.5GB, 13/13 tests passed, (281/22.4s) 12.5 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two -A flash_attention_2" # test pass✅, time: 34.8s, mem: 18.9GB, 13/13 tests passed, (206/11.9s) 17.3 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha --load-in-4bit -A flash_attention_2" # test pass✅, time: 106.5s, mem: 8.7GB, 13/13 tests passed, (689/35.6s) 19.4 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha -A flash_attention_2" # test pass✅, time: 60.5s, mem: 18.2GB, 13/13 tests passed, (686/20.2s) 33.9 T/s +#CLI_COMMAND="python vision.py -m google/paligemma2-3b-ft-docci-448 -A flash_attention_2" # test pass✅, time: 51.7s, mem: 7.2GB, 13/13 tests passed, (425/16.8s) 25.3 T/s +#CLI_COMMAND="python vision.py -m google/paligemma2-10b-ft-docci-448 -A flash_attention_2" # test fail❌, time: 28.7s, mem: 20.1GB, 7/13 tests passed, (129/8.9s) 14.4 T/s #CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.8s, mem: 8.9GB, 1/13 tests passed -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 36.1s, mem: 30.0GB, 13/13 tests passed, (97/13.0s) 7.5 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 20.6s, mem: 22.1GB, 13/13 tests passed, (38/6.0s) 6.3 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 30.6s, mem: 18.3GB, 13/13 tests passed, (76/9.4s) 8.0 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit -A flash_attention_2" # test pass✅, time: 56.0s, mem: 8.8GB, 13/13 tests passed, (83/21.6s) 3.8 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.0s, mem: 7.7GB, 13/13 tests passed, (34/2.5s) 13.7 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 27.8s, mem: 20.7GB, 13/13 tests passed, (60/8.6s) 7.0 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit -A flash_attention_2" # test pass✅, time: 48.0s, mem: 10.5GB, 13/13 tests passed, (62/17.8s) 3.5 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.0s, mem: 9.3GB, 13/13 tests passed, (58/4.5s) 12.9 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 10.7s, mem: 26.5GB, 13/13 tests passed, (59/3.1s) 19.3 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.7s, mem: 5.5GB, 13/13 tests passed, (62/3.7s) 16.7 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 8.8s, mem: 14.4GB, 13/13 tests passed, (65/2.6s) 25.5 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 68.0s, mem: 22.3GB, 13/13 tests passed, (184/22.0s) 8.4 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A 
flash_attention_2" # test pass✅, time: 87.5s, mem: 67.4GB, 13/13 tests passed, (246/28.5s) 8.6 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 18.8s, mem: 12.6GB, 13/13 tests passed, (55/5.7s) 9.6 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2" # test pass✅, time: 17.4s, mem: 29.7GB, 13/13 tests passed, (55/5.3s) 10.3 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 21.4s, mem: 7.9GB, 13/13 tests passed, (88/6.6s) 13.4 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2" # test pass✅, time: 15.8s, mem: 16.7GB, 13/13 tests passed, (82/4.9s) 16.9 T/s -#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-0.5b-ov -A flash_attention_2" # test pass✅, time: 8.2s, mem: 23.6GB, 13/13 tests passed, (37/2.3s) 16.2 T/s -#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-7b-ov -A flash_attention_2" # test pass✅, time: 18.2s, mem: 36.8GB, 13/13 tests passed, (51/5.7s) 9.0 T/s -#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 27.1s, mem: 8.9GB, 13/13 tests passed, (150/9.1s) 16.5 T/s -#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2" # test pass✅, time: 25.8s, mem: 22.6GB, 13/13 tests passed, (138/6.9s) 20.0 T/s -#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-90B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 119.0s, mem: 51.0GB, 13/13 tests passed, (190/34.1s) 5.6 T/s -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.1s, mem: 1.1GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.6s, mem: 1.3GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.3s, mem: 1.6GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.6s, mem: 2.5GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 12.8s, mem: 7.5GB, 13/13 tests passed, (51/3.7s) 13.7 T/s -#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2" # test pass✅, time: 8.9s, mem: 12.4GB, 13/13 tests passed, (37/2.4s) 15.1 T/s -#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 8.7s, mem: 4.7GB, 13/13 tests passed, (37/2.5s) 14.9 T/s -#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 7.5s, mem: 9.6GB, 13/13 tests passed, (41/2.1s) 19.3 T/s -#CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 15.9s, mem: 35.9GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2 --load-in-4bit" # test pass✅, time: 10.5s, mem: 14.3GB, 13/13 tests passed, (37/3.1s) 11.9 T/s -#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2" # test pass✅, time: 9.4s, mem: 16.6GB, 13/13 tests passed, (37/2.8s) 13.2 T/s 
-#CLI_COMMAND="python vision.py -m nvidia/NVLM-D-72B -A flash_attention_2 --load-in-4bit" # test pass✅, time: 61.7s, mem: 56.8GB, 13/13 tests passed, (67/19.7s) 3.4 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 19.1s, mem: 9.6GB, 13/13 tests passed, (93/4.9s) 18.8 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 16.6s, mem: 9.8GB, 13/13 tests passed, (115/4.8s) 24.0 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 11.8s, mem: 19.2GB, 13/13 tests passed, (98/3.2s) 30.9 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 28.2s, mem: 9.3GB, 13/13 tests passed, (64/8.4s) 7.6 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 24.5s, mem: 19.3GB, 13/13 tests passed, (63/7.3s) 8.7 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.5s, mem: 8.2GB, 13/13 tests passed, (52/2.0s) 25.5 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.1s, mem: 8.6GB, 13/13 tests passed, (53/1.4s) 38.7 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.4s, mem: 8.2GB, 13/13 tests passed, (40/1.6s) 24.4 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 5.8s, mem: 8.6GB, 13/13 tests passed, (53/1.3s) 39.4 T/s -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 12.1s, mem: 6.8GB, 13/13 tests passed, (80/3.9s) 20.3 T/s -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.4s, mem: 16.8GB, 13/13 tests passed, (76/2.4s) 31.1 T/s -#CLI_COMMAND="python vision.py -m rhymes-ai/Aria -A flash_attention_2" # test pass✅, time: 71.8s, mem: 49.2GB, 13/13 tests passed, (210/22.8s) 9.2 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 10.9s, mem: 8.1GB, 13/13 tests passed, (43/3.1s) 13.8 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.1s, mem: 18.6GB, 13/13 tests passed, (51/2.6s) 19.8 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 12.9s, mem: 8.1GB, 13/13 tests passed, (61/3.8s) 16.0 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 12.3s, mem: 18.6GB, 13/13 tests passed, (95/3.7s) 25.9 T/s -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 5.9s, mem: 2.8GB, 13/13 tests passed, (63/1.7s) 37.9 T/s -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2" # test pass✅, time: 4.3s, mem: 4.6GB, 13/13 tests passed, (63/1.1s) 54.8 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 20.7s, mem: 28.2GB, 13/13 tests passed, (30/6.1s) 4.9 T/s +#CLI_COMMAND="python vision.py -m 
internlm/internlm-xcomposer2-4khd-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 23.4s, mem: 21.3GB, 13/13 tests passed, (38/6.3s) 6.0 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.2s, mem: 9.3GB, 13/13 tests passed, (58/3.1s) 18.4 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.1s, mem: 26.4GB, 13/13 tests passed, (59/2.6s) 23.1 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.3s, mem: 5.6GB, 13/13 tests passed, (62/2.6s) 23.9 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 7.8s, mem: 14.4GB, 13/13 tests passed, (65/2.2s) 29.9 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 54.0s, mem: 22.3GB, 13/13 tests passed, (184/17.3s) 10.6 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2" # test pass✅, time: 73.5s, mem: 67.1GB, 13/13 tests passed, (246/23.9s) 10.3 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 14.5s, mem: 12.6GB, 13/13 tests passed, (55/4.3s) 12.7 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2" # test pass✅, time: 14.2s, mem: 30.0GB, 13/13 tests passed, (55/4.3s) 12.9 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 15.2s, mem: 7.8GB, 13/13 tests passed, (88/4.5s) 19.5 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2" # test pass✅, time: 11.6s, mem: 16.3GB, 13/13 tests passed, (82/3.4s) 24.2 T/s +#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-0.5b-ov -A flash_attention_2" # test pass✅, time: 8.0s, mem: 9.0GB, 13/13 tests passed, (37/2.2s) 16.6 T/s +#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-7b-ov -A flash_attention_2" # test pass✅, time: 17.9s, mem: 22.2GB, 13/13 tests passed, (51/5.5s) 9.3 T/s +#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 37.9s, mem: 8.7GB, 13/13 tests passed, (123/7.1s) 17.4 T/s +#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2" # test pass✅, time: 31.2s, mem: 22.4GB, 13/13 tests passed, (103/5.4s) 18.9 T/s +#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-90B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 113.8s, mem: 50.9GB, 13/13 tests passed, (154/27.7s) 5.6 T/s +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 2.6s, mem: 1.0GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.2s, mem: 1.3GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 2.9s, mem: 1.5GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.3s, mem: 2.5GB, 13/13 tests passed 
+#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 12.0s, mem: 6.7GB, 13/13 tests passed, (51/3.4s) 14.8 T/s +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2" # test pass✅, time: 8.8s, mem: 11.6GB, 13/13 tests passed, (37/2.5s) 15.0 T/s +#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 8.3s, mem: 4.7GB, 13/13 tests passed, (37/2.3s) 16.1 T/s +#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 8.0s, mem: 9.5GB, 13/13 tests passed, (41/2.2s) 18.6 T/s +#CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 19.6s, mem: 35.8GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2 --load-in-4bit" # test pass✅, time: 7.9s, mem: 7.7GB, 13/13 tests passed, (37/2.2s) 16.8 T/s +#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2" # test pass✅, time: 7.3s, mem: 9.6GB, 13/13 tests passed, (37/2.0s) 18.3 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 15.2s, mem: 9.5GB, 13/13 tests passed, (99/4.8s) 20.8 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 13.9s, mem: 9.8GB, 13/13 tests passed, (89/3.7s) 23.9 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 13.0s, mem: 19.2GB, 13/13 tests passed, (121/3.8s) 31.7 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 27.4s, mem: 9.2GB, 13/13 tests passed, (62/8.1s) 7.7 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.1s, mem: 19.2GB, 13/13 tests passed, (60/7.4s) 8.1 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.6s, mem: 7.4GB, 13/13 tests passed, (49/1.8s) 27.0 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.2s, mem: 8.1GB, 13/13 tests passed, (58/1.5s) 38.1 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.2s, mem: 7.4GB, 13/13 tests passed, (49/1.8s) 27.6 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.9s, mem: 8.1GB, 13/13 tests passed, (61/1.6s) 38.1 T/s +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 11.9s, mem: 6.8GB, 13/13 tests passed, (80/3.5s) 22.8 T/s +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.2s, mem: 16.7GB, 13/13 tests passed, (77/2.5s) 30.4 T/s +#CLI_COMMAND="python vision.py -m rhymes-ai/Aria -A flash_attention_2" # test pass✅, time: 68.6s, mem: 48.6GB, 13/13 tests passed, (210/21.9s) 9.6 T/s +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 5.3s, mem: 2.7GB, 13/13 tests passed, (63/1.5s) 42.9 T/s +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A 
flash_attention_2" # test pass✅, time: 4.6s, mem: 4.6GB, 13/13 tests passed, (63/1.2s) 51.8 T/s diff --git a/vision_qna.py b/vision_qna.py index dda0936..b37ba19 100644 --- a/vision_qna.py +++ b/vision_qna.py @@ -823,11 +823,11 @@ def guess_model_format(model_name: str) -> str: model_id = model_name.lower() model_format_match_map = { - 'chatml': ['34b', 'yi-6b', 'nanollava', 'internvl-chat-v1-5', 'internvl-chat-2b', 'internvl2-', 'llava-onevision'], + 'chatml': ['34b', 'yi-6b', 'nanollava', 'internvl-chat-v1-5', 'internvl-chat-2b', 'internvl2-', 'llava-onevision', 'aquila'], 'falcon': ['falcon'], 'florence': ['florence'], 'fuyu': ['fuyu'], - 'gemma': ['gemma', '-2b'], + 'gemma': ['gemma'], 'glm4v': ['glm-4v'], 'llama2': ['bakllava', '8x7b', 'mistral', 'mixtral'], 'llama3': ['llama-3-vision', '360vl', 'llama3'], @@ -852,6 +852,9 @@ def guess_model_format(model_name: str) -> str: def guess_backend(model_name: str) -> str: model_id = model_name.lower() + if 'paligemma' in model_id: + return 'paligemma' + if 'llama-3.2' in model_id: # and vision return 'mllama' @@ -863,6 +866,8 @@ def guess_backend(model_name: str) -> str: return 'llavanext' elif 'onevision' in model_id: return 'llavanextgit' + elif 'aquila' in model_id: + return 'llavanextgit' return 'llava' if 'qwen2' in model_id: @@ -947,6 +952,9 @@ def guess_backend(model_name: str) -> str: if 'idefics2' in model_id: return 'idefics2' + + if 'smolvlm' in model_id: + return 'smolvlm' if 'llama-3-vision-alpha' in model_id: return 'llama3vision'