From c80e1f2202c56cb7c2cdc90b10f11d56c67368b4 Mon Sep 17 00:00:00 2001 From: matatonic Date: Thu, 10 Oct 2024 16:45:37 -0400 Subject: [PATCH] 0.39.0: +aria, docker changes --- .github/workflows/build-docker.yml | 17 +- Dockerfile | 12 +- README.md | 8 + backend/aria.py | 56 +++++++ docker-compose.alt.yml | 3 + docker-compose.yml | 3 + model_conf_tests.json | 1 + requirements.txt | 4 + test_api_model.py | 74 ++++----- vision.sample.env | 243 +++++++++++++++-------------- vision_qna.py | 60 +++---- 11 files changed, 273 insertions(+), 208 deletions(-) create mode 100644 backend/aria.py diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-docker.yml index 5879b24..8dc4a70 100644 --- a/.github/workflows/build-docker.yml +++ b/.github/workflows/build-docker.yml @@ -133,19 +133,6 @@ jobs: with: images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }} - # Build and push the Docker image to GHCR for the main branch or specific tags - - name: Build and Push Docker Image - if: github.ref == 'refs/heads/main' - uses: docker/build-push-action@v4 - with: - context: . - build-args: | - VERSION =alt - file: Dockerfile - push: true - tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest - labels: version=${{ github.run_id }} - # Build and push the Docker image to GHCR for the main branch or specific tags - name: Build and Push Docker Image (dev) if: github.ref == 'refs/heads/dev' @@ -167,6 +154,8 @@ jobs: VERSION =alt file: Dockerfile push: true - tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }} + tags: | + ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }} + ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest labels: version=${{ github.run_id }} diff --git a/Dockerfile b/Dockerfile index de613aa..6756db2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,9 +5,9 @@ RUN apt-get update && apt-get install -y git gcc \ RUN --mount=type=cache,target=/root/.cache/pip pip install --upgrade pip WORKDIR /app -RUN git clone https://github.com/TIGER-AI-Lab/Mantis.git --single-branch /app/Mantis -RUN git clone https://github.com/togethercomputer/Dragonfly --single-branch /app/Dragonfly -RUN git clone https://github.com/baaivision/Emu3 --single-branch /app/Emu3 +RUN git clone https://github.com/TIGER-AI-Lab/Mantis.git --single-branch /app/Mantis && \ + git clone https://github.com/togethercomputer/Dragonfly --single-branch /app/Dragonfly && \ + git clone https://github.com/baaivision/Emu3 --single-branch /app/Emu3 COPY requirements.txt . ARG VERSION=latest @@ -26,5 +26,11 @@ COPY *.py . COPY backend /app/backend COPY model_conf_tests.json . +ARG USER_ID +ARG GROUP_ID +RUN groupadd -g $GROUP_ID openedai && \ + useradd -r -u $USER_ID -g $GROUP_ID -M -d /app openedai + +USER openedai ENV CLI_COMMAND="python vision.py" CMD $CLI_COMMAND diff --git a/README.md b/README.md index 7fae2e5..dcd6fd3 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,7 @@ Can't decide which to use? 
See the [OpenVLM Leaderboard](https://huggingface.co/
 - - [ ] [InternVL-Chat-V1-5-AWQ](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-AWQ) (wont gpu split yet)
 - - [X] [Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5) (alternate docker only)
 - - [X] [Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)
+- [X] [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria)
 - [X] [Salesforce](https://huggingface.co/Salesforce)
 - - [X] [xgen-mm-phi3-mini-instruct-singleimage-r-v1.5](https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-singleimage-r-v1.5)
 - - [X] [xgen-mm-phi3-mini-instruct-interleave-r-v1](https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5)
@@ -159,6 +160,13 @@ If you can't find your favorite model, you can [open a new issue](https://github
 
 ## Recent updates
 
+Version 0.39.0
+
+- new model support: rhymes-ai/Aria
+- improved multi-image support in various models
+- docker package: the latest release is now tagged `:latest`, rather than the latest commit
+- ⚠️ docker: the container now runs as a non-root user. Your `hf_home` volume may need its ownership fixed; you can use this command: `sudo chown $(id -u):$(id -g) -R hf_home`
+
 Version 0.38.2
 
 - Fix: multi-image for ovis 1.6
 
diff --git a/backend/aria.py b/backend/aria.py
new file mode 100644
index 0000000..5933921
--- /dev/null
+++ b/backend/aria.py
@@ -0,0 +1,56 @@
+from transformers import AutoModelForCausalLM, AutoProcessor
+
+from vision_qna import *
+
+# rhymes-ai/Aria
+
+class VisionQnA(VisionQnABase):
+    model_name: str = "aria"
+    format: str = "chatml"
+    visual_layers: List[str] = ["vision_tower", "multi_modal_projector"]
+
+    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
+        super().__init__(model_id, device, device_map, extra_params, format)
+
+        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
+        self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()
+
+        self.eos_token = '<|im_end|>'
+
+        self.loaded_banner()
+
+    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
+        images, prompt = await chatml_prompt_from_messages(request.messages, img_tok = "<fim_prefix><|img|><fim_suffix>")
+
+        prompt = prompt.replace("<fim_suffix><fim_prefix>", "\n")#.replace('<|im_end|>', '<|im_end|>\n')
+
+        if len(images) < 1:
+            prompt = "<fim_prefix><|img|><fim_suffix>" + prompt
+            images = [await url_to_image(transparent_pixel_url)]
+
+        inputs = self.processor(images=images, text=prompt, return_tensors="pt")
+        inputs["pixel_values"] = inputs["pixel_values"].to(self.model.dtype)
+        inputs = inputs.to(self.model.device)
+
+        default_params = {
+            'max_new_tokens': 500,
+            'do_sample': False,
+#            'temperature': 0.9, # random test failures, ex. 
OCR + 'stop_strings': [self.eos_token], + } + + params = self.get_generation_params(request, default_params=default_params) + + generation_kwargs = dict( + tokenizer=self.processor.tokenizer, + **inputs, + **params, + ) + + for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs): + end = new_text.find(self.eos_token) + if end == -1: + yield new_text + else: + yield new_text[:end] + break diff --git a/docker-compose.alt.yml b/docker-compose.alt.yml index 689cfd5..cd9d80a 100644 --- a/docker-compose.alt.yml +++ b/docker-compose.alt.yml @@ -3,7 +3,10 @@ services: build: args: - VERSION=alt + - USER_ID=${UID:-1000} + - GROUP_ID=${GID:-1000} dockerfile: Dockerfile + user: ${UID:-1000}:${GID:-1000} container_name: openedai-vision-alt image: ghcr.io/matatonic/openedai-vision-alt env_file: vision-alt.env # your settings go here diff --git a/docker-compose.yml b/docker-compose.yml index 1e47538..3c5fd68 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,10 @@ services: build: args: - VERSION=latest + - USER_ID=${UID:-1000} + - GROUP_ID=${GID:-1000} dockerfile: Dockerfile + user: ${UID:-1000}:${GID:-1000} container_name: openedai-vision image: ghcr.io/matatonic/openedai-vision env_file: vision.env # your settings go here diff --git a/model_conf_tests.json b/model_conf_tests.json index d3f8d18..f5d9b4a 100644 --- a/model_conf_tests.json +++ b/model_conf_tests.json @@ -114,6 +114,7 @@ ["qnguyen3/nanoLLaVA-1.5", "-A", "flash_attention_2", "--device-map", "cuda:0"], ["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0", "--load-in-4bit"], ["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0"], + ["rhymes-ai/Aria", "-A", "flash_attention_2"], ["togethercomputer/Llama-3-8B-Dragonfly-Med-v1", "--load-in-4bit"], ["togethercomputer/Llama-3-8B-Dragonfly-Med-v1"], ["togethercomputer/Llama-3-8B-Dragonfly-v1", "--load-in-4bit"], diff --git a/requirements.txt b/requirements.txt index 7cce63c..3473940 100644 --- a/requirements.txt +++ b/requirements.txt @@ -60,3 +60,7 @@ mistral_common[opencv] # got-ocr2 verovio + +# Aria. 
needs a lengthy build and won't work without many extra packages
+# BYOB, use it if you need it
+#grouped_gemm
diff --git a/test_api_model.py b/test_api_model.py
index 44812d6..a8f9c3e 100755
--- a/test_api_model.py
+++ b/test_api_model.py
@@ -89,13 +89,16 @@ def record_result(cmd_args, results, t, mem, note):
     params['top_p'] = args.top_p
 
    def generate_response(image_url, prompt):
        messages = [{ "role": "system", "content": [{ 'type': 'text', 'text': args.system_prompt }] }] if args.system_prompt else []
-        messages.extend([
-            { "role": "user", "content": [
-                { "type": "image_url", "image_url": { "url": image_url } },
-                { "type": "text", "text": prompt },
-            ]}])
+
+        if isinstance(image_url, str):
+            image_url = [image_url]
+
+        content = []
+        for url in image_url:
+            content.extend([{ "type": "image_url", "image_url": { "url": url } }])
+        content.extend([{ "type": "text", "text": prompt }])
+        messages.extend([{ "role": "user", "content": content }])
 
        response = client.chat.completions.create(model=args.openai_model, messages=messages, **params)
        completion_tokens = 0
@@ -106,11 +109,15 @@
 
    def generate_stream_response(image_url, prompt):
        messages = [{ "role": "system", "content": [{ 'type': 'text', 'text': args.system_prompt }] }] if args.system_prompt else []
-        messages.extend([
-            { "role": "user", "content": [
-                { "type": "image_url", "image_url": { "url": image_url } },
-                { "type": "text", "text": prompt },
-            ]}])
+
+        if isinstance(image_url, str):
+            image_url = [image_url]
+
+        content = []
+        for url in image_url:
+            content.extend([{ "type": "image_url", "image_url": { "url": url } }])
+        content.extend([{ "type": "text", "text": prompt }])
+        messages.extend([{ "role": "user", "content": content }])
 
        response = client.chat.completions.create(model=args.openai_model, messages=messages, **params, stream=True)
        answer = ''
@@ -129,18 +136,18 @@
    ### Single round
    timing = []
 
-    def single_test(url, question, label, generator=generate_response):
+    def single_test(url, question, right_answer, label, generator=generate_response):
        tps_time = time.time()
        answer, tok = generator(url, question)
        tps_time = time.time() - tps_time
-        correct = name in answer.lower()
+        correct = right_answer in answer.lower()
        results.extend([correct])
        if not correct:
-            print(f"{name}[{label}]: fail, got: {answer}")
+            print(f"{right_answer}[{label}]: {red_fail}, got: {answer}")
            #if args.abort_on_fail:
            #    break
        else:
-            print(f"{name}[{label}]: pass{', got: ' + answer if args.verbose else ''}")
+            print(f"{right_answer}[{label}]: {green_pass}{', got: ' + answer if args.verbose else ''}")
        if tok > 1:
            timing.extend([(tok, tps_time)])
 
@@ -148,11 +155,11 @@
    # url tests
    for name, url in urls.items():
-        single_test(url, "What is the subject of the image?", "url", generate_response)
+        single_test(url, "What is the subject of the image?", name, "url", generate_response)
 
        data_url = data_url_from_url(url)
-        single_test(data_url, "What is the subject of the image?", "data", generate_response)
-        single_test(data_url, "What is the subject of the image?", "data_stream", generate_stream_response)
+        single_test(data_url, "What is the subject of the image?", name, "data", generate_response)
+        single_test(data_url, "What is the subject of the image?", name, "data_stream", generate_stream_response)
 
 
    ## OCR tests
@@ -162,31 +169,26 @@ def single_test(url, question, label, 
generator=generate_response): } for name, question in quality_urls.items(): prompt, data_url = question - single_test(data_url, prompt, "quality", generate_stream_response) + single_test(data_url, prompt, name, "quality", generate_stream_response) # No image tests no_image = { - '5': 'In the sequence of numbers: 1, 2, 3, 4, ... What number comes next after 4?' + '5': 'In the sequence of numbers: 1, 2, 3, 4, ... What number comes next after 4? Answer only the number.' } - def no_image_response(prompt): - messages = [{ "role": "system", "content": [{ 'type': 'text', 'text': args.system_prompt }] }] if args.system_prompt else [] - messages.extend([{ "role": "user", "content": prompt }]) + for name, prompt in no_image.items(): + single_test([], prompt, name, 'no_img', generate_response) - response = client.chat.completions.create(model=args.openai_model, messages=messages, **params, max_tokens=5) - answer = response.choices[0].message.content - return answer + # Multi-image test + multi_image = { + "water": ("What natural element is common in both images?", + [ 'https://images.freeimages.com/images/large-previews/e59/autumn-tree-1408307.jpg', + 'https://images.freeimages.com/images/large-previews/242/waterfall-1537490.jpg']) + } - for name, prompt in no_image.items(): - answer = no_image_response(prompt) - correct = True #name in answer.lower() # - no exceptions is enough. - results.extend([correct]) - if not correct: - print(f"{name}[no_img]: fail, got: {answer}") - if args.abort_on_fail: - break - else: - print(f"{name}[no_img]: pass{', got: ' + answer if args.verbose else ''}") + for name, question in multi_image.items(): + prompt, data_url = question + single_test(data_url, prompt, name, "multi-image", generate_stream_response) test_time = time.time() - test_time diff --git a/vision.sample.env b/vision.sample.env index 4324af5..208cc1e 100644 --- a/vision.sample.env +++ b/vision.sample.env @@ -6,124 +6,125 @@ HF_HUB_ENABLE_HF_TRANSFER=1 #CUDA_VISIBLE_DEVICES=1,0 #OPENEDAI_DEVICE_MAP="sequential" -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.6-Gemma2-9B -A flash_attention_2" # test pass✅, time: 32.8s, mem: 22.8GB, 13/13 tests passed, (133/10.4s) 12.8 T/s -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Gemma2-9B -A flash_attention_2" # test pass✅, time: 9.4s, mem: 23.2GB, 13/13 tests passed, (32/2.6s) 12.4 T/s -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Llama3-8B -A flash_attention_2" # test pass✅, time: 5.9s, mem: 19.2GB, 13/13 tests passed, (32/1.4s) 22.1 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.5s, mem: 9.5GB, 13/13 tests passed, (39/1.6s) 24.4 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.5s, mem: 10.8GB, 13/13 tests passed, (38/1.1s) 34.8 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 10.7s, mem: 8.5GB, 13/13 tests passed, (59/2.8s) 21.1 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 9.1s, mem: 12.0GB, 13/13 tests passed, (70/2.2s) 32.0 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.9s, mem: 12.7GB, 13/13 tests passed, (37/1.8s) 20.9 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 9.2s, mem: 5.1GB, 13/13 tests passed, (48/2.6s) 18.7 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 7.3s, mem: 12.2GB, 13/13 tests passed, (44/1.8s) 23.9 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B 
--load-in-4bit" # test pass✅, time: 10.7s, mem: 5.8GB, 13/13 tests passed, (44/3.1s) 14.3 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.4s, mem: 13.0GB, 13/13 tests passed, (35/2.3s) 15.4 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 32.2s, mem: 29.2GB, 13/13 tests passed, (79/9.7s) 8.1 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 20.8s, mem: 71.8GB, 13/13 tests passed, (71/5.8s) 12.2 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat --load-in-4bit -A flash_attention_2" # test pass✅, time: 63.4s, mem: 65.8GB, 13/13 tests passed, (137/20.5s) 6.7 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat -A flash_attention_2" # test pass✅, time: 67.8s, mem: 76.2GB, 13/13 tests passed, (159/21.8s) 7.3 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 23.7s, mem: 27.6GB, 13/13 tests passed, (60/7.3s) 8.2 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 30.1s, mem: 31.0GB, 13/13 tests passed, (58/9.4s) 6.1 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 25.5s, mem: 55.9GB, 13/13 tests passed, (45/7.9s) 5.7 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 19.1s, mem: 52.7GB, 13/13 tests passed, (50/5.9s) 8.5 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 31.8s, mem: 2.0GB, 13/13 tests passed, (271/10.2s) 26.5 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 7.6s, mem: 2.7GB, 13/13 tests passed, (77/2.2s) 34.5 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 17.4s, mem: 5.5GB, 13/13 tests passed, (156/5.4s) 28.8 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.3s, mem: 7.4GB, 13/13 tests passed, (90/2.4s) 36.7 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.1s, mem: 9.3GB, 13/13 tests passed, (43/2.9s) 14.8 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 19.1GB, 13/13 tests passed, (43/2.2s) 19.2 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 26.7s, mem: 27.7GB, 13/13 tests passed, (75/8.3s) 9.0 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 20.0s, mem: 52.8GB, 13/13 tests passed, (59/6.3s) 9.4 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 39.0s, mem: 31.7GB, 13/13 tests passed, (82/12.3s) 6.7 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 47.5s, mem: 76.8GB, 13/13 tests passed, (140/15.3s) 9.1 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 40.8s, mem: 51.5GB, 13/13 tests passed, (40/12.8s) 3.1 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.9s, mem: 6.0GB, 13/13 tests passed, (42/1.8s) 
23.0 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 7.8s, mem: 8.2GB, 13/13 tests passed, (42/2.1s) 19.8 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.8s, mem: 10.1GB, 13/13 tests passed, (48/1.8s) 26.5 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 6.1s, mem: 7.8GB, 13/13 tests passed, (48/1.6s) 30.7 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 10.2s, mem: 11.3GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.9s, mem: 19.6GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 12.9s, mem: 7.0GB, 13/13 tests passed, (44/3.8s) 11.6 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct -A flash_attention_2" # test pass✅, time: 18.5s, mem: 16.4GB, 13/13 tests passed, (36/5.6s) 6.4 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 22.5s, mem: 18.5GB, 13/13 tests passed, (36/6.7s) 5.4 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct -A flash_attention_2" # test pass✅, time: 16.6s, mem: 27.5GB, 13/13 tests passed, (31/4.9s) 6.4 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-72B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 36.1s, mem: 45.2GB, 13/13 tests passed, (31/11.4s) 2.7 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" # test pass✅, time: 9.0s, mem: 9.3GB, 13/13 tests passed, (68/2.4s) 28.2 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" # test pass✅, time: 3.8s, mem: 9.3GB, 13/13 tests passed, (10/0.8s) 13.1 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5" # test pass✅, time: 9.1s, mem: 9.3GB, 13/13 tests passed, (73/2.4s) 29.9 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 9.5s, mem: 9.8GB, 13/13 tests passed, (74/2.6s) 28.0 T/s -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.7s, mem: 11.4GB, 13/13 tests passed, (14/2.8s) 5.0 T/s -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.9s, mem: 20.7GB, 13/13 tests passed, (22/2.7s) 8.3 T/s -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 23.2s, mem: 15.8GB, 13/13 tests passed, (92/7.4s) 12.4 T/s -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 17.3s, mem: 26.0GB, 13/13 tests passed, (79/5.3s) 14.8 T/s -#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 124.4s, mem: 6.1GB, 13/13 tests passed, (104/40.4s) 2.6 T/s -#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 95.7s, mem: 6.5GB, 13/13 tests passed, (84/31.6s) 2.7 T/s -#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2" # test pass✅, time: 26.6s, mem: 15.4GB, 13/13 tests passed, (40/8.4s) 4.8 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 49.6s, mem: 8.1GB, 13/13 tests 
passed, (318/16.0s) 19.9 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 40.6s, mem: 8.4GB, 13/13 tests passed, (310/13.1s) 23.6 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2" # test pass✅, time: 31.9s, mem: 18.3GB, 13/13 tests passed, (302/10.3s) 29.5 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 43.7s, mem: 8.7GB, 13/13 tests passed, (214/14.0s) 15.3 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 36.8s, mem: 8.9GB, 13/13 tests passed, (214/11.8s) 18.1 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2" # test pass✅, time: 29.7s, mem: 18.6GB, 13/13 tests passed, (208/9.5s) 21.9 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 130.7s, mem: 43.4GB, 13/13 tests passed, (285/42.8s) 6.7 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 111.3s, mem: 48.2GB, 13/13 tests passed, (271/36.1s) 7.5 T/s -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 15.4s, mem: 15.8GB, 13/13 tests passed, (49/4.5s) 10.8 T/s -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.7s, mem: 21.8GB, 13/13 tests passed, (32/3.4s) 9.4 T/s -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2 --load-in-4bit" # test pass✅, time: 11.1s, mem: 7.5GB, 13/13 tests passed, (37/3.1s) 11.9 T/s -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2" # test pass✅, time: 9.4s, mem: 12.5GB, 13/13 tests passed, (37/2.6s) 14.2 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two --load-in-4bit -A flash_attention_2" # test pass✅, time: 49.3s, mem: 9.3GB, 13/13 tests passed, (209/18.6s) 11.2 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two -A flash_attention_2" # test pass✅, time: 38.1s, mem: 18.8GB, 13/13 tests passed, (166/9.3s) 17.9 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha --load-in-4bit -A flash_attention_2" # test pass✅, time: 116.5s, mem: 8.4GB, 13/13 tests passed, (658/37.8s) 17.4 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha -A flash_attention_2" # test pass✅, time: 63.1s, mem: 18.1GB, 13/13 tests passed, (644/19.2s) 33.5 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.1s, mem: 9.0GB, 1/13 tests passed -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.9s, mem: 30.0GB, 13/13 tests passed, (27/5.9s) 4.6 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 23.7s, mem: 22.1GB, 13/13 tests passed, (40/6.3s) 6.3 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 26.2s, mem: 18.4GB, 13/13 tests passed, (60/8.1s) 7.4 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit -A flash_attention_2" # test pass✅, time: 64.5s, mem: 8.8GB, 13/13 tests 
passed, (70/19.0s) 3.7 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 7.8s, mem: 7.7GB, 13/13 tests passed, (17/1.8s) 9.5 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 26.6s, mem: 20.8GB, 13/13 tests passed, (61/8.7s) 7.0 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit -A flash_attention_2" # test pass✅, time: 69.0s, mem: 10.6GB, 13/13 tests passed, (85/23.0s) 3.7 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.2s, mem: 9.3GB, 13/13 tests passed, (58/4.5s) 13.0 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 10.8s, mem: 26.4GB, 13/13 tests passed, (59/3.0s) 19.3 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.9s, mem: 5.4GB, 13/13 tests passed, (62/3.8s) 16.5 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.2s, mem: 14.2GB, 13/13 tests passed, (65/2.8s) 23.6 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 70.6s, mem: 22.1GB, 13/13 tests passed, (184/23.0s) 8.0 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2" # test pass✅, time: 88.1s, mem: 67.2GB, 13/13 tests passed, (246/28.7s) 8.6 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 19.7s, mem: 12.3GB, 13/13 tests passed, (55/6.0s) 9.2 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2" # test pass✅, time: 17.8s, mem: 29.4GB, 13/13 tests passed, (55/5.4s) 10.3 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 22.2s, mem: 7.7GB, 13/13 tests passed, (88/6.9s) 12.7 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2" # test pass✅, time: 16.4s, mem: 16.5GB, 13/13 tests passed, (82/4.9s) 16.7 T/s -#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-0.5b-ov -A flash_attention_2" # test pass✅, time: 8.8s, mem: 23.4GB, 13/13 tests passed, (37/2.4s) 15.7 T/s -#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-7b-ov -A flash_attention_2" # test pass✅, time: 18.8s, mem: 36.5GB, 13/13 tests passed, (51/5.7s) 8.9 T/s -#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 38.0s, mem: 8.7GB, 13/13 tests passed, (181/11.3s) 16.0 T/s -#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2" # test pass✅, time: 34.7s, mem: 22.4GB, 13/13 tests passed, (129/6.8s) 18.9 T/s -#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-90B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 123.9s, mem: 50.9GB, 13/13 tests passed, (347/59.3s) 5.9 T/s -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.3s, mem: 1.0GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A 
flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.4s, mem: 1.3GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.6s, mem: 1.5GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.5s, mem: 2.4GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 13.0s, mem: 7.4GB, 13/13 tests passed, (51/3.8s) 13.5 T/s -#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2" # test pass✅, time: 9.4s, mem: 12.4GB, 13/13 tests passed, (37/2.6s) 14.3 T/s -#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 9.4s, mem: 4.7GB, 13/13 tests passed, (37/2.6s) 14.2 T/s -#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 8.1s, mem: 9.6GB, 13/13 tests passed, (41/2.2s) 18.6 T/s -#CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 16.8s, mem: 35.8GB, 13/13 tests passed (manual calc) 12.7 T/s -#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2 --load-in-4bit" # test pass✅, time: 11.2s, mem: 14.3GB, 13/13 tests passed, (37/3.2s) 11.5 T/s -#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2" # test pass✅, time: 10.1s, mem: 16.6GB, 13/13 tests passed, (37/2.9s) 12.9 T/s -#CLI_COMMAND="python vision.py -m nvidia/NVLM-D-72B -A flash_attention_2 --load-in-4bit" # test pass✅, time: 63.4s, mem: 56.8GB, 13/13 tests passed, (67/20.3s) 3.3 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 19.8s, mem: 9.5GB, 13/13 tests passed, (96/5.4s) 17.8 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.6s, mem: 9.8GB, 13/13 tests passed, (104/4.7s) 22.1 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 13.2s, mem: 19.2GB, 13/13 tests passed, (97/3.4s) 28.8 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 30.6s, mem: 9.3GB, 13/13 tests passed, (80/9.4s) 8.5 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 27.6s, mem: 19.3GB, 13/13 tests passed, (75/8.6s) 8.7 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.0s, mem: 8.2GB, 13/13 tests passed, (69/2.7s) 25.7 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.7s, mem: 8.6GB, 13/13 tests passed, (52/1.4s) 36.0 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.6s, mem: 8.2GB, 13/13 tests passed, (37/1.7s) 22.1 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 7.2s, mem: 8.7GB, 13/13 tests passed, (63/1.8s) 34.5 T/s -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # 
test pass✅, time: 13.5s, mem: 6.8GB, 13/13 tests passed, (81/4.2s) 19.4 T/s -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.6s, mem: 16.7GB, 13/13 tests passed, (69/2.3s) 29.6 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 11.4s, mem: 8.1GB, 13/13 tests passed, (43/3.2s) 13.4 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.5s, mem: 18.6GB, 13/13 tests passed, (51/2.6s) 19.8 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 13.8s, mem: 8.1GB, 13/13 tests passed, (61/4.1s) 15.0 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 12.9s, mem: 18.5GB, 13/13 tests passed, (95/3.8s) 24.7 T/s -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 6.0s, mem: 2.7GB, 13/13 tests passed, (63/1.7s) 36.4 T/s -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2" # test pass✅, time: 4.8s, mem: 4.6GB, 13/13 tests passed, (63/1.2s) 51.2 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.6-Gemma2-9B -A flash_attention_2" # test pass✅, time: 32.3s, mem: 22.8GB, 13/13 tests passed, (133/10.2s) 13.1 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Gemma2-9B -A flash_attention_2" # test pass✅, time: 9.1s, mem: 23.2GB, 13/13 tests passed, (32/2.6s) 12.5 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Llama3-8B -A flash_attention_2" # test pass✅, time: 5.5s, mem: 19.2GB, 13/13 tests passed, (32/1.4s) 22.8 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.0s, mem: 9.5GB, 13/13 tests passed, (39/1.5s) 25.3 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.9s, mem: 10.8GB, 13/13 tests passed, (38/1.1s) 36.1 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 10.1s, mem: 8.5GB, 13/13 tests passed, (59/2.7s) 21.8 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 8.4s, mem: 12.0GB, 13/13 tests passed, (70/2.2s) 32.5 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.4s, mem: 12.7GB, 13/13 tests passed, (37/1.7s) 21.6 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 8.7s, mem: 5.1GB, 13/13 tests passed, (48/2.5s) 19.4 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.4s, mem: 12.2GB, 13/13 tests passed, (44/1.8s) 24.5 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 10.2s, mem: 5.8GB, 13/13 tests passed, (44/2.9s) 15.3 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 7.6s, mem: 13.0GB, 13/13 tests passed, (35/2.2s) 16.0 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 32.2s, mem: 29.4GB, 13/13 tests passed, (83/10.3s) 8.1 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 21.6s, mem: 71.9GB, 13/13 tests passed, (103/8.3s) 12.4 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat --load-in-4bit -A flash_attention_2" # test pass✅, time: 63.3s, mem: 65.8GB, 13/13 tests passed, (137/20.4s) 6.7 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat -A flash_attention_2" # test pass✅, time: 67.3s, mem: 
76.1GB, 13/13 tests passed, (159/21.8s) 7.3 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 22.7s, mem: 27.6GB, 13/13 tests passed, (60/7.1s) 8.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 29.3s, mem: 30.9GB, 13/13 tests passed, (58/9.3s) 6.2 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 25.0s, mem: 55.8GB, 13/13 tests passed, (45/7.9s) 5.7 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 18.6s, mem: 52.7GB, 13/13 tests passed, (50/5.8s) 8.7 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 30.1s, mem: 2.0GB, 13/13 tests passed, (271/9.8s) 27.5 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 7.6s, mem: 2.7GB, 13/13 tests passed, (77/2.1s) 36.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 16.4s, mem: 5.5GB, 13/13 tests passed, (156/5.1s) 30.6 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.1s, mem: 7.3GB, 13/13 tests passed, (90/2.4s) 37.1 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.5s, mem: 9.2GB, 13/13 tests passed, (43/2.7s) 15.9 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 19.0GB, 13/13 tests passed, (43/2.2s) 19.6 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 25.1s, mem: 27.6GB, 13/13 tests passed, (75/8.0s) 9.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 19.9s, mem: 52.8GB, 13/13 tests passed, (59/6.3s) 9.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 37.9s, mem: 31.6GB, 13/13 tests passed, (82/11.9s) 6.9 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 47.0s, mem: 76.6GB, 13/13 tests passed, (140/15.1s) 9.3 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 40.1s, mem: 51.4GB, 13/13 tests passed, (40/12.6s) 3.2 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.2s, mem: 5.8GB, 13/13 tests passed, (42/1.7s) 24.3 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 7.1s, mem: 8.1GB, 13/13 tests passed, (42/2.0s) 20.8 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.3s, mem: 10.0GB, 13/13 tests passed, (48/1.8s) 27.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.5s, mem: 7.7GB, 13/13 tests passed, (48/1.5s) 32.9 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.1s, mem: 11.2GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.6s, mem: 19.6GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m 
Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 12.5s, mem: 6.9GB, 13/13 tests passed, (44/3.9s) 11.4 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct -A flash_attention_2" # test pass✅, time: 19.0s, mem: 16.4GB, 13/13 tests passed, (36/5.6s) 6.4 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 22.9s, mem: 18.6GB, 13/13 tests passed, (36/6.9s) 5.2 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct -A flash_attention_2" # test pass✅, time: 19.6s, mem: 27.6GB, 13/13 tests passed, (31/6.1s) 5.1 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-72B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 35.4s, mem: 45.3GB, 13/13 tests passed, (31/11.1s) 2.8 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" # test pass✅, time: 8.1s, mem: 9.4GB, 13/13 tests passed, (68/2.3s) 29.4 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" # test pass✅, time: 3.4s, mem: 9.4GB, 13/13 tests passed, (10/0.7s) 13.5 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5" # test pass✅, time: 8.8s, mem: 9.4GB, 13/13 tests passed, (73/2.4s) 30.3 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 8.8s, mem: 9.9GB, 13/13 tests passed, (74/2.5s) 30.1 T/s +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.1s, mem: 11.6GB, 13/13 tests passed, (14/2.6s) 5.3 T/s +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.6s, mem: 20.8GB, 13/13 tests passed, (22/2.5s) 8.8 T/s +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.6s, mem: 15.9GB, 13/13 tests passed, (92/6.9s) 13.4 T/s +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 16.6s, mem: 26.0GB, 13/13 tests passed, (79/5.1s) 15.6 T/s +#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 123.4s, mem: 6.1GB, 13/13 tests passed, (104/40.5s) 2.6 T/s +#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 91.0s, mem: 6.4GB, 13/13 tests passed, (84/29.6s) 2.8 T/s +#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2" # test pass✅, time: 26.0s, mem: 15.3GB, 13/13 tests passed, (40/8.2s) 4.9 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 47.3s, mem: 7.9GB, 13/13 tests passed, (318/15.4s) 20.6 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 39.0s, mem: 8.3GB, 13/13 tests passed, (310/12.4s) 24.9 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2" # test pass✅, time: 30.4s, mem: 18.2GB, 13/13 tests passed, (302/9.6s) 31.5 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 39.8s, mem: 8.6GB, 13/13 tests passed, (214/12.7s) 16.9 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 36.4s, mem: 8.9GB, 13/13 tests passed, (214/11.8s) 18.2 T/s +#CLI_COMMAND="python vision.py -m 
allenai/Molmo-7B-O-0924 -A flash_attention_2" # test pass✅, time: 28.6s, mem: 18.6GB, 13/13 tests passed, (208/9.4s) 22.1 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 127.7s, mem: 43.3GB, 13/13 tests passed, (285/41.8s) 6.8 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 110.3s, mem: 48.3GB, 13/13 tests passed, (271/35.8s) 7.6 T/s +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 14.7s, mem: 15.8GB, 13/13 tests passed, (49/4.4s) 11.0 T/s +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.2s, mem: 21.8GB, 13/13 tests passed, (32/3.4s) 9.5 T/s +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2 --load-in-4bit" # test pass✅, time: 10.1s, mem: 7.5GB, 13/13 tests passed, (37/2.9s) 12.6 T/s +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2" # test pass✅, time: 8.6s, mem: 12.5GB, 13/13 tests passed, (37/2.5s) 14.9 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two --load-in-4bit -A flash_attention_2" # test pass✅, time: 58.7s, mem: 9.2GB, 13/13 tests passed, (192/15.8s) 12.1 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two -A flash_attention_2" # test pass✅, time: 26.0s, mem: 18.8GB, 13/13 tests passed, (131/7.3s) 18.1 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha --load-in-4bit -A flash_attention_2" # test pass✅, time: 114.8s, mem: 8.5GB, 13/13 tests passed, (703/39.3s) 17.9 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha -A flash_attention_2" # test pass✅, time: 58.2s, mem: 18.1GB, 13/13 tests passed, (692/19.1s) 36.2 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.8s, mem: 8.9GB, 1/13 tests passed +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 36.1s, mem: 30.0GB, 13/13 tests passed, (97/13.0s) 7.5 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 20.6s, mem: 22.1GB, 13/13 tests passed, (38/6.0s) 6.3 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 30.6s, mem: 18.3GB, 13/13 tests passed, (76/9.4s) 8.0 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit -A flash_attention_2" # test pass✅, time: 56.0s, mem: 8.8GB, 13/13 tests passed, (83/21.6s) 3.8 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.0s, mem: 7.7GB, 13/13 tests passed, (34/2.5s) 13.7 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 27.8s, mem: 20.7GB, 13/13 tests passed, (60/8.6s) 7.0 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit -A flash_attention_2" # test pass✅, time: 48.0s, mem: 10.5GB, 13/13 tests passed, (62/17.8s) 3.5 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.0s, mem: 9.3GB, 13/13 tests passed, (58/4.5s) 
12.9 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 10.7s, mem: 26.5GB, 13/13 tests passed, (59/3.1s) 19.3 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.7s, mem: 5.5GB, 13/13 tests passed, (62/3.7s) 16.7 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 8.8s, mem: 14.4GB, 13/13 tests passed, (65/2.6s) 25.5 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 68.0s, mem: 22.3GB, 13/13 tests passed, (184/22.0s) 8.4 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2" # test pass✅, time: 87.5s, mem: 67.4GB, 13/13 tests passed, (246/28.5s) 8.6 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 18.8s, mem: 12.6GB, 13/13 tests passed, (55/5.7s) 9.6 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2" # test pass✅, time: 17.4s, mem: 29.7GB, 13/13 tests passed, (55/5.3s) 10.3 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 21.4s, mem: 7.9GB, 13/13 tests passed, (88/6.6s) 13.4 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2" # test pass✅, time: 15.8s, mem: 16.7GB, 13/13 tests passed, (82/4.9s) 16.9 T/s +#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-0.5b-ov -A flash_attention_2" # test pass✅, time: 8.2s, mem: 23.6GB, 13/13 tests passed, (37/2.3s) 16.2 T/s +#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-7b-ov -A flash_attention_2" # test pass✅, time: 18.2s, mem: 36.8GB, 13/13 tests passed, (51/5.7s) 9.0 T/s +#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 27.1s, mem: 8.9GB, 13/13 tests passed, (150/9.1s) 16.5 T/s +#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2" # test pass✅, time: 25.8s, mem: 22.6GB, 13/13 tests passed, (138/6.9s) 20.0 T/s +#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-90B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 119.0s, mem: 51.0GB, 13/13 tests passed, (190/34.1s) 5.6 T/s +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.1s, mem: 1.1GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.6s, mem: 1.3GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.3s, mem: 1.6GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.6s, mem: 2.5GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 12.8s, mem: 7.5GB, 13/13 tests passed, (51/3.7s) 13.7 T/s +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2" # test pass✅, time: 8.9s, mem: 12.4GB, 13/13 tests passed, 
(37/2.4s) 15.1 T/s +#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 8.7s, mem: 4.7GB, 13/13 tests passed, (37/2.5s) 14.9 T/s +#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 7.5s, mem: 9.6GB, 13/13 tests passed, (41/2.1s) 19.3 T/s +#CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 15.9s, mem: 35.9GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2 --load-in-4bit" # test pass✅, time: 10.5s, mem: 14.3GB, 13/13 tests passed, (37/3.1s) 11.9 T/s +#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2" # test pass✅, time: 9.4s, mem: 16.6GB, 13/13 tests passed, (37/2.8s) 13.2 T/s +#CLI_COMMAND="python vision.py -m nvidia/NVLM-D-72B -A flash_attention_2 --load-in-4bit" # test pass✅, time: 61.7s, mem: 56.8GB, 13/13 tests passed, (67/19.7s) 3.4 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 19.1s, mem: 9.6GB, 13/13 tests passed, (93/4.9s) 18.8 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 16.6s, mem: 9.8GB, 13/13 tests passed, (115/4.8s) 24.0 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 11.8s, mem: 19.2GB, 13/13 tests passed, (98/3.2s) 30.9 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 28.2s, mem: 9.3GB, 13/13 tests passed, (64/8.4s) 7.6 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 24.5s, mem: 19.3GB, 13/13 tests passed, (63/7.3s) 8.7 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.5s, mem: 8.2GB, 13/13 tests passed, (52/2.0s) 25.5 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.1s, mem: 8.6GB, 13/13 tests passed, (53/1.4s) 38.7 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.4s, mem: 8.2GB, 13/13 tests passed, (40/1.6s) 24.4 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 5.8s, mem: 8.6GB, 13/13 tests passed, (53/1.3s) 39.4 T/s +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 12.1s, mem: 6.8GB, 13/13 tests passed, (80/3.9s) 20.3 T/s +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.4s, mem: 16.8GB, 13/13 tests passed, (76/2.4s) 31.1 T/s +#CLI_COMMAND="python vision.py -m rhymes-ai/Aria -A flash_attention_2" # test pass✅, time: 71.8s, mem: 49.2GB, 13/13 tests passed, (210/22.8s) 9.2 T/s +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 10.9s, mem: 8.1GB, 13/13 tests passed, (43/3.1s) 13.8 T/s +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.1s, mem: 18.6GB, 13/13 tests passed, (51/2.6s) 19.8 T/s +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 
--load-in-4bit" # test pass✅, time: 12.9s, mem: 8.1GB, 13/13 tests passed, (61/3.8s) 16.0 T/s +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 12.3s, mem: 18.6GB, 13/13 tests passed, (95/3.7s) 25.9 T/s +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 5.9s, mem: 2.8GB, 13/13 tests passed, (63/1.7s) 37.9 T/s +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2" # test pass✅, time: 4.3s, mem: 4.6GB, 13/13 tests passed, (63/1.1s) 54.8 T/s diff --git a/vision_qna.py b/vision_qna.py index af1ab34..dda0936 100644 --- a/vision_qna.py +++ b/vision_qna.py @@ -302,16 +302,15 @@ async def vicuna0_prompt_from_messages(messages: list[Message], img_tok = "user\n{img_tag}{text}<|im_end|>" elif m.role == 'assistant': for c in m.content: @@ -467,16 +461,15 @@ async def gemma_prompt_from_messages(messages: list[Message], img_tok = " for m in messages: if m.role == 'user': text = '' - has_image = False + img_tag = '' for c in m.content: if c.type == 'image_url': images.extend([ await url_to_image(c.image_url.url) ]) - has_image = True + img_tag += img_tok if c.type == 'text': text = c.text - img_tag = img_tok if has_image else '' prompt += f"user\n{img_tag}{text}" elif m.role == 'assistant': for c in m.content: @@ -502,7 +495,7 @@ async def fuyu_prompt_from_messages(messages: list[Message], img_tok = "", img_e for c in m.content: if c.type == 'image_url': images.extend([ await url_to_image(c.image_url.url) ]) - p = img_tok + p + img_end + p = img_tok + p + img_end # XXX if c.type == 'text': p += f"{c.text}\n\n" # Question: prompt += p @@ -531,16 +524,15 @@ async def emu_images_prompt_system_from_messages(messages: list[Message], img_to for m in messages: if m.role == 'user': text = '' - has_image = False + img_tag = '' for c in m.content: if c.type == 'image_url': images.extend([ await url_to_image(c.image_url.url) ]) - has_image = True + img_tag += img_tok if c.type == 'text': text = c.text - img_tag = img_tok if has_image else '' prompt += f" [USER]: {img_tag}{text}" elif m.role == 'assistant': for c in m.content: @@ -595,16 +587,15 @@ async def phintern_prompt_from_messages(messages: list[Message], img_tok = "<|user|>\n{img_tag}{text}<|end|>" elif m.role == 'assistant': for c in m.content: @@ -627,16 +618,15 @@ async def falcon_prompt_from_messages(messages: list[Message], img_tok = " str: return 'omchat' if 'got-ocr2' in model_id: - return 'got_ocr2' \ No newline at end of file + return 'got_ocr2' + + if 'aria' in model_id: + return 'aria' \ No newline at end of file