diff --git a/Dockerfile b/Dockerfile
index df1177d..dbadc85 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,7 +12,7 @@ RUN git clone https://github.com/togethercomputer/Dragonfly --single-branch /app
COPY requirements.txt .
ARG VERSION=latest
-RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers>=4.41.2\nautoawq>=0.2.5" >> requirements.txt ; fi
+RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers==4.41.2\nautoawq>=0.2.5" >> requirements.txt ; fi
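+# default build pins transformers==4.41.2 and adds autoawq; VERSION=alt keeps transformers==4.36.2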
# TODO: nvidia apex wheel
RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt
diff --git a/README.md b/README.md
index cebf43b..5e50a7a 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,13 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
## Model support
- [X] [OpenGVLab](https://huggingface.co/OpenGVLab)
+- - [X] [InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B)
+- - [X] [InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B)
+- - [X] [InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B)
+- - [X] [InternVL2-4B](https://huggingface.co/OpenGVLab/InternVL2-4B) (alternate docker only)
+- - [X] [InternVL2-2B](https://huggingface.co/OpenGVLab/InternVL2-2B)
+- - [X] [InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)
+- - [X] [InternVL2-1B](https://huggingface.co/OpenGVLab/InternVL2-1B)
- - [X] [InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) (wont gpu split yet)
- - [X] [InternVL-Chat-V1-5-Int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-Int8) (wont gpu split yet)
- - [X] [Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)
@@ -42,7 +49,7 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
- - [X] [360VL-8B](https://huggingface.co/qihoo360/360VL-8B)
- - [X] [360VL-70B](https://huggingface.co/qihoo360/360VL-70B) (untested)
- [X] [LlavaNext](https://huggingface.co/llava-hf)
-- - [X] [llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) (currently errors, use an image before 0.26.0)
+- - [X] [llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)
- - [X] [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf)
- - [X] [llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf)
- - [X] [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
@@ -106,6 +113,10 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le
## Recent updates
+Version 0.27.0
+
+- new model support: OpenGVLab/InternVL2 series of models (1B, 2B, 4B, 8B*, 26B*, 40B*) - *(currently the top open source models)
+
Version 0.26.0
- new model support: cognitivecomputations/dolphin-vision-72b
diff --git a/backend/idefics2.py b/backend/idefics2.py
index a30b30f..d3b4ef5 100644
--- a/backend/idefics2.py
+++ b/backend/idefics2.py
@@ -9,6 +9,7 @@
# HuggingFaceM4/idefics2-8b-chatty-AWQ
class VisionQnA(VisionQnABase):
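+ # 'internal': this backend builds its own prompts rather than using a shared chat template; the value is shown by loaded_banner()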
+ format: str = 'internal'
model_name: str = "idefics2"
vision_layers: List[str] = ['vision_model', 'connector']
diff --git a/backend/internvl-chat-v1-5.py b/backend/internvl-chat-v1-5.py
index cc6c3fd..26cdb4d 100644
--- a/backend/internvl-chat-v1-5.py
+++ b/backend/internvl-chat-v1-5.py
@@ -9,7 +9,16 @@
# OpenGVLab/InternVL-Chat-V1-5
# OpenGVLab/InternVL-Chat-V1-5-Int8
# OpenGVLab/Mini-InternVL-Chat-2B-V1-5
-# OpenGVLab/Mini-InternVL-Chat-4B-V1-5
+# OpenGVLab/Mini-InternVL-Chat-4B-V1-5 (phintern)
+# OpenGVLab/InternVL2-1B
+# OpenGVLab/InternVL2-2B-AWQ (empty response)
+# OpenGVLab/InternVL2-2B
+# OpenGVLab/InternVL2-4B
+# OpenGVLab/InternVL2-4B (phintern)
+# OpenGVLab/InternVL2-8B
+# OpenGVLab/InternVL2-26B
+# OpenGVLab/InternVL2-40B (yi-34b nous-hermes-2)
+
MAX_TILES = 6
@@ -118,10 +127,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
- if self.format == 'phintern':
- images, prompt = await phintern_prompt_from_messages(request.messages, img_tok='')
- else:
- images, prompt = await chatml_prompt_from_messages(request.messages, img_tok='')
+ images, prompt = await prompt_from_messages(request.messages, self.format)
# TODO: use detail to set max tiles if detail=low (=512)
# if .detail == 'low': max_num=1
@@ -134,11 +140,11 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener
pixel_values = None
if pixel_values is not None:
- image_tokens = '<img>' + '<IMG_CONTEXT>' * self.model.num_image_token * pixel_values.shape[0] + '</img>\n'
- else:
- image_tokens = ''
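+ # replace each '<image>' placeholder in the prompt with that image's expanded token sequence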
+ for img in images:
+ image_tokens = '<img>' + '<IMG_CONTEXT>' * self.model.num_image_token * img.size(0) + '</img>'
+ prompt = prompt.replace('<image>', image_tokens, 1)
- model_inputs = self.tokenizer(image_tokens + prompt, return_tensors='pt')
+ model_inputs = self.tokenizer(prompt, return_tensors='pt')
input_ids = model_inputs['input_ids'].cuda()
attention_mask = model_inputs['attention_mask'].cuda()
diff --git a/backend/mantis.py b/backend/mantis.py
index 69bac5d..a80a577 100644
--- a/backend/mantis.py
+++ b/backend/mantis.py
@@ -4,6 +4,7 @@
from vision_qna import *
class VisionQnA(VisionQnABase):
+ format: str = 'internal'
model_name: str = "mantis"
vision_layers: List[str] = ["vision_tower", "multi_modal_projector"]
diff --git a/backend/minicpm.py b/backend/minicpm.py
index 9248aef..2605069 100644
--- a/backend/minicpm.py
+++ b/backend/minicpm.py
@@ -7,6 +7,7 @@
# openbmb/MiniCPM-V aka OmniLMM-3B
class VisionQnA(VisionQnABase):
+ format: str = 'internal'
model_name: str = "minicpm"
vision_layers: List[str] = ["resampler", "vpm"]
diff --git a/backend/omnilmm12b.py b/backend/omnilmm12b.py
index 86dcfb3..e2ad496 100644
--- a/backend/omnilmm12b.py
+++ b/backend/omnilmm12b.py
@@ -5,6 +5,7 @@
# openbmb/OmniLMM-12B
class VisionQnA(VisionQnABase):
+ format: str = 'internal'
model_name: str = "omnilmm12b"
def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
diff --git a/backend/phi3.py b/backend/phi3.py
index b46c904..ba988a6 100644
--- a/backend/phi3.py
+++ b/backend/phi3.py
@@ -6,6 +6,7 @@
# failspy/Phi-3-vision-128k-instruct-abliterated-alpha
class VisionQnA(VisionQnABase):
+ format: str = 'phi3'
model_name: str = "phi3"
vision_layers: List[str] = ["vision_embed_tokens"]
diff --git a/backend/xcomposer2-vl.py b/backend/xcomposer2-vl.py
index b4263b5..7e718bc 100644
--- a/backend/xcomposer2-vl.py
+++ b/backend/xcomposer2-vl.py
@@ -31,6 +31,7 @@ class InternLMXComposer2QForCausalLM(auto_gptq.modeling.BaseGPTQForCausalLM):
]
class VisionQnA(VisionQnABase):
+ format: str = 'internal'
model_name: str = "xcomposer2-vl"
vision_layers: List[str] = ['vit', 'vision_proj']
diff --git a/backend/xcomposer2.py b/backend/xcomposer2.py
index afc404d..8ca90c0 100644
--- a/backend/xcomposer2.py
+++ b/backend/xcomposer2.py
@@ -26,6 +26,7 @@ class InternLMXComposer2QForCausalLM(auto_gptq.modeling.BaseGPTQForCausalLM):
]
class VisionQnA(VisionQnABase):
+ format: str = 'internal'
model_name: str = "xcomposer2"
vision_layers: List[str] = ['vit', 'vision_proj']
diff --git a/backend/yi-vl.py b/backend/yi-vl.py
index eb31abf..31da906 100644
--- a/backend/yi-vl.py
+++ b/backend/yi-vl.py
@@ -18,7 +18,7 @@
class VisionQnA(VisionQnABase):
model_name: str = "qwen-vl"
- format: 'chatml'
+ format: str = 'chatml'
def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
super().__init__(model_id, device, device_map, extra_params, format)
diff --git a/model_conf_tests.alt.json b/model_conf_tests.alt.json
index 32ee64b..52cf249 100644
--- a/model_conf_tests.alt.json
+++ b/model_conf_tests.alt.json
@@ -1,52 +1,19 @@
[
- ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
- ["OpenGVLab/InternVL-Chat-V1-5", "--load-in-4bit", "--device-map", "cuda:0"],
- ["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"],
- ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5"],
- ["Qwen/Qwen-VL-Chat"],
- ["THUDM/cogagent-chat-hf", "--load-in-4bit"],
- ["THUDM/cogagent-chat-hf"],
- ["THUDM/cogvlm-chat-hf", "--load-in-4bit"],
- ["THUDM/cogvlm-chat-hf"],
- ["THUDM/cogvlm2-llama3-chat-19B"],
- ["THUDM/cogvlm2-llama3-chinese-chat-19B"],
+ ["YanweiLi/MGM-7B", "--load-in-4bit", "--use-flash-attn"],
+ ["YanweiLi/MGM-7B", "--use-flash-attn"],
+ ["YanweiLi/MGM-7B-HD", "--load-in-4bit", "--use-flash-attn"],
+ ["YanweiLi/MGM-7B-HD", "--use-flash-attn"],
["YanweiLi/MGM-13B", "--load-in-4bit", "--use-flash-attn"],
["YanweiLi/MGM-13B", "--use-flash-attn"],
["YanweiLi/MGM-13B-HD", "--load-in-4bit", "--use-flash-attn"],
["YanweiLi/MGM-13B-HD", "--use-flash-attn"],
- ["YanweiLi/MGM-2B", "--use-flash-attn"],
["YanweiLi/MGM-34B", "--load-in-4bit", "--use-flash-attn"],
["YanweiLi/MGM-34B", "--use-flash-attn"],
["YanweiLi/MGM-34B-HD", "--load-in-4bit", "--use-flash-attn"],
["YanweiLi/MGM-34B-HD", "--use-flash-attn"],
- ["YanweiLi/MGM-7B", "--load-in-4bit", "--use-flash-attn"],
- ["YanweiLi/MGM-7B", "--use-flash-attn"],
- ["YanweiLi/MGM-7B-HD", "--load-in-4bit", "--use-flash-attn"],
- ["YanweiLi/MGM-7B-HD", "--use-flash-attn"],
["YanweiLi/MGM-8x7B", "--load-in-4bit", "--use-flash-attn"],
["YanweiLi/MGM-8x7B", "--use-flash-attn"],
["YanweiLi/MGM-8x7B-HD", "--load-in-4bit", "--use-flash-attn"],
["YanweiLi/MGM-8x7B-HD", "--use-flash-attn"],
- ["adept/fuyu-8b", "--device-map", "cuda:0"],
- ["echo840/Monkey"],
- ["echo840/Monkey-Chat"],
- ["internlm/internlm-xcomposer2-4khd-7b", "--use-flash-attn", "--device-map", "cuda:0"],
- ["internlm/internlm-xcomposer2-7b", "--use-flash-attn", "--device-map", "cuda:0"],
- ["internlm/internlm-xcomposer2-7b-4bit", "--use-flash-attn", "--device", "cuda:0"],
- ["internlm/internlm-xcomposer2-vl-1_8b", "--use-flash-attn", "--device-map", "cuda:0"],
- ["internlm/internlm-xcomposer2-vl-7b", "--use-flash-attn", "--device-map", "cuda:0"],
- ["internlm/internlm-xcomposer2-vl-7b-4bit", "--use-flash-attn", "--device", "cuda:0"],
- ["llava-hf/bakLlava-v1-hf", "--load-in-4bit", "--use-flash-attn"],
- ["llava-hf/bakLlava-v1-hf", "--use-flash-attn", "--device-map", "cuda:0"],
- ["llava-hf/llava-1.5-13b-hf", "--load-in-4bit", "--use-flash-attn"],
- ["llava-hf/llava-1.5-13b-hf", "--use-flash-attn", "--device-map", "cuda:0"],
- ["llava-hf/llava-1.5-7b-hf", "--load-in-4bit", "--use-flash-attn"],
- ["llava-hf/llava-1.5-7b-hf", "--use-flash-attn", "--device-map", "cuda:0"],
- ["openbmb/MiniCPM-Llama3-V-2_5", "--use-flash-attn", "--device-map", "cuda:0"],
- ["openbmb/MiniCPM-V", "--use-flash-attn", "--device-map", "cuda:0"],
- ["openbmb/MiniCPM-V-2", "--use-flash-attn", "--device-map", "cuda:0"],
- ["qihoo360/360VL-8B", "--use-flash-attn", "--load-in-4bit"],
- ["qihoo360/360VL-8B", "--use-flash-attn"],
- ["vikhyatk/moondream1"],
- ["vikhyatk/moondream2", "--use-flash-attn"]
+ ["vikhyatk/moondream1"]
]
diff --git a/model_conf_tests.json b/model_conf_tests.json
index afb261f..21080cb 100644
--- a/model_conf_tests.json
+++ b/model_conf_tests.json
@@ -25,6 +25,18 @@
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"],
+ ["OpenGVLab/InternVL2-1B", "--device-map", "cuda:0", "--load-in-4bit"],
+ ["OpenGVLab/InternVL2-1B", "--device-map", "cuda:0"],
+ ["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0", "--load-in-4bit"],
+ ["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0"],
+ ["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0", "--load-in-4bit"],
+ ["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0"],
+ ["OpenGVLab/InternVL2-8B", "--device-map", "cuda:0", "--load-in-4bit"],
+ ["OpenGVLab/InternVL2-8B", "--device-map", "cuda:0"],
+ ["OpenGVLab/InternVL2-26B", "--device-map", "cuda:0", "--load-in-4bit"],
+ ["OpenGVLab/InternVL2-26B", "--device-map", "cuda:0"],
+ ["OpenGVLab/InternVL2-40B", "--device-map", "cuda:0", "--load-in-4bit"],
+ ["OpenGVLab/InternVL2-40B", "--device-map", "cuda:0"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40"],
diff --git a/test_models.py b/test_models.py
index ab681ae..4144282 100755
--- a/test_models.py
+++ b/test_models.py
@@ -309,14 +309,18 @@ def no_image_response(prompt):
print(f"### End tests.")
- print("""# This sample env file can be used to set environment variables for the docker-compose.yml
+ fname = f"sample.env-{time.time()}"
+ with open(fname,'w') as results_file:
+ print("""# This sample env file can be used to set environment variables for the docker-compose.yml
# Copy this file to vision.env and uncomment the model of your choice.
HF_HOME=hf_home
HF_HUB_ENABLE_HF_TRANSFER=1
#HF_TOKEN=hf-...
-#CUDA_VISIBLE_DEVICES=1,0""")
+#CUDA_VISIBLE_DEVICES=1,0""", file=results_file)
- for r in all_results:
- cmdl = ' '.join(r['args'])
- result = all(r['results'])
- print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {green_pass if result else red_fail}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}")
+ for r in all_results:
+ cmdl = ' '.join(r['args'])
+ result = all(r['results'])
+ print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {green_pass if result else red_fail}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}", file=results_file)
+
+ print(open(fname,'r').read())
diff --git a/vision-alt.sample.env b/vision-alt.sample.env
index e4838b4..0f01d86 100644
--- a/vision-alt.sample.env
+++ b/vision-alt.sample.env
@@ -2,54 +2,14 @@
# Copy this file to vision.env and uncomment the model of your choice.
HF_HOME=hf_home
HF_HUB_ENABLE_HF_TRANSFER=1
+#HF_TOKEN=hf-...
#CUDA_VISIBLE_DEVICES=1,0
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 14.3s, mem: 52.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.1s, mem: 18.2GB, 2/8 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.0s, mem: 31.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 3.5s, mem: 7.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 4.5s, mem: 19.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 20.5s, mem: 12.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 14.7s, mem: 37.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 19.2s, mem: 12.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 13.2s, mem: 36.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 22.2s, mem: 40.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 64.7s, mem: 40.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --load-in-4bit --use-flash-attn" # test pass✅, time: 31.6s, mem: 10.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass✅, time: 22.4s, mem: 27.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 26.4s, mem: 14.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass✅, time: 20.7s, mem: 31.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit).
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --load-in-4bit --use-flash-attn" # test pass✅, time: 16.7s, mem: 21.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass✅, time: 11.0s, mem: 67.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 180.1s, mem: 24.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass✅, time: 118.0s, mem: 70.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --load-in-4bit --use-flash-attn" # test pass✅, time: 11.4s, mem: 6.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass✅, time: 5.5s, mem: 15.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 35.4s, mem: 9.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass✅, time: 15.4s, mem: 18.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --load-in-4bit --use-flash-attn" # test pass✅, time: 22.7s, mem: 26.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass✅, time: 14.7s, mem: 91.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 24.4s, mem: 29.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass✅, time: 18.4s, mem: 95.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.1s, mem: 24.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.1s, mem: 21.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 7.4s, mem: 21.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 17.9s, mem: 25.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.6s, mem: 18.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 11.2s, mem: 9.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.3s, mem: 7.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.8s, mem: 20.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 10.7s, mem: 10.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.3s, mem: 5.8GB, 0/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 1.8s, mem: 15.5GB, 0/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.0s, mem: 8.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.7s, mem: 26.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.1s, mem: 5.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.5s, mem: 14.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit).
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.9s, mem: 7.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.6s, mem: 11.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 7.7s, mem: 7.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 5.2s, mem: 17.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass✅, time: 4.0s, mem: 4.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.1s, mem: 4.5GB, 8/8 tests passed.
\ No newline at end of file
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass✅, time: 8.0s, mem: 15.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass✅, time: 23.9s, mem: 18.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass✅, time: 31.3s, mem: 27.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass✅, time: 27.9s, mem: 31.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass✅, time: 16.2s, mem: 67.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass✅, time: 147.6s, mem: 70.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass✅, time: 20.8s, mem: 91.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass✅, time: 26.6s, mem: 96.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass✅, time: 5.5s, mem: 4.9GB, 12/12 tests passed.
diff --git a/vision.sample.env b/vision.sample.env
index aab960b..11f81a1 100644
--- a/vision.sample.env
+++ b/vision.sample.env
@@ -4,114 +4,126 @@ HF_HOME=hf_home
HF_HUB_ENABLE_HF_TRANSFER=1
#HF_TOKEN=hf-...
#CUDA_VISIBLE_DEVICES=1,0
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 9.2s, mem: 8.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 8.2s, mem: 19.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.9s, mem: 9.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.9s, mem: 11.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 12.9s, mem: 9.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 9.4s, mem: 12.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 8.1s, mem: 12.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 10.2s, mem: 5.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 7.3s, mem: 12.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 10.1s, mem: 5.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.9s, mem: 13.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 11.2s, mem: 9.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 10.1s, mem: 19.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 29.8s, mem: 29.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 22.1s, mem: 71.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.2s, mem: 13.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.4s, mem: 22.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 13.5s, mem: 13.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 16.9s, mem: 13.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0" # test pass✅, time: 14.0s, mem: 22.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 15.3s, mem: 12.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 20.5s, mem: 27.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 27.4s, mem: 30.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 26.5s, mem: 54.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 20.4s, mem: 52.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 40.4s, mem: 31.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.3s, mem: 5.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 6.7s, mem: 7.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.4s, mem: 9.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.6s, mem: 7.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 10.4s, mem: 6.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 13.6s, mem: 12.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40" # test pass✅, time: 13.1s, mem: 15.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 9.9s, mem: 11.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.8s, mem: 11.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.5s, mem: 19.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 10.6s, mem: 10.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 26.9s, mem: 13.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 22.0s, mem: 37.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 29.9s, mem: 12.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 22.1s, mem: 36.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 34.4s, mem: 22.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 33.1s, mem: 40.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 151.2s, mem: 22.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 104.0s, mem: 40.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 78.9s, mem: 16.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test pass✅, time: 53.7s, mem: 28.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.9s, mem: 11.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 9.2s, mem: 20.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.9s, mem: 7.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.1s, mem: 17.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.6s, mem: 8.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.1s, mem: 18.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 6.2s, mem: 8.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.5s, mem: 16.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 19.0s, mem: 25.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b --use-flash-attn --load-in-4bit" # test pass✅, time: 53.8s, mem: 49.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey --load-in-4bit" # test pass✅, time: 9.0s, mem: 15.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 8.6s, mem: 22.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 12.5s, mem: 15.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.2s, mem: 22.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn --load-in-4bit" # test pass✅, time: 11.4s, mem: 7.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn" # test pass✅, time: 9.5s, mem: 12.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.5s, mem: 7.3GB, 0/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 20.0s, mem: 20.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.8s, mem: 6.5GB, 0/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 28.8s, mem: 19.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 15.8s, mem: 9.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.9s, mem: 3.1GB, 0/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.5s, mem: 7.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.8s, mem: 6.6GB, 0/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 21.9s, mem: 20.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 14.9s, mem: 11.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 13.1s, mem: 9.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.5s, mem: 26.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.0s, mem: 5.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.2s, mem: 14.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn --load-in-4bit" # test fail❌, time: 3.9s, mem: 21.3GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500}
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test fail❌, time: 3.6s, mem: 66.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500}
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 23.6s, mem: 8.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 19.4s, mem: 17.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 15.8s, mem: 17.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 14.4s, mem: 34.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 13.7s, mem: 9.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 12.8s, mem: 19.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.0s, mem: 1.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.3s, mem: 1.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.7s, mem: 1.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.8s, mem: 2.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn --load-in-4bit" # test pass✅, time: 11.5s, mem: 7.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn" # test pass✅, time: 9.1s, mem: 12.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.4s, mem: 9.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test pass✅, time: 10.3s, mem: 19.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.8s, mem: 5.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 8.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.8s, mem: 5.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 10.8s, mem: 9.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 11.5s, mem: 8.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 8.8s, mem: 17.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.4s, mem: 8.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.5s, mem: 8.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 9.8s, mem: 8.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 10.8s, mem: 17.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 16.6s, mem: 17.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.8s, mem: 32.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 11.4s, mem: 8.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 10.1s, mem: 17.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 10.4s, mem: 8.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 14.4s, mem: 17.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn --load-in-4bit" # test pass✅, time: 7.4s, mem: 3.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 6.0s, mem: 4.8GB, 12/12 tests passed.
\ No newline at end of file
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 8.2s, mem: 8.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 7.6s, mem: 19.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.1s, mem: 9.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.6s, mem: 10.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 11.2s, mem: 8.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 8.3s, mem: 11.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.3s, mem: 12.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 8.7s, mem: 5.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.6s, mem: 12.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 9.2s, mem: 5.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.2s, mem: 13.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 9.6s, mem: 9.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 9.6s, mem: 19.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 25.0s, mem: 29.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 21.5s, mem: 71.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.7s, mem: 12.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.0s, mem: 22.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 12.7s, mem: 12.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.5s, mem: 12.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0" # test pass✅, time: 13.4s, mem: 22.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 14.1s, mem: 12.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 19.9s, mem: 26.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 26.2s, mem: 30.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 24.8s, mem: 54.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 17.9s, mem: 52.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 34.6s, mem: 31.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.6s, mem: 3.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 6.6s, mem: 4.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.8s, mem: 4.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.2s, mem: 6.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.6s, mem: 5.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 7.7s, mem: 10.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.1s, mem: 8.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 7.4s, mem: 18.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.4s, mem: 26.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 19.9s, mem: 52.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 55.6s, mem: 32.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 68.4s, mem: 77.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.1s, mem: 5.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 6.8s, mem: 6.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.7s, mem: 8.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.7s, mem: 7.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 9.0s, mem: 6.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 9.3s, mem: 9.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40" # test pass✅, time: 8.4s, mem: 14.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 7.4s, mem: 11.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 8.7s, mem: 11.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.0s, mem: 19.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 9.7s, mem: 10.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 24.9s, mem: 13.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 21.0s, mem: 37.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 25.9s, mem: 12.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 19.3s, mem: 36.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 31.5s, mem: 22.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 31.8s, mem: 40.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 135.0s, mem: 22.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 99.3s, mem: 40.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 70.9s, mem: 16.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test pass✅, time: 50.9s, mem: 27.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.1s, mem: 11.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.7s, mem: 20.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.1s, mem: 7.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.7s, mem: 17.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.9s, mem: 8.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 18.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 5.3s, mem: 8.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.0s, mem: 15.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 18.6s, mem: 25.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b --use-flash-attn --load-in-4bit" # test pass✅, time: 48.0s, mem: 49.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey --load-in-4bit" # test pass✅, time: 9.3s, mem: 15.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 8.8s, mem: 21.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 12.6s, mem: 15.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.2s, mem: 21.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn --load-in-4bit" # test pass✅, time: 10.5s, mem: 7.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn" # test pass✅, time: 9.0s, mem: 12.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.4s, mem: 7.2GB, 0/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 19.8s, mem: 20.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.4s, mem: 6.4GB, 0/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 25.7s, mem: 19.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 13.7s, mem: 9.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.7s, mem: 3.0GB, 0/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.6s, mem: 7.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.5s, mem: 6.5GB, 0/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 24.1s, mem: 20.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 13.5s, mem: 10.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.1s, mem: 9.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.2s, mem: 26.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.6s, mem: 5.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.9s, mem: 14.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 58.6s, mem: 23.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 69.8s, mem: 68.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 20.7s, mem: 8.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 18.6s, mem: 17.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 14.3s, mem: 17.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 13.9s, mem: 33.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 13.0s, mem: 9.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 12.6s, mem: 19.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.0s, mem: 1.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.5s, mem: 1.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.4s, mem: 1.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.7s, mem: 2.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn --load-in-4bit" # test pass✅, time: 10.2s, mem: 7.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn" # test pass✅, time: 8.5s, mem: 12.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.1s, mem: 9.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test pass✅, time: 10.5s, mem: 19.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.0s, mem: 4.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.2s, mem: 8.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.3s, mem: 3.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500}
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.7s, mem: 8.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 9.4s, mem: 8.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 7.6s, mem: 17.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.0s, mem: 7.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 8.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 9.0s, mem: 7.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.8s, mem: 17.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 14.6s, mem: 17.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.3s, mem: 32.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 10.9s, mem: 7.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.8s, mem: 17.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 9.6s, mem: 7.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 12.7s, mem: 17.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn --load-in-4bit" # test pass✅, time: 6.2s, mem: 2.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 5.0s, mem: 4.6GB, 12/12 tests passed.
diff --git a/vision_qna.py b/vision_qna.py
index b240a68..4c956bc 100644
--- a/vision_qna.py
+++ b/vision_qna.py
@@ -89,7 +89,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
torch.set_grad_enabled(False)
def loaded_banner(self):
- logger.info(f"Loaded {self._model_id} on device: {self.model.device} with dtype: {self.model.dtype}")
+ logger.info(f"Loaded {self._model_id} on device: {self.model.device} with dtype: {self.model.dtype} and template: {self.format}")
def select_device(self):
return 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
@@ -707,7 +707,7 @@ def guess_model_format(model_name: str) -> str:
model_id = model_name.lower()
model_format_match_map = {
- 'chatml': ['34b', 'yi-6b', 'nanollava', 'internvl-chat-v1-5', 'internvl-chat-2b'],
+ 'chatml': ['34b', 'yi-6b', 'nanollava', 'internvl-chat-v1-5', 'internvl-chat-2b', 'internvl2-'],
'falcon': ['falcon'],
'florence': ['florence'],
'fuyu': ['fuyu'],
@@ -717,14 +717,19 @@ def guess_model_format(model_name: str) -> str:
'llama3': ['llama-3-vision', '360vl'],
'phi15': ['moondream1', 'moondream2', 'monkey'],
'phi3': ['phi3', 'phi-3'],
- 'phintern': ['internvl-chat-4b'],
+ 'phintern': ['internvl-chat-4b', 'opengvlab/internvl2-4b'],
'vicuna': ['vicuna', '13b'],
'vicuna0': ['yi-vl'],
}
+ # Exact match first
+ for format, options in model_format_match_map.items():
+ if model_id in options:
+ return format
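+ # fall back to substring matching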
for format, options in model_format_match_map.items():
if any(x in model_id for x in options):
return format
+
return 'vicuna'
def guess_backend(model_name: str) -> str:
@@ -792,6 +797,9 @@ def guess_backend(model_name: str) -> str:
if 'internvl-chat' in model_id and '-v1-5' in model_id:
return 'internvl-chat-v1-5'
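+ # the InternVL2 series reuses the internvl-chat-v1-5 backend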
+ if 'internvl2-' in model_id:
+ return 'internvl-chat-v1-5'
+
if 'idefics2' in model_id:
return 'idefics2'