From 492520fa275b2078118689bcf5656db6385e92d8 Mon Sep 17 00:00:00 2001 From: matatonic Date: Sun, 14 Jul 2024 21:46:54 -0400 Subject: [PATCH 01/13] internvl2 wip --- backend/internvl-chat-v1-5.py | 9 +++++++++ model_conf_tests.json | 17 +++++++++++++++++ vision_qna.py | 5 ++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/backend/internvl-chat-v1-5.py b/backend/internvl-chat-v1-5.py index cc6c3fd..1b7e5b0 100644 --- a/backend/internvl-chat-v1-5.py +++ b/backend/internvl-chat-v1-5.py @@ -10,6 +10,15 @@ # OpenGVLab/InternVL-Chat-V1-5-Int8 # OpenGVLab/Mini-InternVL-Chat-2B-V1-5 # OpenGVLab/Mini-InternVL-Chat-4B-V1-5 +# OpenGVLab/InternVL2-1B +# OpenGVLab/InternVL2-2B-AWQ +# OpenGVLab/InternVL2-2B +# OpenGVLab/InternVL2-4B +# OpenGVLab/InternVL2-4B +# OpenGVLab/InternVL2-8B +# OpenGVLab/InternVL2-26B +# OpenGVLab/InternVL2-40B + MAX_TILES = 6 diff --git a/model_conf_tests.json b/model_conf_tests.json index afb261f..82c78e7 100644 --- a/model_conf_tests.json +++ b/model_conf_tests.json @@ -1,4 +1,13 @@ [ + ["OpenGVLab/InternVL2-1B"], + ["OpenGVLab/InternVL2-2B-AWQ"], + ["OpenGVLab/InternVL2-2B"], + ["OpenGVLab/InternVL2-4B"], + ["OpenGVLab/InternVL2-4B"], + ["OpenGVLab/InternVL2-8B"], + ["OpenGVLab/InternVL2-26B"], + ["OpenGVLab/InternVL2-40B"], + ["BAAI/Bunny-Llama-3-8B-V", "--load-in-4bit"], ["BAAI/Bunny-Llama-3-8B-V"], ["BAAI/Bunny-v1_0-2B-zh", "--load-in-4bit"], @@ -25,6 +34,14 @@ ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"], ["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"], + ["OpenGVLab/InternVL2-1B"], + ["OpenGVLab/InternVL2-2B-AWQ"], + ["OpenGVLab/InternVL2-2B"], + ["OpenGVLab/InternVL2-4B"], + ["OpenGVLab/InternVL2-4B"], + ["OpenGVLab/InternVL2-8B"], + ["OpenGVLab/InternVL2-26B"], + ["OpenGVLab/InternVL2-40B"], ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--load-in-4bit"], ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40", "--load-in-4bit"], ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40"], diff --git a/vision_qna.py b/vision_qna.py index b240a68..633584d 100644 --- a/vision_qna.py +++ b/vision_qna.py @@ -707,7 +707,7 @@ def guess_model_format(model_name: str) -> str: model_id = model_name.lower() model_format_match_map = { - 'chatml': ['34b', 'yi-6b', 'nanollava', 'internvl-chat-v1-5', 'internvl-chat-2b'], + 'chatml': ['34b', 'yi-6b', 'nanollava', 'internvl-chat-v1-5', 'internvl-chat-2b', 'internvl2-'], 'falcon': ['falcon'], 'florence': ['florence'], 'fuyu': ['fuyu'], @@ -792,6 +792,9 @@ def guess_backend(model_name: str) -> str: if 'internvl-chat' in model_id and '-v1-5' in model_id: return 'internvl-chat-v1-5' + if 'internvl2-' in model_id: + return 'internvl-chat-v1-5' + if 'idefics2' in model_id: return 'idefics2' From 94fb0dfe7bf8a77e795651cce7dbc5ab43acdef3 Mon Sep 17 00:00:00 2001 From: matatonic Date: Sun, 14 Jul 2024 22:57:14 -0400 Subject: [PATCH 02/13] wip --- backend/internvl-chat-v1-5.py | 8 ++++---- model_conf_tests.json | 7 ------- vision_qna.py | 7 ++++++- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/backend/internvl-chat-v1-5.py b/backend/internvl-chat-v1-5.py index 1b7e5b0..e141260 100644 --- a/backend/internvl-chat-v1-5.py +++ b/backend/internvl-chat-v1-5.py @@ -9,15 +9,15 @@ # OpenGVLab/InternVL-Chat-V1-5 # OpenGVLab/InternVL-Chat-V1-5-Int8 # OpenGVLab/Mini-InternVL-Chat-2B-V1-5 -# OpenGVLab/Mini-InternVL-Chat-4B-V1-5 +# OpenGVLab/Mini-InternVL-Chat-4B-V1-5 (phintern) # OpenGVLab/InternVL2-1B -# 
OpenGVLab/InternVL2-2B-AWQ +# OpenGVLab/InternVL2-2B-AWQ (empty response) # OpenGVLab/InternVL2-2B # OpenGVLab/InternVL2-4B -# OpenGVLab/InternVL2-4B +# OpenGVLab/InternVL2-4B (phintern) # OpenGVLab/InternVL2-8B # OpenGVLab/InternVL2-26B -# OpenGVLab/InternVL2-40B +# OpenGVLab/InternVL2-40B (yi-34- nous-hermes-2) MAX_TILES = 6 diff --git a/model_conf_tests.json b/model_conf_tests.json index 82c78e7..b00b9c8 100644 --- a/model_conf_tests.json +++ b/model_conf_tests.json @@ -1,11 +1,5 @@ [ - ["OpenGVLab/InternVL2-1B"], - ["OpenGVLab/InternVL2-2B-AWQ"], - ["OpenGVLab/InternVL2-2B"], - ["OpenGVLab/InternVL2-4B"], ["OpenGVLab/InternVL2-4B"], - ["OpenGVLab/InternVL2-8B"], - ["OpenGVLab/InternVL2-26B"], ["OpenGVLab/InternVL2-40B"], ["BAAI/Bunny-Llama-3-8B-V", "--load-in-4bit"], @@ -35,7 +29,6 @@ ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"], ["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"], ["OpenGVLab/InternVL2-1B"], - ["OpenGVLab/InternVL2-2B-AWQ"], ["OpenGVLab/InternVL2-2B"], ["OpenGVLab/InternVL2-4B"], ["OpenGVLab/InternVL2-4B"], diff --git a/vision_qna.py b/vision_qna.py index 633584d..380eddf 100644 --- a/vision_qna.py +++ b/vision_qna.py @@ -717,14 +717,19 @@ def guess_model_format(model_name: str) -> str: 'llama3': ['llama-3-vision', '360vl'], 'phi15': ['moondream1', 'moondream2', 'monkey'], 'phi3': ['phi3', 'phi-3'], - 'phintern': ['internvl-chat-4b'], + 'phintern': ['internvl-chat-4b', 'opengvlab/internvl2-4b'], 'vicuna': ['vicuna', '13b'], 'vicuna0': ['yi-vl'], } + # Exact match first + for format, options in model_format_match_map.items(): + if options == model_id: + return format for format, options in model_format_match_map.items(): if any(x in model_id for x in options): return format + return 'vicuna' def guess_backend(model_name: str) -> str: From 0e247995cf9c457360d8797630f32f47bce62009 Mon Sep 17 00:00:00 2001 From: matatonic Date: Sun, 14 Jul 2024 23:19:31 -0400 Subject: [PATCH 03/13] wip --- vision_qna.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_qna.py b/vision_qna.py index 380eddf..778dcda 100644 --- a/vision_qna.py +++ b/vision_qna.py @@ -89,7 +89,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p torch.set_grad_enabled(False) def loaded_banner(self): - logger.info(f"Loaded {self._model_id} on device: {self.model.device} with dtype: {self.model.dtype}") + logger.info(f"Loaded {self._model_id} on device: {self.model.device} with dtype: {self.model.dtype} and template: {self.format}") def select_device(self): return 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' From f841008b65ee5563b3abbb4b3ea6c31e8ea25bbf Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 15 Jul 2024 00:02:54 -0400 Subject: [PATCH 04/13] wip --- README.md | 12 +++++++++++ model_conf_tests.alt.json | 44 ++++++--------------------------------- model_conf_tests.json | 10 +++++---- vision_qna.py | 2 +- 4 files changed, 25 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index cebf43b..6ef48f5 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,13 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview` ## Model support - [X] [OpenGVLab](https://huggingface.co/OpenGVLab) +- - [ ] [InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B) (currently errors) +- - [X] [InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B) +- - [X] [InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B) +- - [X] 
[InternVL2-4B](https://huggingface.co/OpenGVLab/InternVL2-4B) (alternate docker only) +- - [X] [InternVL2-2B](https://huggingface.co/OpenGVLab/InternVL2-2B) +- - [X] [InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ) +- - [X] [InternVL2-1B](https://huggingface.co/OpenGVLab/InternVL2-1B) - - [X] [InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) (wont gpu split yet) - - [X] [InternVL-Chat-V1-5-Int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-Int8) (wont gpu split yet) - - [X] [Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5) @@ -106,6 +113,11 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le ## Recent updates +Version 0.27.0 + +- new model support: OpenGVLab/InternVL2 series of models (40B still has errors, 4B requires alternate docker image) +- + Version 0.26.0 - new model support: cognitivecomputations/dolphin-vision-72b diff --git a/model_conf_tests.alt.json b/model_conf_tests.alt.json index 32ee64b..d8ec19a 100644 --- a/model_conf_tests.alt.json +++ b/model_conf_tests.alt.json @@ -1,52 +1,20 @@ [ - ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"], - ["OpenGVLab/InternVL-Chat-V1-5", "--load-in-4bit", "--device-map", "cuda:0"], - ["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"], - ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5"], - ["Qwen/Qwen-VL-Chat"], - ["THUDM/cogagent-chat-hf", "--load-in-4bit"], - ["THUDM/cogagent-chat-hf"], - ["THUDM/cogvlm-chat-hf", "--load-in-4bit"], - ["THUDM/cogvlm-chat-hf"], - ["THUDM/cogvlm2-llama3-chat-19B"], - ["THUDM/cogvlm2-llama3-chinese-chat-19B"], + ["OpenGVLab/InternVL2-4B"], + ["YanweiLi/MGM-7B", "--load-in-4bit", "--use-flash-attn"], + ["YanweiLi/MGM-7B", "--use-flash-attn"], + ["YanweiLi/MGM-7B-HD", "--load-in-4bit", "--use-flash-attn"], + ["YanweiLi/MGM-7B-HD", "--use-flash-attn"], ["YanweiLi/MGM-13B", "--load-in-4bit", "--use-flash-attn"], ["YanweiLi/MGM-13B", "--use-flash-attn"], ["YanweiLi/MGM-13B-HD", "--load-in-4bit", "--use-flash-attn"], ["YanweiLi/MGM-13B-HD", "--use-flash-attn"], - ["YanweiLi/MGM-2B", "--use-flash-attn"], ["YanweiLi/MGM-34B", "--load-in-4bit", "--use-flash-attn"], ["YanweiLi/MGM-34B", "--use-flash-attn"], ["YanweiLi/MGM-34B-HD", "--load-in-4bit", "--use-flash-attn"], ["YanweiLi/MGM-34B-HD", "--use-flash-attn"], - ["YanweiLi/MGM-7B", "--load-in-4bit", "--use-flash-attn"], - ["YanweiLi/MGM-7B", "--use-flash-attn"], - ["YanweiLi/MGM-7B-HD", "--load-in-4bit", "--use-flash-attn"], - ["YanweiLi/MGM-7B-HD", "--use-flash-attn"], ["YanweiLi/MGM-8x7B", "--load-in-4bit", "--use-flash-attn"], ["YanweiLi/MGM-8x7B", "--use-flash-attn"], ["YanweiLi/MGM-8x7B-HD", "--load-in-4bit", "--use-flash-attn"], ["YanweiLi/MGM-8x7B-HD", "--use-flash-attn"], - ["adept/fuyu-8b", "--device-map", "cuda:0"], - ["echo840/Monkey"], - ["echo840/Monkey-Chat"], - ["internlm/internlm-xcomposer2-4khd-7b", "--use-flash-attn", "--device-map", "cuda:0"], - ["internlm/internlm-xcomposer2-7b", "--use-flash-attn", "--device-map", "cuda:0"], - ["internlm/internlm-xcomposer2-7b-4bit", "--use-flash-attn", "--device", "cuda:0"], - ["internlm/internlm-xcomposer2-vl-1_8b", "--use-flash-attn", "--device-map", "cuda:0"], - ["internlm/internlm-xcomposer2-vl-7b", "--use-flash-attn", "--device-map", "cuda:0"], - ["internlm/internlm-xcomposer2-vl-7b-4bit", "--use-flash-attn", "--device", "cuda:0"], - ["llava-hf/bakLlava-v1-hf", "--load-in-4bit", "--use-flash-attn"], - ["llava-hf/bakLlava-v1-hf", "--use-flash-attn", "--device-map", 
"cuda:0"], - ["llava-hf/llava-1.5-13b-hf", "--load-in-4bit", "--use-flash-attn"], - ["llava-hf/llava-1.5-13b-hf", "--use-flash-attn", "--device-map", "cuda:0"], - ["llava-hf/llava-1.5-7b-hf", "--load-in-4bit", "--use-flash-attn"], - ["llava-hf/llava-1.5-7b-hf", "--use-flash-attn", "--device-map", "cuda:0"], - ["openbmb/MiniCPM-Llama3-V-2_5", "--use-flash-attn", "--device-map", "cuda:0"], - ["openbmb/MiniCPM-V", "--use-flash-attn", "--device-map", "cuda:0"], - ["openbmb/MiniCPM-V-2", "--use-flash-attn", "--device-map", "cuda:0"], - ["qihoo360/360VL-8B", "--use-flash-attn", "--load-in-4bit"], - ["qihoo360/360VL-8B", "--use-flash-attn"], - ["vikhyatk/moondream1"], - ["vikhyatk/moondream2", "--use-flash-attn"] + ["vikhyatk/moondream1"] ] diff --git a/model_conf_tests.json b/model_conf_tests.json index b00b9c8..ad86b0a 100644 --- a/model_conf_tests.json +++ b/model_conf_tests.json @@ -1,7 +1,4 @@ [ - ["OpenGVLab/InternVL2-4B"], - ["OpenGVLab/InternVL2-40B"], - ["BAAI/Bunny-Llama-3-8B-V", "--load-in-4bit"], ["BAAI/Bunny-Llama-3-8B-V"], ["BAAI/Bunny-v1_0-2B-zh", "--load-in-4bit"], @@ -28,12 +25,17 @@ ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"], ["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"], + ["OpenGVLab/InternVL2-1B", "--load-in-4bit"], ["OpenGVLab/InternVL2-1B"], + ["OpenGVLab/InternVL2-2B", "--load-in-4bit"], ["OpenGVLab/InternVL2-2B"], + ["OpenGVLab/InternVL2-4B", "--load-in-4bit"], ["OpenGVLab/InternVL2-4B"], - ["OpenGVLab/InternVL2-4B"], + ["OpenGVLab/InternVL2-8B", "--load-in-4bit"], ["OpenGVLab/InternVL2-8B"], + ["OpenGVLab/InternVL2-26B", "--load-in-4bit"], ["OpenGVLab/InternVL2-26B"], + ["OpenGVLab/InternVL2-40B", "--load-in-4bit"], ["OpenGVLab/InternVL2-40B"], ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--load-in-4bit"], ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40", "--load-in-4bit"], diff --git a/vision_qna.py b/vision_qna.py index 778dcda..4c956bc 100644 --- a/vision_qna.py +++ b/vision_qna.py @@ -723,7 +723,7 @@ def guess_model_format(model_name: str) -> str: } # Exact match first for format, options in model_format_match_map.items(): - if options == model_id: + if model_id in options: return format for format, options in model_format_match_map.items(): if any(x in model_id for x in options): From 3272d930b08996113420ab2bfbcc9adb2953315f Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 15 Jul 2024 01:10:45 -0400 Subject: [PATCH 05/13] wip --- test_models.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/test_models.py b/test_models.py index ab681ae..4144282 100755 --- a/test_models.py +++ b/test_models.py @@ -309,14 +309,18 @@ def no_image_response(prompt): print(f"### End tests.") - print("""# This sample env file can be used to set environment variables for the docker-compose.yml + fname = f"sample.env-{time.time()}" + with open(fname,'w') as results_file: + print("""# This sample env file can be used to set environment variables for the docker-compose.yml # Copy this file to vision.env and uncomment the model of your choice. HF_HOME=hf_home HF_HUB_ENABLE_HF_TRANSFER=1 #HF_TOKEN=hf-... 
-#CUDA_VISIBLE_DEVICES=1,0""") +#CUDA_VISIBLE_DEVICES=1,0""", file=results_file) - for r in all_results: - cmdl = ' '.join(r['args']) - result = all(r['results']) - print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {green_pass if result else red_fail}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}") + for r in all_results: + cmdl = ' '.join(r['args']) + result = all(r['results']) + print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {green_pass if result else red_fail}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}", file=results_file) + + print(open(fname,'r').read()) From e06d448ecaaf6f22377b962c75e0a0b30d7da305 Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 15 Jul 2024 02:03:51 -0400 Subject: [PATCH 06/13] set format for all backends --- backend/idefics2.py | 1 + backend/internvl-chat-v1-5.py | 3 ++- backend/mantis.py | 1 + backend/minicpm.py | 1 + backend/omnilmm12b.py | 1 + backend/phi3.py | 1 + backend/xcomposer2-vl.py | 1 + backend/xcomposer2.py | 1 + backend/yi-vl.py | 2 +- 9 files changed, 10 insertions(+), 2 deletions(-) diff --git a/backend/idefics2.py b/backend/idefics2.py index a30b30f..d3b4ef5 100644 --- a/backend/idefics2.py +++ b/backend/idefics2.py @@ -9,6 +9,7 @@ # HuggingFaceM4/idefics2-8b-chatty-AWQ class VisionQnA(VisionQnABase): + format: str = 'internal' model_name: str = "idefics2" vision_layers: List[str] = ['vision_model', 'connector'] diff --git a/backend/internvl-chat-v1-5.py b/backend/internvl-chat-v1-5.py index e141260..47bc423 100644 --- a/backend/internvl-chat-v1-5.py +++ b/backend/internvl-chat-v1-5.py @@ -111,7 +111,8 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p self.max_tiles = extra_params.get('max_tiles', MAX_TILES) - self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) + use_fast = False if '40b' in model_id.lowe() else True + self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False), use_fast=use_fast) self.model = AutoModel.from_pretrained(**self.params).eval() self.model.img_context_token_id = self.tokenizer.convert_tokens_to_ids('') diff --git a/backend/mantis.py b/backend/mantis.py index 69bac5d..a80a577 100644 --- a/backend/mantis.py +++ b/backend/mantis.py @@ -4,6 +4,7 @@ from vision_qna import * class VisionQnA(VisionQnABase): + format: str = 'internal' model_name: str = "mantis" vision_layers: List[str] = ["vision_tower", "multi_modal_projector"] diff --git a/backend/minicpm.py b/backend/minicpm.py index 9248aef..2605069 100644 --- a/backend/minicpm.py +++ b/backend/minicpm.py @@ -7,6 +7,7 @@ # openbmb/MiniCPM-V aka OmniLMM-3B class VisionQnA(VisionQnABase): + format: str = 'internal' model_name: str = "minicpm" vision_layers: List[str] = ["resampler", "vpm"] diff --git a/backend/omnilmm12b.py b/backend/omnilmm12b.py index 86dcfb3..e2ad496 100644 --- a/backend/omnilmm12b.py +++ b/backend/omnilmm12b.py @@ -5,6 +5,7 @@ # openbmb/OmniLMM-12B class VisionQnA(VisionQnABase): + format: str = 'internal' model_name: str = "omnilmm12b" def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): diff --git a/backend/phi3.py b/backend/phi3.py index b46c904..ba988a6 100644 --- a/backend/phi3.py +++ b/backend/phi3.py @@ -6,6 +6,7 @@ # failspy/Phi-3-vision-128k-instruct-abliterated-alpha class VisionQnA(VisionQnABase): + format: str = 'phi3' model_name: str = "phi3" vision_layers: List[str] = 
["vision_embed_tokens"] diff --git a/backend/xcomposer2-vl.py b/backend/xcomposer2-vl.py index b4263b5..7e718bc 100644 --- a/backend/xcomposer2-vl.py +++ b/backend/xcomposer2-vl.py @@ -31,6 +31,7 @@ class InternLMXComposer2QForCausalLM(auto_gptq.modeling.BaseGPTQForCausalLM): ] class VisionQnA(VisionQnABase): + format: str = 'internal' model_name: str = "xcomposer2-vl" vision_layers: List[str] = ['vit', 'vision_proj'] diff --git a/backend/xcomposer2.py b/backend/xcomposer2.py index afc404d..8ca90c0 100644 --- a/backend/xcomposer2.py +++ b/backend/xcomposer2.py @@ -26,6 +26,7 @@ class InternLMXComposer2QForCausalLM(auto_gptq.modeling.BaseGPTQForCausalLM): ] class VisionQnA(VisionQnABase): + format: str = 'internal' model_name: str = "xcomposer2" vision_layers: List[str] = ['vit', 'vision_proj'] diff --git a/backend/yi-vl.py b/backend/yi-vl.py index eb31abf..31da906 100644 --- a/backend/yi-vl.py +++ b/backend/yi-vl.py @@ -18,7 +18,7 @@ class VisionQnA(VisionQnABase): model_name: str = "qwen-vl" - format: 'chatml' + format: str = 'chatml' def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): super().__init__(model_id, device, device_map, extra_params, format) From 404e97c6c7a62ba708715b0506ac9bfd0992aa4e Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 15 Jul 2024 02:27:51 -0400 Subject: [PATCH 07/13] pin transformers --- Dockerfile | 2 +- backend/internvl-chat-v1-5.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index df1177d..dbadc85 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ RUN git clone https://github.com/togethercomputer/Dragonfly --single-branch /app COPY requirements.txt . ARG VERSION=latest -RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers>=4.41.2\nautoawq>=0.2.5" >> requirements.txt ; fi +RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers==4.41.2\nautoawq>=0.2.5" >> requirements.txt ; fi # TODO: nvidia apex wheel RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt diff --git a/backend/internvl-chat-v1-5.py b/backend/internvl-chat-v1-5.py index 47bc423..11d3baa 100644 --- a/backend/internvl-chat-v1-5.py +++ b/backend/internvl-chat-v1-5.py @@ -111,7 +111,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p self.max_tiles = extra_params.get('max_tiles', MAX_TILES) - use_fast = False if '40b' in model_id.lowe() else True + use_fast = False if '40b' in model_id.lower() else True self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False), use_fast=use_fast) self.model = AutoModel.from_pretrained(**self.params).eval() From 40892cffd91e3dff2a762658c5955d922d457204 Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 15 Jul 2024 02:41:44 -0400 Subject: [PATCH 08/13] wip --- backend/internvl-chat-v1-5.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/internvl-chat-v1-5.py b/backend/internvl-chat-v1-5.py index 11d3baa..e337c6c 100644 --- a/backend/internvl-chat-v1-5.py +++ b/backend/internvl-chat-v1-5.py @@ -111,11 +111,13 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p self.max_tiles = extra_params.get('max_tiles', MAX_TILES) - use_fast = False if '40b' in model_id.lower() else True - self.tokenizer = AutoTokenizer.from_pretrained(model_id, 
trust_remote_code=self.params.get('trust_remote_code', False), use_fast=use_fast) + self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) self.model = AutoModel.from_pretrained(**self.params).eval() self.model.img_context_token_id = self.tokenizer.convert_tokens_to_ids('') + print(f" = {self.tokenizer.convert_tokens_to_ids('')}") + print(f" = {self.tokenizer.convert_tokens_to_ids('')}") + print(f" = {self.tokenizer.convert_tokens_to_ids('')}") self.eos_token = '<|end|>' if self.format == 'phintern' else '<|im_end|>' From 9651c1f34ef7343977bb8923376684c305c89c5d Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 15 Jul 2024 03:19:59 -0400 Subject: [PATCH 09/13] wip --- backend/internvl-chat-v1-5.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/backend/internvl-chat-v1-5.py b/backend/internvl-chat-v1-5.py index e337c6c..cfcdeec 100644 --- a/backend/internvl-chat-v1-5.py +++ b/backend/internvl-chat-v1-5.py @@ -115,9 +115,6 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p self.model = AutoModel.from_pretrained(**self.params).eval() self.model.img_context_token_id = self.tokenizer.convert_tokens_to_ids('') - print(f" = {self.tokenizer.convert_tokens_to_ids('')}") - print(f" = {self.tokenizer.convert_tokens_to_ids('')}") - print(f" = {self.tokenizer.convert_tokens_to_ids('')}") self.eos_token = '<|end|>' if self.format == 'phintern' else '<|im_end|>' @@ -130,10 +127,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: - if self.format == 'phintern': - images, prompt = await phintern_prompt_from_messages(request.messages, img_tok='') - else: - images, prompt = await chatml_prompt_from_messages(request.messages, img_tok='') + images, prompt = await prompt_from_messages(request.messages) # TODO: use detail to set max tiles if detail=low (=512) # if .detail == 'low': max_num=1 @@ -146,11 +140,11 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener pixel_values = None if pixel_values is not None: - image_tokens = '' + '' * self.model.num_image_token * pixel_values.shape[0] + '\n' - else: - image_tokens = '' + for img in images: + image_tokens = '' + '' * self.model.num_image_token * img.size(0) + '' + prompt = prompt.replace('', image_tokens, 1) - model_inputs = self.tokenizer(image_tokens + prompt, return_tensors='pt') + model_inputs = self.tokenizer(prompt, return_tensors='pt') input_ids = model_inputs['input_ids'].cuda() attention_mask = model_inputs['attention_mask'].cuda() From 1fdc9d3202bcdbeb2a1bb98aa9ce4175c1dc18ce Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 15 Jul 2024 03:21:56 -0400 Subject: [PATCH 10/13] wip --- backend/internvl-chat-v1-5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/internvl-chat-v1-5.py b/backend/internvl-chat-v1-5.py index cfcdeec..26cdb4d 100644 --- a/backend/internvl-chat-v1-5.py +++ b/backend/internvl-chat-v1-5.py @@ -127,7 +127,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: - images, prompt = await prompt_from_messages(request.messages) + images, prompt = await prompt_from_messages(request.messages, self.format) # TODO: use detail to set max tiles if detail=low (=512) # if .detail == 'low': 
max_num=1 From 0e5d9457333067c0ea99ef9e1055785866123240 Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 15 Jul 2024 04:51:33 -0400 Subject: [PATCH 11/13] wip --- model_conf_tests.alt.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/model_conf_tests.alt.json b/model_conf_tests.alt.json index d8ec19a..6a4ed97 100644 --- a/model_conf_tests.alt.json +++ b/model_conf_tests.alt.json @@ -1,5 +1,4 @@ [ - ["OpenGVLab/InternVL2-4B"], ["YanweiLi/MGM-7B", "--load-in-4bit", "--use-flash-attn"], ["YanweiLi/MGM-7B", "--use-flash-attn"], ["YanweiLi/MGM-7B-HD", "--load-in-4bit", "--use-flash-attn"], @@ -16,5 +15,9 @@ ["YanweiLi/MGM-8x7B", "--use-flash-attn"], ["YanweiLi/MGM-8x7B-HD", "--load-in-4bit", "--use-flash-attn"], ["YanweiLi/MGM-8x7B-HD", "--use-flash-attn"], + ["internlm/internlm-xcomposer2-4khd-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"], + ["internlm/internlm-xcomposer2-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"], + ["internlm/internlm-xcomposer2-vl-1_8b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"], + ["internlm/internlm-xcomposer2-vl-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"], ["vikhyatk/moondream1"] ] From 72b248ee4fe60e61b9a3fac52ae1bbba731b7f5c Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 15 Jul 2024 05:13:34 -0400 Subject: [PATCH 12/13] wip --- README.md | 7 +++---- model_conf_tests.alt.json | 4 ---- model_conf_tests.json | 24 ++++++++++++------------ 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 6ef48f5..5e50a7a 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview` ## Model support - [X] [OpenGVLab](https://huggingface.co/OpenGVLab) -- - [ ] [InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B) (currently errors) +- - [X] [InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B) - - [X] [InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B) - - [X] [InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B) - - [X] [InternVL2-4B](https://huggingface.co/OpenGVLab/InternVL2-4B) (alternate docker only) @@ -49,7 +49,7 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview` - - [X] [360VL-8B](https://huggingface.co/qihoo360/360VL-8B) - - [X] [360VL-70B](https://huggingface.co/qihoo360/360VL-70B) (untested) - [X] [LlavaNext](https://huggingface.co/llava-hf) -- - [X] [llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) (currently errors, use an image before 0.26.0) +- - [X] [llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) - - [X] [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) - - [X] [llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) - - [X] [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) @@ -115,8 +115,7 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le Version 0.27.0 -- new model support: OpenGVLab/InternVL2 series of models (40B still has errors, 4B requires alternate docker image) -- +- new model support: OpenGVLab/InternVL2 series of models (1B, 2B, 4B, 8B*, 26B*, 40B*) - *(current top open source models) Version 0.26.0 diff --git a/model_conf_tests.alt.json b/model_conf_tests.alt.json index 6a4ed97..52cf249 100644 --- a/model_conf_tests.alt.json +++ b/model_conf_tests.alt.json @@ -15,9 +15,5 @@ 
["YanweiLi/MGM-8x7B", "--use-flash-attn"], ["YanweiLi/MGM-8x7B-HD", "--load-in-4bit", "--use-flash-attn"], ["YanweiLi/MGM-8x7B-HD", "--use-flash-attn"], - ["internlm/internlm-xcomposer2-4khd-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"], - ["internlm/internlm-xcomposer2-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"], - ["internlm/internlm-xcomposer2-vl-1_8b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"], - ["internlm/internlm-xcomposer2-vl-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"], ["vikhyatk/moondream1"] ] diff --git a/model_conf_tests.json b/model_conf_tests.json index ad86b0a..21080cb 100644 --- a/model_conf_tests.json +++ b/model_conf_tests.json @@ -25,18 +25,18 @@ ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"], ["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"], - ["OpenGVLab/InternVL2-1B", "--load-in-4bit"], - ["OpenGVLab/InternVL2-1B"], - ["OpenGVLab/InternVL2-2B", "--load-in-4bit"], - ["OpenGVLab/InternVL2-2B"], - ["OpenGVLab/InternVL2-4B", "--load-in-4bit"], - ["OpenGVLab/InternVL2-4B"], - ["OpenGVLab/InternVL2-8B", "--load-in-4bit"], - ["OpenGVLab/InternVL2-8B"], - ["OpenGVLab/InternVL2-26B", "--load-in-4bit"], - ["OpenGVLab/InternVL2-26B"], - ["OpenGVLab/InternVL2-40B", "--load-in-4bit"], - ["OpenGVLab/InternVL2-40B"], + ["OpenGVLab/InternVL2-1B", "--device-map", "cuda:0", "--load-in-4bit"], + ["OpenGVLab/InternVL2-1B", "--device-map", "cuda:0"], + ["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0", "--load-in-4bit"], + ["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0"], + ["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0", "--load-in-4bit"], + ["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0"], + ["OpenGVLab/InternVL2-8B", "--device-map", "cuda:0", "--load-in-4bit"], + ["OpenGVLab/InternVL2-8B", "--device-map", "cuda:0"], + ["OpenGVLab/InternVL2-26B", "--device-map", "cuda:0", "--load-in-4bit"], + ["OpenGVLab/InternVL2-26B", "--device-map", "cuda:0"], + ["OpenGVLab/InternVL2-40B", "--device-map", "cuda:0", "--load-in-4bit"], + ["OpenGVLab/InternVL2-40B", "--device-map", "cuda:0"], ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--load-in-4bit"], ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40", "--load-in-4bit"], ["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40"], From 719c0b52ee9d9e7c3260d1d08488aa1571815550 Mon Sep 17 00:00:00 2001 From: matatonic Date: Mon, 15 Jul 2024 07:32:17 -0400 Subject: [PATCH 13/13] wip --- vision-alt.sample.env | 60 ++--------- vision.sample.env | 234 ++++++++++++++++++++++-------------------- 2 files changed, 133 insertions(+), 161 deletions(-) diff --git a/vision-alt.sample.env b/vision-alt.sample.env index e4838b4..0f01d86 100644 --- a/vision-alt.sample.env +++ b/vision-alt.sample.env @@ -2,54 +2,14 @@ # Copy this file to vision.env and uncomment the model of your choice. HF_HOME=hf_home HF_HUB_ENABLE_HF_TRANSFER=1 +#HF_TOKEN=hf-... #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 14.3s, mem: 52.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.1s, mem: 18.2GB, 2/8 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.0s, mem: 31.3GB, 8/8 tests passed. 
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 3.5s, mem: 7.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 4.5s, mem: 19.5GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 20.5s, mem: 12.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 14.7s, mem: 37.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 19.2s, mem: 12.2GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 13.2s, mem: 36.4GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 22.2s, mem: 40.8GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 64.7s, mem: 40.8GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --load-in-4bit --use-flash-attn" # test pass✅, time: 31.6s, mem: 10.2GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass✅, time: 22.4s, mem: 27.9GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 26.4s, mem: 14.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass✅, time: 20.7s, mem: 31.9GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --load-in-4bit --use-flash-attn" # test pass✅, time: 16.7s, mem: 21.7GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass✅, time: 11.0s, mem: 67.4GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 180.1s, mem: 24.4GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass✅, time: 118.0s, mem: 70.5GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --load-in-4bit --use-flash-attn" # test pass✅, time: 11.4s, mem: 6.7GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass✅, time: 5.5s, mem: 15.6GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 35.4s, mem: 9.7GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass✅, time: 15.4s, mem: 18.6GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --load-in-4bit --use-flash-attn" # test pass✅, time: 22.7s, mem: 26.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass✅, time: 14.7s, mem: 91.2GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 24.4s, mem: 29.3GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass✅, time: 18.4s, mem: 95.9GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.1s, mem: 24.8GB, 8/8 tests passed. 
-#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.1s, mem: 21.7GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 7.4s, mem: 21.6GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 17.9s, mem: 25.5GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.6s, mem: 18.8GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 11.2s, mem: 9.2GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.3s, mem: 7.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.8s, mem: 20.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 10.7s, mem: 10.7GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.3s, mem: 5.8GB, 0/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 1.8s, mem: 15.5GB, 0/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.0s, mem: 8.8GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.7s, mem: 26.6GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.1s, mem: 5.3GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.5s, mem: 14.3GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.9s, mem: 7.4GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.6s, mem: 11.2GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 7.7s, mem: 7.5GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 5.2s, mem: 17.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass✅, time: 4.0s, mem: 4.6GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.1s, mem: 4.5GB, 8/8 tests passed. \ No newline at end of file +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass✅, time: 8.0s, mem: 15.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass✅, time: 23.9s, mem: 18.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass✅, time: 31.3s, mem: 27.7GB, 12/12 tests passed. 
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass✅, time: 27.9s, mem: 31.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass✅, time: 16.2s, mem: 67.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass✅, time: 147.6s, mem: 70.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass✅, time: 20.8s, mem: 91.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass✅, time: 26.6s, mem: 96.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass✅, time: 5.5s, mem: 4.9GB, 12/12 tests passed. diff --git a/vision.sample.env b/vision.sample.env index aab960b..11f81a1 100644 --- a/vision.sample.env +++ b/vision.sample.env @@ -4,114 +4,126 @@ HF_HOME=hf_home HF_HUB_ENABLE_HF_TRANSFER=1 #HF_TOKEN=hf-... #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 9.2s, mem: 8.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 8.2s, mem: 19.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.9s, mem: 9.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.9s, mem: 11.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 12.9s, mem: 9.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 9.4s, mem: 12.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 8.1s, mem: 12.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 10.2s, mem: 5.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 7.3s, mem: 12.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 10.1s, mem: 5.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.9s, mem: 13.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 11.2s, mem: 9.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 10.1s, mem: 19.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 29.8s, mem: 29.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 22.1s, mem: 71.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.2s, mem: 13.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.4s, mem: 22.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 13.5s, mem: 13.0GB, 12/12 tests passed. 
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 16.9s, mem: 13.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0" # test pass✅, time: 14.0s, mem: 22.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 15.3s, mem: 12.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 20.5s, mem: 27.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 27.4s, mem: 30.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 26.5s, mem: 54.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 20.4s, mem: 52.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 40.4s, mem: 31.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.3s, mem: 5.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 6.7s, mem: 7.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.4s, mem: 9.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.6s, mem: 7.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 10.4s, mem: 6.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 13.6s, mem: 12.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40" # test pass✅, time: 13.1s, mem: 15.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 9.9s, mem: 11.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.8s, mem: 11.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.5s, mem: 19.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 10.6s, mem: 10.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 26.9s, mem: 13.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 22.0s, mem: 37.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 29.9s, mem: 12.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 22.1s, mem: 36.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 34.4s, mem: 22.5GB, 12/12 tests passed. 
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 33.1s, mem: 40.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 151.2s, mem: 22.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 104.0s, mem: 40.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 78.9s, mem: 16.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test pass✅, time: 53.7s, mem: 28.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.9s, mem: 11.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 9.2s, mem: 20.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.9s, mem: 7.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.1s, mem: 17.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.6s, mem: 8.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.1s, mem: 18.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 6.2s, mem: 8.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.5s, mem: 16.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 19.0s, mem: 25.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b --use-flash-attn --load-in-4bit" # test pass✅, time: 53.8s, mem: 49.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey --load-in-4bit" # test pass✅, time: 9.0s, mem: 15.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 8.6s, mem: 22.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 12.5s, mem: 15.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.2s, mem: 22.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn --load-in-4bit" # test pass✅, time: 11.4s, mem: 7.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn" # test pass✅, time: 9.5s, mem: 12.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.5s, mem: 7.3GB, 0/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 20.0s, mem: 20.7GB, 12/12 tests passed. 
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.8s, mem: 6.5GB, 0/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 28.8s, mem: 19.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 15.8s, mem: 9.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.9s, mem: 3.1GB, 0/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.5s, mem: 7.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.8s, mem: 6.6GB, 0/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 21.9s, mem: 20.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 14.9s, mem: 11.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 13.1s, mem: 9.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.5s, mem: 26.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.0s, mem: 5.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.2s, mem: 14.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn --load-in-4bit" # test fail❌, time: 3.9s, mem: 21.3GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test fail❌, time: 3.6s, mem: 66.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 23.6s, mem: 8.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 19.4s, mem: 17.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 15.8s, mem: 17.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 14.4s, mem: 34.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 13.7s, mem: 9.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 12.8s, mem: 19.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.0s, mem: 1.4GB, 12/12 tests passed. 
-#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.3s, mem: 1.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.7s, mem: 1.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.8s, mem: 2.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn --load-in-4bit" # test pass✅, time: 11.5s, mem: 7.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn" # test pass✅, time: 9.1s, mem: 12.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.4s, mem: 9.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test pass✅, time: 10.3s, mem: 19.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.8s, mem: 5.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 8.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.8s, mem: 5.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 10.8s, mem: 9.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 11.5s, mem: 8.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 8.8s, mem: 17.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.4s, mem: 8.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.5s, mem: 8.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 9.8s, mem: 8.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 10.8s, mem: 17.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 16.6s, mem: 17.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.8s, mem: 32.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 11.4s, mem: 8.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 10.1s, mem: 17.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 10.4s, mem: 8.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 14.4s, mem: 17.4GB, 12/12 tests passed. 
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn --load-in-4bit" # test pass✅, time: 7.4s, mem: 3.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 6.0s, mem: 4.8GB, 12/12 tests passed. \ No newline at end of file +#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 8.2s, mem: 8.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 7.6s, mem: 19.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.1s, mem: 9.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.6s, mem: 10.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 11.2s, mem: 8.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 8.3s, mem: 11.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.3s, mem: 12.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 8.7s, mem: 5.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.6s, mem: 12.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 9.2s, mem: 5.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.2s, mem: 13.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 9.6s, mem: 9.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 9.6s, mem: 19.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 25.0s, mem: 29.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 21.5s, mem: 71.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.7s, mem: 12.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.0s, mem: 22.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 12.7s, mem: 12.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.5s, mem: 12.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0" # test pass✅, time: 13.4s, mem: 22.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 14.1s, mem: 12.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 19.9s, mem: 26.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 26.2s, mem: 30.0GB, 12/12 tests passed. 
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 24.8s, mem: 54.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 17.9s, mem: 52.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 34.6s, mem: 31.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.6s, mem: 3.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 6.6s, mem: 4.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.8s, mem: 4.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.2s, mem: 6.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.6s, mem: 5.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 7.7s, mem: 10.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.1s, mem: 8.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 7.4s, mem: 18.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.4s, mem: 26.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 19.9s, mem: 52.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 55.6s, mem: 32.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 68.4s, mem: 77.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.1s, mem: 5.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 6.8s, mem: 6.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.7s, mem: 8.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.7s, mem: 7.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 9.0s, mem: 6.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 9.3s, mem: 9.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40" # test pass✅, time: 8.4s, mem: 14.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 7.4s, mem: 11.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 8.7s, mem: 11.1GB, 12/12 tests passed. 
+#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.0s, mem: 19.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 9.7s, mem: 10.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 24.9s, mem: 13.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 21.0s, mem: 37.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 25.9s, mem: 12.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 19.3s, mem: 36.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 31.5s, mem: 22.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 31.8s, mem: 40.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 135.0s, mem: 22.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 99.3s, mem: 40.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 70.9s, mem: 16.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test pass✅, time: 50.9s, mem: 27.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.1s, mem: 11.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.7s, mem: 20.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.1s, mem: 7.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.7s, mem: 17.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.9s, mem: 8.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 18.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 5.3s, mem: 8.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.0s, mem: 15.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 18.6s, mem: 25.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b --use-flash-attn --load-in-4bit" # test pass✅, time: 48.0s, mem: 49.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey --load-in-4bit" # test pass✅, time: 9.3s, mem: 15.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 8.8s, mem: 21.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 12.6s, mem: 15.7GB, 12/12 tests passed. 
+#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.2s, mem: 21.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn --load-in-4bit" # test pass✅, time: 10.5s, mem: 7.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn" # test pass✅, time: 9.0s, mem: 12.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.4s, mem: 7.2GB, 0/12 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 19.8s, mem: 20.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.4s, mem: 6.4GB, 0/12 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 25.7s, mem: 19.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 13.7s, mem: 9.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.7s, mem: 3.0GB, 0/12 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.6s, mem: 7.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.5s, mem: 6.5GB, 0/12 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 24.1s, mem: 20.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 13.5s, mem: 10.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.1s, mem: 9.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.2s, mem: 26.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.6s, mem: 5.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.9s, mem: 14.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 58.6s, mem: 23.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 69.8s, mem: 68.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 20.7s, mem: 8.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 18.6s, mem: 17.4GB, 12/12 tests passed. 
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 14.3s, mem: 17.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 13.9s, mem: 33.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 13.0s, mem: 9.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 12.6s, mem: 19.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.0s, mem: 1.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.5s, mem: 1.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.4s, mem: 1.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.7s, mem: 2.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn --load-in-4bit" # test pass✅, time: 10.2s, mem: 7.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn" # test pass✅, time: 8.5s, mem: 12.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.1s, mem: 9.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test pass✅, time: 10.5s, mem: 19.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.0s, mem: 4.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.2s, mem: 8.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.3s, mem: 3.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.7s, mem: 8.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 9.4s, mem: 8.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 7.6s, mem: 17.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.0s, mem: 7.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 8.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 9.0s, mem: 7.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.8s, mem: 17.2GB, 12/12 tests passed. 
+#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 14.6s, mem: 17.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.3s, mem: 32.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 10.9s, mem: 7.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.8s, mem: 17.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 9.6s, mem: 7.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 12.7s, mem: 17.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn --load-in-4bit" # test pass✅, time: 6.2s, mem: 2.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 5.0s, mem: 4.6GB, 12/12 tests passed.