diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index cfaa1f973f1..9bc4f574e47 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -42,6 +42,8 @@
 
 LLAVA_IDS = ['liuhaotian/llava-v1.5-7b']
 
+PHI3VISION_IDS = ['microsoft/phi-3-vision-128k-instruct']
+
 results = []
 excludes = []
 
@@ -914,6 +916,13 @@ def run_transformer_int4_gpu_win(repo_id,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in PHI3VISION_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
+                                                     _attn_implementation="eager",
+                                                     modules_to_not_convert=["vision_embed_tokens"],
+                                                     trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1021,6 +1030,14 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
                                                      torch_dtype=torch.float16).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in PHI3VISION_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
+                                                     _attn_implementation="eager",
+                                                     modules_to_not_convert=["vision_embed_tokens"],
+                                                     trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
+                                                     torch_dtype=torch.float16).eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                      trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
@@ -1125,6 +1142,13 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
         model = model.to('xpu')
+    elif repo_id in PHI3VISION_IDS:
+        model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
+                                                  _attn_implementation="eager",
+                                                  modules_to_not_convert=["vision_embed_tokens"],
+                                                  use_cache=True, cpu_embedding=cpu_embedding).eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
+        model = model.to('xpu')
     else:
         model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1228,6 +1252,13 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
         model = model.half().to('xpu')
+    elif repo_id in PHI3VISION_IDS:
+        model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
+                                                  _attn_implementation="eager",
+                                                  modules_to_not_convert=["vision_embed_tokens"],
+                                                  use_cache=True, cpu_embedding=cpu_embedding).eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
+        model = model.half().to('xpu')
     else:
         model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
                                                   use_cache=True, cpu_embedding=cpu_embedding).eval()
diff --git a/python/llm/dev/benchmark/all-in-one/save.py b/python/llm/dev/benchmark/all-in-one/save.py
index 48aa3d9802e..4f0ae1d3c41 100644
--- a/python/llm/dev/benchmark/all-in-one/save.py
+++ b/python/llm/dev/benchmark/all-in-one/save.py
@@ -23,7 +23,7 @@
 import sys
 import gc
 
-from run import LLAMA_IDS, CHATGLM_IDS, LLAVA_IDS, get_model_path
+from run import LLAMA_IDS, CHATGLM_IDS, LLAVA_IDS, PHI3VISION_IDS, get_model_path
 
 current_dir = os.path.dirname(os.path.realpath(__file__))
 
@@ -51,6 +51,12 @@ def save_model_in_low_bit(repo_id,
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
                                                      trust_remote_code=True, use_cache=True).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    elif repo_id in PHI3VISION_IDS:
+        model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
+                                                     _attn_implementation="eager",
+                                                     modules_to_not_convert=["vision_embed_tokens"],
+                                                     trust_remote_code=True, use_cache=True).eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
                                                      trust_remote_code=True, use_cache=True).eval()
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
index 8a6ea0f4563..b51c9fac65d 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
@@ -1,4 +1,6 @@
 repo_id:
+  - 'openbmb/MiniCPM-1B-sft-bf16'
+  - 'openbmb/MiniCPM-2B-sft-bf16'
   - 'THUDM/chatglm3-6b'
   - 'THUDM/glm-4-9b-chat'
   - 'baichuan-inc/Baichuan2-7B-Chat'
@@ -7,8 +9,6 @@ repo_id:
   - 'meta-llama/Llama-2-13b-chat-hf'
   - 'meta-llama/Meta-Llama-3-8B-Instruct'
   - 'mistralai/Mistral-7B-Instruct-v0.2'
-  - 'openbmb/MiniCPM-1B-sft-bf16'
-  - 'openbmb/MiniCPM-2B-sft-bf16'
   - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5'
   - 'RWKV/v5-Eagle-7B-HF'
   - '01-ai/Yi-6B-Chat'
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_437.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_437.yaml
index 6019026ca4d..f191801c7dc 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_437.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_437.yaml
@@ -3,6 +3,7 @@ repo_id:
   - 'Qwen/Qwen2-7B-Instruct'
   - 'microsoft/Phi-3-mini-4k-instruct'
   - 'microsoft/Phi-3-mini-128k-instruct'
+  - 'microsoft/phi-3-vision-128k-instruct'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
index dde71704a92..f32b48c05f1 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
@@ -1,4 +1,6 @@
 repo_id:
+  - 'openbmb/MiniCPM-1B-sft-bf16'
+  - 'openbmb/MiniCPM-2B-sft-bf16'
   - 'THUDM/chatglm3-6b'
   - 'THUDM/glm-4-9b-chat'
   - 'baichuan-inc/Baichuan2-7B-Chat'
@@ -7,8 +9,6 @@ repo_id:
   - 'meta-llama/Llama-2-13b-chat-hf'
   - 'meta-llama/Meta-Llama-3-8B-Instruct'
   - 'mistralai/Mistral-7B-Instruct-v0.2'
-  - 'openbmb/MiniCPM-1B-sft-bf16'
-  - 'openbmb/MiniCPM-2B-sft-bf16'
   - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5'
   - '01-ai/Yi-6B-Chat'
 local_model_hub: 'path to your local model hub'
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml
index 12ccaa5d331..f9db9131ca3 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml
@@ -3,6 +3,7 @@ repo_id:
   - 'Qwen/Qwen2-7B-Instruct'
   - 'microsoft/Phi-3-mini-4k-instruct'
   - 'microsoft/Phi-3-mini-128k-instruct'
+  - 'microsoft/phi-3-vision-128k-instruct'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml
index 3ea3c609d7e..18e4ca5cbb1 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml
@@ -1,4 +1,6 @@
 repo_id:
+  - 'openbmb/MiniCPM-1B-sft-bf16'
+  - 'openbmb/MiniCPM-2B-sft-bf16'
   - 'THUDM/chatglm3-6b'
   - 'THUDM/glm-4-9b-chat'
   - 'baichuan-inc/Baichuan2-7B-Chat'
@@ -7,8 +9,6 @@ repo_id:
   - 'meta-llama/Llama-2-13b-chat-hf'
   - 'meta-llama/Meta-Llama-3-8B-Instruct'
   - 'mistralai/Mistral-7B-Instruct-v0.2'
-  - 'openbmb/MiniCPM-1B-sft-bf16'
-  - 'openbmb/MiniCPM-2B-sft-bf16'
   - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5'
   - '01-ai/Yi-6B-Chat'
 local_model_hub: 'path to your local model hub'
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_437.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_437.yaml
index 4401207c07c..abd17aaa1e2 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_437.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_437.yaml
@@ -3,6 +3,7 @@ repo_id:
   - 'Qwen/Qwen2-7B-Instruct'
   - 'microsoft/Phi-3-mini-4k-instruct'
   - 'microsoft/Phi-3-mini-128k-instruct'
+  - 'microsoft/phi-3-vision-128k-instruct'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
index 5a52ae16d91..2fc0ddb17dc 100644
--- a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
@@ -1,4 +1,6 @@
 repo_id:
+  - 'openbmb/MiniCPM-1B-sft-bf16'
+  - 'openbmb/MiniCPM-2B-sft-bf16'
   - 'THUDM/chatglm3-6b'
   - 'THUDM/glm-4-9b-chat'
   - 'baichuan-inc/Baichuan2-7B-Chat'
@@ -7,8 +9,6 @@ repo_id:
   - 'meta-llama/Llama-2-13b-chat-hf'
   - 'meta-llama/Meta-Llama-3-8B-Instruct'
   - 'mistralai/Mistral-7B-Instruct-v0.2'
-  - 'openbmb/MiniCPM-1B-sft-bf16'
-  - 'openbmb/MiniCPM-2B-sft-bf16'
   - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5'
   - '01-ai/Yi-6B-Chat'
 local_model_hub: 'path to your local model hub'
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_437.yaml b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_437.yaml
index f9ae8540cd1..fd4fbbfaec1 100644
--- a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_437.yaml
+++ b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_437.yaml
@@ -3,6 +3,7 @@ repo_id:
   - 'Qwen/Qwen2-7B-Instruct'
   - 'microsoft/Phi-3-mini-4k-instruct'
   - 'microsoft/Phi-3-mini-128k-instruct'
+  - 'microsoft/phi-3-vision-128k-instruct'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
index a03dfbf55a8..664b8cbbcc6 100644
--- a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
@@ -1,4 +1,6 @@
 repo_id:
+  - 'openbmb/MiniCPM-1B-sft-bf16'
+  - 'openbmb/MiniCPM-2B-sft-bf16'
   - 'THUDM/chatglm3-6b'
   - 'THUDM/glm-4-9b-chat'
   - 'baichuan-inc/Baichuan2-7B-Chat'
@@ -7,8 +9,6 @@ repo_id:
   - 'meta-llama/Llama-2-13b-chat-hf'
   - 'meta-llama/Meta-Llama-3-8B-Instruct'
   - 'mistralai/Mistral-7B-Instruct-v0.2'
-  - 'openbmb/MiniCPM-1B-sft-bf16'
-  - 'openbmb/MiniCPM-2B-sft-bf16'
   - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5'
   - '01-ai/Yi-6B-Chat'
 local_model_hub: 'path to your local model hub'
diff --git a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_437.yaml b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_437.yaml
index de32d305c4a..93fdc926e5f 100644
--- a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_437.yaml
+++ b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_437.yaml
@@ -3,6 +3,7 @@ repo_id:
   - 'Qwen/Qwen2-7B-Instruct'
   - 'microsoft/Phi-3-mini-4k-instruct'
   - 'microsoft/Phi-3-mini-128k-instruct'
+  - 'microsoft/phi-3-vision-128k-instruct'
 local_model_hub: 'path to your local model hub'
 warm_up: 3
 num_trials: 5
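
All five new phi-3-vision branches above load the model the same way: eager attention and the vision_embed_tokens module kept out of the low-bit conversion. For reference, a minimal standalone sketch of that load path follows; it assumes the ipex_llm.transformers wrapper this benchmark already imports, a hypothetical local model directory, and 'sym_int4' as the low-bit format (the benchmark normally receives low_bit from the YAML config).

    # Minimal load sketch for microsoft/phi-3-vision-128k-instruct (assumptions noted above).
    from transformers import AutoTokenizer
    from ipex_llm.transformers import AutoModelForCausalLM

    model_path = '/path/to/local/phi-3-vision-128k-instruct'  # hypothetical local directory
    low_bit = 'sym_int4'                                       # assumed low-bit format

    # Same arguments as the PHI3VISION_IDS branches in run.py/save.py: eager attention,
    # with the vision tower (vision_embed_tokens) excluded from low-bit conversion.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 load_in_low_bit=low_bit,
                                                 _attn_implementation="eager",
                                                 modules_to_not_convert=["vision_embed_tokens"],
                                                 trust_remote_code=True,
                                                 use_cache=True).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = model.to('xpu')  # move to the Intel GPU, as the *_gpu_win runners do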