Commit

Change igpu perf to mainly test int4+fp16
Oscilloscope98 committed Jul 5, 2024
1 parent 72b4efa commit c6088b2
Showing 11 changed files with 89 additions and 96 deletions.
148 changes: 75 additions & 73 deletions .github/workflows/llm_performance_tests.yml

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions python/llm/test/benchmark/igpu-perf/1024-128.yaml
@@ -3,10 +3,9 @@ repo_id:
- 'THUDM/glm-4-9b-chat'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'baichuan-inc/Baichuan2-13B-Chat'
# - '01-ai/Yi-6B'
- 'meta-llama/Llama-2-7b-chat-hf'
- 'meta-llama/Llama-2-13b-chat-hf'
# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
- 'meta-llama/Meta-Llama-3-8B-Instruct'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
1 change: 0 additions & 1 deletion python/llm/test/benchmark/igpu-perf/1024-128_437.yaml
@@ -1,7 +1,6 @@
repo_id:
- 'Qwen/Qwen1.5-7B-Chat'
- 'Qwen/Qwen2-7B-Instruct'
- 'meta-llama/Meta-Llama-3-8B-Instruct'
- 'microsoft/Phi-3-mini-4k-instruct'
local_model_hub: 'path to your local model hub'
warm_up: 1
6 changes: 2 additions & 4 deletions python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
@@ -1,13 +1,11 @@
repo_id:
- 'THUDM/chatglm3-6b'
- 'THUDM/glm-4-9b-chat'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'baichuan-inc/Baichuan2-13B-Chat'
# - '01-ai/Yi-6B'
- 'meta-llama/Llama-2-7b-chat-hf'
- 'meta-llama/Llama-2-13b-chat-hf'
# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
# - 'RWKV/rwkv-4-world-7b'
# - 'RWKV/rwkv-5-world-7b'
- 'meta-llama/Meta-Llama-3-8B-Instruct'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml
@@ -1,6 +1,7 @@
repo_id:
- 'Qwen/Qwen1.5-7B-Chat'
- 'meta-llama/Meta-Llama-3-8B-Instruct'
- 'Qwen/Qwen2-7B-Instruct'
- 'microsoft/Phi-3-mini-4k-instruct'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
@@ -3,10 +3,9 @@ repo_id:
- 'THUDM/glm-4-9b-chat'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'baichuan-inc/Baichuan2-13B-Chat'
# - '01-ai/Yi-6B'
- 'meta-llama/Llama-2-7b-chat-hf'
- 'meta-llama/Llama-2-13b-chat-hf'
# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
- 'meta-llama/Meta-Llama-3-8B-Instruct'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
@@ -16,5 +15,5 @@ batch_size: 1 # default to 1
in_out_pairs:
- '1024-128'
test_api:
- "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
- "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_437.yaml
@@ -1,7 +1,6 @@
repo_id:
- 'Qwen/Qwen1.5-7B-Chat'
- 'Qwen/Qwen2-7B-Instruct'
- 'meta-llama/Meta-Llama-3-8B-Instruct'
- 'microsoft/Phi-3-mini-4k-instruct'
local_model_hub: 'path to your local model hub'
warm_up: 1
@@ -12,5 +11,5 @@ batch_size: 1 # default to 1
in_out_pairs:
- '1024-128'
test_api:
- "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
- "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
python/llm/test/benchmark/igpu-perf/2048-256.yaml
@@ -3,10 +3,9 @@ repo_id:
- 'THUDM/glm-4-9b-chat'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'baichuan-inc/Baichuan2-13B-Chat'
# - '01-ai/Yi-6B'
- 'meta-llama/Llama-2-7b-chat-hf'
- 'meta-llama/Llama-2-13b-chat-hf'
# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
- 'meta-llama/Meta-Llama-3-8B-Instruct'
local_model_hub: 'path to your local model hub'
warm_up: 1
num_trials: 3
@@ -16,5 +15,5 @@ batch_size: 1 # default to 1
in_out_pairs:
- '2048-256'
test_api:
- "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
- "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
python/llm/test/benchmark/igpu-perf/2048-256_437.yaml
@@ -1,7 +1,6 @@
repo_id:
- 'Qwen/Qwen1.5-7B-Chat'
- 'Qwen/Qwen2-7B-Instruct'
- 'meta-llama/Meta-Llama-3-8B-Instruct'
- 'microsoft/Phi-3-mini-4k-instruct'
local_model_hub: 'path to your local model hub'
warm_up: 1
@@ -12,5 +11,5 @@ batch_size: 1 # default to 1
in_out_pairs:
- '2048-256'
test_api:
- "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
- "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
python/llm/test/benchmark/igpu-perf/32-32.yaml
@@ -3,10 +3,9 @@ repo_id:
- 'THUDM/glm-4-9b-chat'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'baichuan-inc/Baichuan2-13B-Chat'
# - '01-ai/Yi-6B'
- 'meta-llama/Llama-2-7b-chat-hf'
- 'meta-llama/Llama-2-13b-chat-hf'
# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
- 'meta-llama/Meta-Llama-3-8B-Instruct'
local_model_hub: 'path to your local model hub'
warm_up: 3
num_trials: 5
@@ -16,5 +15,5 @@ batch_size: 1 # default to 1
in_out_pairs:
- '32-32'
test_api:
- "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
- "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
python/llm/test/benchmark/igpu-perf/32-32_437.yaml
@@ -1,7 +1,6 @@
repo_id:
- 'Qwen/Qwen1.5-7B-Chat'
- 'Qwen/Qwen2-7B-Instruct'
- 'meta-llama/Meta-Llama-3-8B-Instruct'
- 'microsoft/Phi-3-mini-4k-instruct'
local_model_hub: 'path to your local model hub'
warm_up: 3
@@ -12,5 +11,5 @@ batch_size: 1 # default to 1
in_out_pairs:
- '32-32'
test_api:
- "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
- "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
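All of the YAML hunks above edit the same benchmark config layout. For reference, here is a minimal sketch of how 32-32.yaml would read after this commit, assembled from the visible hunk lines. The 'THUDM/chatglm3-6b' entry and the num_beams/low_bit keys fall outside the visible hunks and are assumptions based on the sibling 1024-128_int4_fp16.yaml list and the usual all-in-one benchmark config; local_model_hub is a placeholder.

```yaml
repo_id:
  - 'THUDM/chatglm3-6b'   # assumed: sits above the visible hunk, as in the sibling int4_fp16 list
  - 'THUDM/glm-4-9b-chat'
  - 'baichuan-inc/Baichuan2-7B-Chat'
  - 'baichuan-inc/Baichuan2-13B-Chat'
  - 'meta-llama/Llama-2-7b-chat-hf'
  - 'meta-llama/Llama-2-13b-chat-hf'
  - 'meta-llama/Meta-Llama-3-8B-Instruct'
local_model_hub: 'path to your local model hub'   # placeholder: directory holding the checkpoints
warm_up: 3      # untimed warm-up runs per model (3 for the short 32-32 pair, 1 for the longer pairs)
num_trials: 5   # timed runs per model (5 here, 3 for the longer pairs)
num_beams: 1    # assumed: greedy search, per the usual all-in-one config
low_bit: 'sym_int4'   # assumed: symmetric int4 weight quantization
batch_size: 1   # default to 1
in_out_pairs:
  - '32-32'     # 32 input tokens, 32 output tokens
test_api:
  - "transformer_int4_fp16_gpu_win"   # int4 weights with fp16 compute on Intel GPU for Windows (catch GPU peak memory)
cpu_embedding: True   # whether to put embedding on CPU (only available now for gpu win related test_api)
```

Across these configs the net change is uniform: each plain transformer_int4_*_gpu_win test_api is swapped for its *_fp16 counterpart, in line with the commit title.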
