Modify the check_results.py to support batch 2&4 #11133

Status: Merged (45 commits, Jun 5, 2024). Changes shown from 27 commits.

Commits
a353493 add batch 2&4 and exclude to perf_test (MeouSker77, May 24, 2024)
97b2b2f modify the perf-test&437 yaml (MeouSker77, May 24, 2024)
e366b6f modify llm_performance_test.yml (MeouSker77, May 24, 2024)
b567979 remove batch 4 (MeouSker77, May 24, 2024)
114d6d2 modify check_results.py to support batch 2&4 (MeouSker77, May 24, 2024)
d4aead4 change the batch_size format (MeouSker77, May 27, 2024)
ecbf7b0 remove genxir (MeouSker77, May 27, 2024)
8dff59a add str(batch_size) (MeouSker77, May 27, 2024)
ea9f99c change actual_test_casese in check_results file to support batch_size (MeouSker77, May 27, 2024)
b7f6f0a change html highlight (MeouSker77, May 28, 2024)
0fdd6ab less models to test html and html_path (MeouSker77, May 28, 2024)
fdd6c1f delete the moe model (MeouSker77, May 28, 2024)
f6e99b6 split batch html (MeouSker77, May 28, 2024)
8874b9f split (MeouSker77, May 28, 2024)
9b47f56 use installing from pypi (MeouSker77, May 28, 2024)
ce0cf62 use installing from pypi - batch2 (MeouSker77, May 28, 2024)
bdd90b1 revert cpp (MeouSker77, May 28, 2024)
94317bd revert cpp (MeouSker77, May 28, 2024)
b656909 merge two jobs into one, test batch_size in one job (MeouSker77, May 28, 2024)
c2acf65 merge two jobs into one, test batch_size in one job (MeouSker77, May 28, 2024)
7452afb change file directory in workflow (MeouSker77, May 29, 2024)
73e7a37 try catch deal with odd file without batch_size (MeouSker77, May 29, 2024)
11cb3f6 modify pandas version (MeouSker77, May 29, 2024)
b77dbc9 change the dir (MeouSker77, May 29, 2024)
f1e6271 organize the code (MeouSker77, May 30, 2024)
b210316 organize the code (MeouSker77, May 30, 2024)
28baf78 remove Qwen-MOE (MeouSker77, May 30, 2024)
c28750a modify based on feedback (MeouSker77, Jun 3, 2024)
b3d6d6a modify based on feedback (MeouSker77, Jun 3, 2024)
d54f94b modify based on second round of feedback (MeouSker77, Jun 3, 2024)
52cd168 modify based on second round of feedback + change run-arc.sh mode (MeouSker77, Jun 3, 2024)
ddaafa6 modify based on second round of feedback + revert config (MeouSker77, Jun 3, 2024)
cf4b9c1 modify based on second round of feedback + revert config (MeouSker77, Jun 3, 2024)
5d29cfb modify based on second round of feedback + remove comments (MeouSker77, Jun 3, 2024)
8c20117 modify based on second round of feedback + remove comments (MeouSker77, Jun 3, 2024)
188080b modify based on second round of feedback + revert arc-perf-test (MeouSker77, Jun 3, 2024)
2d054c4 modify based on third round of feedback (MeouSker77, Jun 3, 2024)
883bcff change error type (MeouSker77, Jun 4, 2024)
e37b806 change error type (MeouSker77, Jun 4, 2024)
441d48b modify check_results.html (MeouSker77, Jun 4, 2024)
ed5e0ce split batch into two folders (MeouSker77, Jun 4, 2024)
14b9e97 add all models (MeouSker77, Jun 5, 2024)
5374ba9 move csv_name (MeouSker77, Jun 5, 2024)
a937507 revert pr test (MeouSker77, Jun 5, 2024)
1b2600d revert pr test (MeouSker77, Jun 5, 2024)
79 changes: 59 additions & 20 deletions .github/workflows/llm_performance_tests.yml
@@ -10,15 +10,15 @@ permissions:

# Controls when the action will run.
on:
schedule:
- cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
# schedule:
# - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
# please uncomment it for PR tests
# pull_request:
# branches: [main]
# paths:
# - ".github/workflows/llm_performance_tests.yml"
# - "python/llm/test/benchmark/**"
# - "python/llm/dev/benchmark/all-in-one/**"
pull_request:
branches: [main]
paths:
- ".github/workflows/llm_performance_tests.yml"
- "python/llm/test/benchmark/**"
- "python/llm/dev/benchmark/all-in-one/**"
workflow_dispatch:
workflow_call:

@@ -28,7 +28,7 @@ jobs:
# uses: ./.github/workflows/llm-binary-build.yml

llm-performance-test-on-arc:
if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
# if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
# needs: llm-cpp-build # please uncomment it for PR tests
strategy:
fail-fast: false
@@ -75,11 +75,11 @@ jobs:
shell: bash
run: |
pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
test_version_date=`date -d 'yesterday' '+%Y%m%d'`
if ! pip show ipex-llm | grep $test_version_date; then
echo "Did not install ipex-llm with excepted version $test_version_date"
exit 1
fi
# test_version_date=`date -d 'yesterday' '+%Y%m%d'`
# if ! pip show ipex-llm | grep $test_version_date; then
# echo "Did not install ipex-llm with excepted version $test_version_date"
# exit 1
# fi

- name: Test installed xpu version
shell: bash
@@ -95,13 +95,21 @@ jobs:
source /opt/intel/oneapi/setvars.sh
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
# batch_size 1
cp python/llm/test/benchmark/arc-perf-test.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
mkdir test_batch1
mkdir test_batch2
# hide time info
sed -i 's/str(end - st)/"xxxxxx"/g' run.py
# change csv name
sed -i 's/{today}/{today}_test1/g' run.py
python run.py
# batch_size 2
cd ../../../../../
cp python/llm/test/benchmark/arc-perf-test-batch2.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
python run.py

- name: Test on xpu(transformers==4.37.0)
shell: bash
@@ -111,33 +111,64 @@ jobs:
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
# upgrade transformers for model Qwen/Qwen1.5-7B-Chat
python -m pip install transformers==4.37.0
# batch_size 1
cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
# change csv name
sed -i 's/test1/test2/g' run.py
python run.py
# batch_size 2
cd ../../../../../
cp python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
python run.py

- name: Concat csv and generate html
shell: bash
run: |
cd python/llm/dev/benchmark/all-in-one
python ../../../test/benchmark/concat_csv.py
# batch_size 1
cd python/llm/dev/benchmark/all-in-one/test_batch1
python ../../../../test/benchmark/concat_csv.py
for file in *.csv; do
if [[ $file != *test* ]]; then
cp "$file" $CSV_SAVE_PATH
fi
done
python -m pip install pandas==1.5.3
cd ../../../test/benchmark
cd ../../../../test/benchmark
python csv_to_html.py -f $CSV_SAVE_PATH
# batch_size 2
cd ../../../../
cd python/llm/dev/benchmark/all-in-one/test_batch2
python ../../../../test/benchmark/concat_csv.py
for file in *.csv; do
if [[ $file != *test* ]]; then
cp "$file" $CSV_SAVE_PATH
fi
done
cd ../../../../test/benchmark
python csv_to_html.py -f $CSV_SAVE_PATH

- name: Check and upload results to ftp
shell: bash
run: |
cd python/llm/dev/benchmark/all-in-one
python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml
python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-437.yaml
# batch_size 1
cd python/llm/dev/benchmark/all-in-one/test_batch1
python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-test.yaml
python ../../../../test/benchmark/check_results.py -c test2 -y ../../../../test/benchmark/arc-perf-transformers-437.yaml
find . -name "*test*.csv" -delete
cd ../
rm -r test_batch1
if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then
curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/
fi
# batch_size 2
cd test_batch2
python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-test-batch2.yaml
python ../../../../test/benchmark/check_results.py -c test2 -y ../../../../test/benchmark/arc-perf-transformers-437-batch2.yaml
find . -name "*test*.csv" -delete
cd ../
rm -r test_batch2
if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then
curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/
fi
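For orientation, the reworked workflow boils down to: run the benchmark once per batch size, with a dedicated config yaml and a dedicated test_batchN results folder each time. Below is a minimal Python sketch of that loop, assuming the repository layout shown in the diff; the `run_batches` helper itself is hypothetical, not part of this PR:

```python
import shutil
import subprocess
from pathlib import Path

# Per-batch config yamls, as named in this PR's diff.
BATCH_CONFIGS = {
    1: ["arc-perf-test.yaml", "arc-perf-transformers-437.yaml"],
    2: ["arc-perf-test-batch2.yaml", "arc-perf-transformers-437-batch2.yaml"],
}

def run_batches(repo_root: Path) -> None:
    bench = repo_root / "python/llm/dev/benchmark/all-in-one"
    for batch, yamls in BATCH_CONFIGS.items():
        # run.py writes its csv into test_batch{N} (see the run.py diff below)
        (bench / f"test_batch{batch}").mkdir(exist_ok=True)
        for name in yamls:
            shutil.copy(repo_root / "python/llm/test/benchmark" / name,
                        bench / "config.yaml")
            subprocess.run(["python", "run.py"], cwd=bench, check=True)
```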
3 changes: 2 additions & 1 deletion python/llm/dev/benchmark/all-in-one/config.yaml
@@ -7,7 +7,8 @@ warm_up: 1 # must set >=2 when run "pipeline_parallel_gpu" test_api
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
batch_size: 1 # default to 1
batch_size:
- 1 # default to 1
Contributor review comment: revert this?

in_out_pairs:
- '32-32'
- '1024-128'
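The review comment above is about backward compatibility: older yamls (and the all-in-one default) used the scalar form `batch_size: 1`, while this PR moves to a list. One way run.py could accept both forms, sketched under that assumption rather than taken from this PR:

```python
from omegaconf import OmegaConf

def batch_sizes(conf):
    """Normalize batch_size: accept the legacy scalar form
    (batch_size: 1) as well as the new list form (batch_size: [1, 2])."""
    value = conf["batch_size"]
    if isinstance(value, (int, str)):
        return [int(value)]              # legacy scalar yaml
    return [int(v) for v in value]       # new list yaml

conf = OmegaConf.load("config.yaml")
for batch_size in batch_sizes(conf):
    print(f"would run benchmark with batch_size={batch_size}")
```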
26 changes: 14 additions & 12 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -1822,18 +1822,20 @@ def run_pipeline_parallel_gpu(repo_id,

import pandas as pd
for api in conf.test_api:
global csv_name
csv_name = f'{current_dir}/{api}-results-{today}.csv'
for model in conf.repo_id:
in_out_pairs = conf['in_out_pairs'].copy()
if excludes:
for in_out in conf['in_out_pairs']:
model_id_input = model + ':' + in_out.split('-')[0]
model_id_input_batch_size = model_id_input + ':' + str(conf['batch_size'])
if model_id_input in excludes or model_id_input_batch_size in excludes:
in_out_pairs.remove(in_out)
run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
conf['low_bit'], conf['cpu_embedding'], conf['batch_size'], streaming, use_fp16_torch_dtype, n_gpu)
for batch_size in conf["batch_size"]:
global csv_name
batch = str(batch_size)
csv_name = f'{current_dir}/test_batch{batch}/{api}-results-{today}-batch-{batch}.csv'
Contributor review comment: better not add a folder in this file.

for model in conf.repo_id:
in_out_pairs = conf['in_out_pairs'].copy()
if excludes:
for in_out in conf['in_out_pairs']:
model_id_input = model + ':' + in_out.split('-')[0]
model_id_input_batch_size = model_id_input + ':' + str(batch_size)
if model_id_input in excludes or model_id_input_batch_size in excludes:
in_out_pairs.remove(in_out)
run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, n_gpu)
df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])
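To make the new exclude semantics concrete: with the batch loop in place, an exclude entry can match either `model:input_len` (the old form) or `model:input_len:batch_size` (the new form), and run.py honors both. A self-contained sketch of that filtering, with a hypothetical helper name:

```python
def filter_in_out_pairs(model, in_out_pairs, batch_size, excludes):
    """Keep only the in/out pairs not excluded for this model, either
    unconditionally ('model:input_len') or for this specific batch
    size ('model:input_len:batch_size')."""
    kept = []
    for in_out in in_out_pairs:
        key = f"{model}:{in_out.split('-')[0]}"
        if key in excludes or f"{key}:{batch_size}" in excludes:
            continue
        kept.append(in_out)
    return kept

# With the batch-2 yaml below, the 2048-256 case for bloomz is skipped:
print(filter_in_out_pairs('bigscience/bloomz-7b1',
                          ['32-32', '1024-128', '2048-256'], 2,
                          ['bigscience/bloomz-7b1:2048:2']))
# ['32-32', '1024-128']
```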
39 changes: 39 additions & 0 deletions python/llm/test/benchmark/arc-perf-test-batch2.yaml
@@ -0,0 +1,39 @@
repo_id:
- 'meta-llama/Llama-2-7b-chat-hf'
- 'meta-llama/Llama-2-13b-chat-hf'
- 'THUDM/chatglm2-6b'
- 'THUDM/chatglm3-6b-4bit'
- 'tiiuae/falcon-7b-instruct-with-patch'
- 'mosaicml/mpt-7b-chat'
- 'redpajama/gptneox-7b-redpajama-bf16'
- 'bigcode/starcoder-15.5b-4bit'
- 'databricks/dolly-v1-6b'
- 'databricks/dolly-v2-7b'
- 'databricks/dolly-v2-12b'
- 'internlm/internlm-chat-7b'
- 'Qwen/Qwen-7B-Chat'
- 'BAAI/AquilaChat-7B'
- 'baichuan-inc/Baichuan2-7B-Chat'
- 'baichuan-inc/Baichuan2-13B-Chat-4bit'
- 'bigscience/bloomz-7b1'
# - 'fnlp/moss-moon-003-sft-4bit' # moss-moon-003-sft cannot work on transformers 4.34+
- 'mistralai/Mistral-7B-v0.1'
local_model_hub: '/mnt/disk1/models'
warm_up: 1
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
batch_size: # default to 1
- 2
in_out_pairs:
- '32-32'
- '1024-128'
- '2048-256'
test_api:
- "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
exclude:
- 'bigcode/starcoder-15.5b-4bit:2048:2'
- 'databricks/dolly-v2-12b:2048:2'
- 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048:2'
- 'bigscience/bloomz-7b1:2048:2'
9 changes: 4 additions & 5 deletions python/llm/test/benchmark/arc-perf-test.yaml
@@ -23,7 +23,8 @@ warm_up: 1
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
batch_size: 1 # default to 1
batch_size: # default to 1
Contributor review comment: revert this file

- 1
in_out_pairs:
- '32-32'
- '1024-128'
@@ -32,7 +33,5 @@ test_api:
- "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
exclude:
# - 'fnlp/moss-moon-003-sft-4bit:1024'
# - 'fnlp/moss-moon-003-sft-4bit:2048'
- 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
- 'bigscience/bloomz-7b1:2048'
- 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048:1'
- 'bigscience/bloomz-7b1:2048:1'
Review comment (hkvision, Jun 3, 2024): revert this? make run.py compatible with the original yaml.

21 changes: 21 additions & 0 deletions python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml
@@ -0,0 +1,21 @@
# For the models that require transformers 4.37.0
repo_id:
- 'Qwen/Qwen1.5-7B-Chat'
- 'microsoft/phi-2'
- 'microsoft/Phi-3-mini-4k-instruct'
- 'meta-llama/Meta-Llama-3-8B-Instruct'
local_model_hub: '/mnt/disk1/models'
warm_up: 1
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
batch_size: # default to 1
- 2
in_out_pairs:
- '32-32'
- '1024-128'
- '2048-256'
test_api:
- "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
exclude:
4 changes: 3 additions & 1 deletion python/llm/test/benchmark/arc-perf-transformers-437.yaml
@@ -9,11 +9,13 @@ warm_up: 1
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
batch_size: 1 # default to 1
batch_size: # default to 1
Contributor review comment: revert this file

- 1
in_out_pairs:
- '32-32'
- '1024-128'
- '2048-256'
test_api:
- "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
exclude:
Contributor review comment: revert this if no exclude?

8 changes: 4 additions & 4 deletions python/llm/test/benchmark/check_results.py
@@ -34,16 +34,16 @@ def main():
actual_test_num = len(csv_dataframe)
actual_test_cases = []
for index, row in csv_dataframe.iterrows():
actual_test_cases.append(row['model'] + ":" + row['input/output tokens'].split('-')[0])

actual_test_cases.append(row['model'] + ":" + row['input/output tokens'].split('-')[0] + ":" + str(row['batch_size']))
if args.yaml_name:
yaml_name = args.yaml_name
conf = OmegaConf.load(yaml_name)
all_test_cases = []
for model in conf.repo_id:
for in_out in conf['in_out_pairs']:
model_id_input = model + ':' + in_out.split('-')[0]
all_test_cases.append(model_id_input)
for batch_size in conf['batch_size']:
model_id_input = model + ':' + in_out.split('-')[0] + ':' + str(batch_size)
all_test_cases.append(model_id_input)
exclude_test_cases = []
if 'exclude' in conf and conf['exclude'] is not None:
exclude_test_cases = conf['exclude']
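Put differently, check_results.py now compares batch-aware keys: each csv row contributes `model:input_len:batch_size`, and the expected set is the cross product repo_id × in_out_pairs × batch_size from the yaml, minus its excludes. A condensed sketch of that comparison (the function name is ours, not the PR's):

```python
def find_missing_cases(rows, conf):
    """rows: dict-like csv rows; conf: the OmegaConf-loaded yaml."""
    actual = {f"{r['model']}:{r['input/output tokens'].split('-')[0]}"
              f":{r['batch_size']}" for r in rows}
    expected = {f"{m}:{io.split('-')[0]}:{b}"
                for m in conf['repo_id']
                for io in conf['in_out_pairs']
                for b in conf['batch_size']}
    expected -= set(conf.get('exclude') or [])
    return sorted(expected - actual)
```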
20 changes: 13 additions & 7 deletions python/llm/test/benchmark/csv_to_html.py
@@ -99,20 +99,25 @@ def main():
for current_csv_ind,current_csv_row in current_csv.iterrows():
current_csv_model=current_csv_row['model'].strip()
current_csv_input_output_pairs=current_csv_row['input/output tokens'].strip()
current_csv_model_input_1st=current_csv_model+'-'+current_csv_input_output_pairs+'-'+'1st'
current_csv_model_input_2nd=current_csv_model+'-'+current_csv_input_output_pairs+'-'+'2nd'
add_to_dict(csv_dict, current_csv_model_input_1st, current_csv_row[latency_1st_token])
add_to_dict(csv_dict, current_csv_model_input_2nd, current_csv_row[latency_2_avg])
try:
Contributor review comment: under what case this try will fail?

current_csv_batch_size=str(current_csv_row['batch_size'])
current_csv_model_input_1st=current_csv_model+'-'+current_csv_input_output_pairs+'-'+current_csv_batch_size+'-'+'1st'
current_csv_model_input_2nd=current_csv_model+'-'+current_csv_input_output_pairs+'-'+current_csv_batch_size+'-'+'2nd'
add_to_dict(csv_dict, current_csv_model_input_1st, current_csv_row[latency_1st_token])
add_to_dict(csv_dict, current_csv_model_input_2nd, current_csv_row[latency_2_avg])
except:
pass

for latest_csv_ind,latest_csv_row in latest_csv.iterrows():

latest_csv_model=latest_csv_row['model'].strip()
latest_csv_input_output_pairs=latest_csv_row['input/output tokens'].strip()
latest_1st_token_latency=latest_csv_row[latency_1st_token]
latest_2_avg_latency=latest_csv_row[latency_2_avg]
latest_csv_batch_size=str(latest_csv_row['batch_size'])

key1=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+'1st'
key2=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+'2nd'
key1=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+latest_csv_batch_size+'-'+'1st'
key2=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+latest_csv_batch_size+'-'+'2nd'

best_last1_value=best_in_dict(csv_dict, key1, latest_1st_token_latency)
best_last2_value=best_in_dict(csv_dict, key2, latest_2_avg_latency)
@@ -128,8 +133,9 @@ def main():

previous_csv_model=previous_csv_row['model'].strip()
previous_csv_input_output_pairs=previous_csv_row['input/output tokens'].strip()
previous_csv_batch_size=str(previous_csv_row['batch_size'])

if latest_csv_model==previous_csv_model and latest_csv_input_output_pairs==previous_csv_input_output_pairs:
if latest_csv_model==previous_csv_model and latest_csv_input_output_pairs==previous_csv_input_output_pairs and latest_csv_batch_size==previous_csv_batch_size:

previous_1st_token_latency=previous_csv_row[latency_1st_token]
previous_2_avg_latency=previous_csv_row[latency_2_avg]
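On the reviewer's question above ("under what case this try will fail?"): the bare try/except appears to guard against csv files generated before this PR, which lack a batch_size column. An explicit column check states that intent more clearly than a blanket `except: pass`; a hedged alternative sketch, not code from this PR:

```python
from typing import Optional

def row_key(row, suffix: str) -> Optional[str]:
    """Build the batch-aware comparison key for a csv row, or return
    None for legacy csvs that predate the batch_size column."""
    if 'batch_size' not in row:
        return None  # old csv without batch_size: skip it explicitly
    return (f"{row['model'].strip()}-"
            f"{row['input/output tokens'].strip()}-"
            f"{row['batch_size']}-{suffix}")
```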