LLM: add whisper models into nightly test (intel-analytics#10193)
* LLM: add whisper models into nightly test

* small fix

* small fix

* add more whisper models

* test all cases

* test specific cases

* collect the csv

* store the result

* to html

* small fix

* small test

* test all cases

* modify whisper_csv_to_html
WeiguangHan authored Mar 11, 2024
1 parent e2836e3 commit 89554f7
Showing 4 changed files with 400 additions and 4 deletions.
207 changes: 207 additions & 0 deletions .github/workflows/llm-whisper-evaluation.yml
@@ -0,0 +1,207 @@
name: LLM Whisper Models Evaluation

# Cancel previous runs in the PR when you push new commits
concurrency:
  group: ${{ github.workflow }}-llm-nightly-test-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

permissions:
  contents: read

# Controls when the action will run.
on:
  schedule:
    - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
  pull_request:
    branches: [main]
    paths:
      - ".github/workflows/llm-whisper-evaluation.yml"
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      model_name:
        description: 'Model names, separated by comma and must be quoted.'
        required: true
        type: string
      precision:
        description: 'Precisions, separated by comma and must be quoted.'
        required: true
        type: string
      task:
        description: 'Tasks, separated by comma and must be quoted.'
        required: true
        type: string
      runs-on:
        description: 'Labels to filter the runners, separated by comma and must be quoted.'
        default: "accuracy"
        required: false
        type: string

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  llm-cpp-build: # please uncomment it for PR tests
    uses: ./.github/workflows/llm-binary-build.yml

  # Set the testing matrix based on the event (schedule, PR, or manual dispatch)
  set-matrix:
    runs-on: ubuntu-latest

    outputs:
      model_name: ${{ steps.set-matrix.outputs.model_name }}
      precision: ${{ steps.set-matrix.outputs.precision }}
      task: ${{ steps.set-matrix.outputs.task }}
      runner: ${{ steps.set-matrix.outputs.runner }}

    steps:
      - name: set-env
        env:
          MATRIX_MODEL_NAME: '["whisper-tiny", "whisper-small", "whisper-medium", "whisper-base"]'
          MATRIX_TASK: '["librispeech"]'
          MATRIX_PRECISION: '["sym_int4", "fp8_e5m2"]'
          LABELS: '["self-hosted", "llm", "perf"]'
        run: |
          echo "model_name=$MATRIX_MODEL_NAME" >> $GITHUB_ENV
          echo "task=$MATRIX_TASK" >> $GITHUB_ENV
          echo "precision=$MATRIX_PRECISION" >> $GITHUB_ENV
          echo "runner=$LABELS" >> $GITHUB_ENV

      - name: set-matrix
        id: set-matrix
        run: |
          echo "model_name=$model_name" >> $GITHUB_OUTPUT
          echo "task=$task" >> $GITHUB_OUTPUT
          echo "precision=$precision" >> $GITHUB_OUTPUT
          echo "runner=$runner" >> $GITHUB_OUTPUT

  llm-whisper-evaluation:
    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-whisper-evaluation' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
    needs: [llm-cpp-build, set-matrix] # please uncomment it for PR tests
    # needs: [set-matrix] # please comment it for PR tests
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.9"]
        model_name: ${{ fromJson(needs.set-matrix.outputs.model_name) }}
        task: ${{ fromJson(needs.set-matrix.outputs.task) }}
        precision: ${{ fromJson(needs.set-matrix.outputs.precision) }}
        device: [xpu]
    runs-on: ${{ fromJson(needs.set-matrix.outputs.runner) }}
    env:
      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
      ORIGIN_DIR: /mnt/disk1/models

    steps:
      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        shell: bash
        run: |
          python -m pip install --upgrade pip
          python -m pip install --upgrade wheel
          python -m pip install --upgrade pandas
          python -m pip install --upgrade datasets
          python -m pip install --upgrade evaluate
          python -m pip install --upgrade soundfile
          python -m pip install --upgrade librosa
          python -m pip install --upgrade jiwer

      # please uncomment it and comment the "Install BigDL-LLM from Pypi" part for PR tests
      - name: Download llm binary
        uses: ./.github/actions/llm/download-llm-binary

      - name: Run LLM install (all) test
        uses: ./.github/actions/llm/setup-llm-env
        with:
          extra-dependency: "xpu_2.1"

      # - name: Install BigDL-LLM from Pypi
      #   shell: bash
      #   run: |
      #     pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu

      # - name: Test installed xpu version
      #   shell: bash
      #   run: |
      #     source /opt/intel/oneapi/setvars.sh
      #     bash python/llm/test/run-llm-install-tests.sh

      - name: Run whisper evaluation
        shell: bash
        run: |
          source /opt/intel/oneapi/setvars.sh
          export USE_XETLA=OFF
          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
          echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV"
          MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
          export LIBRISPEECH_DATASET_PATH=/mnt/disk1/datasets/librispeech
          cd python/llm/dev/benchmark/whisper
          python run_whisper.py --model_path ${MODEL_PATH} --data_type other --device xpu --load_in_low_bit ${{ matrix.precision }} --save_result

      - uses: actions/upload-artifact@v3
        with:
          name: whisper_results
          path: ${{ github.workspace }}/python/llm/dev/benchmark/whisper/results/**

  llm-whisper-summary:
    if: ${{ github.event_name == 'schedule' || github.event_name == 'pull_request' }}
    needs: [set-matrix, llm-whisper-evaluation]
    runs-on: ["self-hosted", "llm", "perf"]
    steps:
      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3

      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9

      - name: Set output path
        shell: bash
        run: |
          DATE=$(date +%Y-%m-%d)
          OUTPUT_PATH="results_$DATE"
          echo "OUTPUT_PATH=$OUTPUT_PATH" >> $GITHUB_ENV
          NIGHTLY_FOLDER="/mnt/disk1/whisper_nightly_gpu"
          echo "NIGHTLY_FOLDER=$NIGHTLY_FOLDER" >> $GITHUB_ENV
          PR_FOLDER="/mnt/disk1/whisper_pr_gpu"
          echo "PR_FOLDER=$PR_FOLDER" >> $GITHUB_ENV

      - name: Download all results for nightly run
        if: github.event_name == 'schedule'
        uses: actions/download-artifact@v3
        with:
          name: whisper_results
          path: ${{ env.NIGHTLY_FOLDER }}/${{ env.OUTPUT_PATH }}

      - name: Download all results for pr run
        if: github.event_name == 'pull_request'
        uses: actions/download-artifact@v3
        with:
          name: whisper_results
          path: ${{ env.PR_FOLDER }}/${{ env.OUTPUT_PATH }}

      - name: Summarize the results for nightly run
        if: github.event_name == 'schedule'
        shell: bash
        run: |
          cp -r /mnt/disk1/datasets/whisper_fp16_results/* /mnt/disk1/whisper_nightly_gpu/${{ env.OUTPUT_PATH }}
          pip install pandas==1.5.3
          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_concat_csv.py -i ${{ env.NIGHTLY_FOLDER }}/${{ env.OUTPUT_PATH }} -o ${{ env.NIGHTLY_FOLDER }}
          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py -f ${{ env.NIGHTLY_FOLDER }}

      - name: Summarize the results for pull request
        if: github.event_name == 'pull_request'
        shell: bash
        run: |
          cp -r /mnt/disk1/datasets/whisper_fp16_results/* /mnt/disk1/whisper_pr_gpu/${{ env.OUTPUT_PATH }}
          pip install pandas==1.5.3
          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_concat_csv.py -i ${{ env.PR_FOLDER }}/${{ env.OUTPUT_PATH }} -o ${{ env.PR_FOLDER }}
          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py -f ${{ env.PR_FOLDER }}
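The workflow_dispatch inputs are declared as quoted, JSON-style list strings. A hypothetical manual trigger via the GitHub CLI could look like the sketch below (assuming an authenticated `gh`; note that the set-env step shown in this diff writes fixed matrix values, so how dispatch inputs feed into set-matrix is not visible here):

    # illustrative only — input values must be quoted list strings
    gh workflow run llm-whisper-evaluation.yml \
      -f model_name='["whisper-tiny", "whisper-base"]' \
      -f precision='["sym_int4"]' \
      -f task='["librispeech"]' \
      -f runs-on='["self-hosted", "llm", "perf"]'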
34 changes: 30 additions & 4 deletions python/llm/dev/benchmark/whisper/run_whisper.py
@@ -21,12 +21,20 @@
 from evaluate import load
 import time
 import argparse
+import pandas as pd
+import os
+import csv
+from datetime import date
+
+current_dir = os.path.dirname(os.path.realpath(__file__))

 def get_args():
     parser = argparse.ArgumentParser(description="Evaluate Whisper performance and accuracy")
     parser.add_argument('--model_path', required=True, help='pretrained model path')
     parser.add_argument('--data_type', required=True, help='clean, other')
     parser.add_argument('--device', required=False, help='cpu, xpu')
+    parser.add_argument('--load_in_low_bit', default='sym_int4', help='low-bit format to load the model in (e.g., sym_int4, fp8_e5m2)')
+    parser.add_argument('--save_result', action='store_true', help='Save the results to a CSV file')

     args = parser.parse_args()
     return args
@@ -40,7 +48,7 @@ def get_args():
 processor = WhisperProcessor.from_pretrained(args.model_path)
 forced_decoder_ids = processor.get_decoder_prompt_ids(language='en', task='transcribe')

-model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_path, load_in_low_bit="sym_int4", optimize_model=True).eval().to(args.device)
+model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_path, load_in_low_bit=args.load_in_low_bit, optimize_model=True).eval().to(args.device)
 model.config.forced_decoder_ids = None

 def map_to_pred(batch):
@@ -67,6 +75,24 @@ def map_to_pred(batch):
 wer = load("./wer")
 speech_length = sum(result["length"][1:])
 prc_time = sum(result["time"][1:])
-print("Realtime Factor(RTF) is : %.4f" % (prc_time/speech_length))
-print("Realtime X(RTX) is : %.2f" % (speech_length/prc_time))
-print(f'WER is {100 * wer.compute(references=result["reference"], predictions=result["prediction"])}')
+
+MODEL = args.model_path.split('/')[-2]
+RTF = prc_time/speech_length
+RTX = speech_length/prc_time
+WER = 100 * wer.compute(references=result["reference"], predictions=result["prediction"])
+
+today = date.today()
+if args.save_result:
+    csv_name = f'{current_dir}/results/{MODEL}-{args.data_type}-{args.device}-{args.load_in_low_bit}-{today}.csv'
+    os.makedirs(os.path.dirname(csv_name), exist_ok=True)
+    with open(csv_name, mode='a', newline='') as file:
+        csv_writer = csv.writer(file)
+        file.seek(0, os.SEEK_END)
+        if file.tell() == 0:
+            csv_writer.writerow(["models","precision","WER","RTF"])
+        csv_writer.writerow([MODEL, args.load_in_low_bit, WER, RTF])
+    print(f'Results saved to {csv_name}')
+
+print("Realtime Factor(RTF) is : %.4f" % RTF)
+print("Realtime X(RTX) is : %.2f" % RTX)
+print(f'WER is {WER}')
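Since MODEL is derived from args.model_path.split('/')[-2], the script expects --model_path to end with a trailing slash, which is why the workflow step above appends one to MODEL_PATH. A hypothetical local run mirroring the workflow step (paths and the oneAPI setup are illustrative assumptions, not part of this diff):

    source /opt/intel/oneapi/setvars.sh                                # Intel GPU runtime, as in the workflow
    export LIBRISPEECH_DATASET_PATH=/mnt/disk1/datasets/librispeech    # assumed dataset location
    cd python/llm/dev/benchmark/whisper
    python run_whisper.py --model_path /mnt/disk1/models/whisper-tiny/ \
        --data_type other --device xpu --load_in_low_bit sym_int4 --save_result

Because the CSV is opened in append mode and the header row is written only when file.tell() is 0 (i.e. the file is empty), repeated runs on the same day append rows to the same per-model CSV.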
50 changes: 50 additions & 0 deletions python/llm/dev/benchmark/whisper/whisper_concat_csv.py
@@ -0,0 +1,50 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Python program to concatenate CSVs

import os
import sys
import argparse
import pandas as pd
from datetime import date

def main():
    parser = argparse.ArgumentParser(description="concat .csv files")
    parser.add_argument("-i", "--input_path", type=str, dest="input_path",
                        help="The directory which stores the original CSV files", default="./")
    parser.add_argument("-o", "--output_path", type=str, dest="output_path",
                        help="The directory which stores the concatenated CSV file", default="./")

    args = parser.parse_args()

    csv_files = []
    for file_name in os.listdir(args.input_path):
        file_path = os.path.join(args.input_path, file_name)
        if os.path.isfile(file_path) and file_name.endswith(".csv"):
            csv_files.append(file_path)
    csv_files.sort()

    merged_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
    merged_df.reset_index(drop=True, inplace=True)

    today = date.today()
    csv_name = f'whisper-{today}.csv'
    output_file_path = os.path.join(args.output_path, csv_name)
    merged_df.to_csv(output_file_path)

if __name__ == "__main__":
    sys.exit(main())
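A hypothetical standalone invocation, assuming the input directory holds the per-model CSVs produced by run_whisper.py --save_result (paths are illustrative); the merged file is written as whisper-<today>.csv in the output directory:

    python whisper_concat_csv.py -i ./results_2024-03-11 -o /mnt/disk1/whisper_nightly_gpu

Note that to_csv is called without index=False, so the merged CSV also carries a leading index column.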
python/llm/dev/benchmark/whisper/whisper_csv_to_html.py (diff not rendered on this page)
