From 89554f723b1586ee0d54dcfc93cd33257d17aa58 Mon Sep 17 00:00:00 2001
From: WeiguangHan <weiguang.han@intel.com>
Date: Mon, 11 Mar 2024 20:00:47 +0800
Subject: [PATCH] LLM: add whisper models into nightly test (#10193)

* LLM: add whisper models into nightly test

* small fix

* small fix

* add more whisper models

* test all cases

* test specific cases

* collect the csv

* store the result

* to html

* small fix

* small test

* test all cases

* modify whisper_csv_to_html
---
 .github/workflows/llm-whisper-evaluation.yml  | 207 ++++++++++++++++++
 .../llm/dev/benchmark/whisper/run_whisper.py  |  34 ++-
 .../benchmark/whisper/whisper_concat_csv.py   |  50 +++++
 .../benchmark/whisper/whisper_csv_to_html.py  | 113 ++++++++++
 4 files changed, 400 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/llm-whisper-evaluation.yml
 create mode 100644 python/llm/dev/benchmark/whisper/whisper_concat_csv.py
 create mode 100644 python/llm/dev/benchmark/whisper/whisper_csv_to_html.py

diff --git a/.github/workflows/llm-whisper-evaluation.yml b/.github/workflows/llm-whisper-evaluation.yml
new file mode 100644
index 00000000000..85a94101be2
--- /dev/null
+++ b/.github/workflows/llm-whisper-evaluation.yml
@@ -0,0 +1,207 @@
+name: LLM Whisper Models Evaluation
+
+# Cancel previous runs in the PR when you push new commits
+concurrency:
+  group: ${{ github.workflow }}-llm-nightly-test-${{ github.event.pull_request.number || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+# Controls when the action will run.
+on:
+  schedule:
+    - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
+  pull_request:
+    branches: [main]
+    paths:
+      - ".github/workflows/llm-whisper-evaluation.yml"
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+    inputs:
+      model_name:
+        description: 'Model names, separated by comma and must be quoted.'
+        required: true
+        type: string
+      precision:
+        description: 'Precisions, separated by comma and must be quoted.'
+        required: true
+        type: string
+      task:
+        description: 'Tasks, separated by comma and must be quoted.'
+        required: true
+        type: string
+      runs-on:
+        description: 'Labels to filter the runners, separated by comma and must be quoted.'
+        default: "accuracy"
+        required: false
+        type: string
+
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  llm-cpp-build: # please uncomment it for PR tests
+    uses: ./.github/workflows/llm-binary-build.yml
+
+  # Set the testing matrix based on the event (schedule, PR, or manual dispatch)
+  set-matrix:
+    runs-on: ubuntu-latest
+
+    outputs:
+      model_name: ${{ steps.set-matrix.outputs.model_name }}
+      precision: ${{ steps.set-matrix.outputs.precision }}
+      task: ${{ steps.set-matrix.outputs.task }}
+      runner: ${{ steps.set-matrix.outputs.runner }}
+
+    steps:
+      - name: set-env
+        env:
+          MATRIX_MODEL_NAME: '["whisper-tiny", "whisper-small", "whisper-medium", "whisper-base"]'
+          MATRIX_TASK: '["librispeech"]'
+          MATRIX_PRECISION: '["sym_int4", "fp8_e5m2"]'
+          LABELS: '["self-hosted", "llm", "perf"]'
+        run: |
+            echo "model_name=$MATRIX_MODEL_NAME" >> $GITHUB_ENV
+            echo "task=$MATRIX_TASK" >> $GITHUB_ENV
+            echo "precision=$MATRIX_PRECISION" >> $GITHUB_ENV
+            echo "runner=$LABELS" >> $GITHUB_ENV
+
+      - name: set-matrix
+        id: set-matrix
+        run: |
+            echo "model_name=$model_name" >> $GITHUB_OUTPUT
+            echo "task=$task" >> $GITHUB_OUTPUT
+            echo "precision=$precision" >> $GITHUB_OUTPUT
+            echo "runner=$runner" >> $GITHUB_OUTPUT
+
+  llm-whisper-evaluation:
+    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-whisper-evaluation' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    needs: [llm-cpp-build, set-matrix] # please uncomment it for PR tests
+    # needs: [set-matrix] # please comment it for PR tests
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9"]
+        model_name: ${{ fromJson(needs.set-matrix.outputs.model_name) }}
+        task: ${{ fromJson(needs.set-matrix.outputs.task) }}
+        precision: ${{ fromJson(needs.set-matrix.outputs.precision) }}
+        device: [xpu]
+    runs-on: ${{ fromJson(needs.set-matrix.outputs.runner) }}
+    env:
+      ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+      ORIGIN_DIR: /mnt/disk1/models
+
+    steps:
+      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade wheel
+          python -m pip install --upgrade pandas
+          python -m pip install --upgrade datasets
+          python -m pip install --upgrade evaluate
+          python -m pip install --upgrade soundfile
+          python -m pip install --upgrade librosa
+          python -m pip install --upgrade jiwer
+
+      # please uncomment it and comment the "Install BigDL-LLM from Pypi" part for PR tests
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
+
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env
+        with:
+          extra-dependency: "xpu_2.1"
+
+      # - name: Install BigDL-LLM from Pypi
+      #   shell: bash
+      #   run: |
+      #     pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
+
+      # - name: Test installed xpu version
+      #   shell: bash
+      #   run: |
+      #     source /opt/intel/oneapi/setvars.sh
+      #     bash python/llm/test/run-llm-install-tests.sh
+
+      - name: Run whisper evaluation
+        shell: bash
+        run: |
+
+          source /opt/intel/oneapi/setvars.sh
+          export USE_XETLA=OFF
+          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+
+          echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV"
+          MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
+          export LIBRISPEECH_DATASET_PATH=/mnt/disk1/datasets/librispeech
+
+          cd python/llm/dev/benchmark/whisper
+          python run_whisper.py --model_path ${MODEL_PATH} --data_type other --device xpu --load_in_low_bit ${{ matrix.precision }} --save_result
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: whisper_results
+          path:
+            ${{ github.workspace }}/python/llm/dev/benchmark/whisper/results/**
+
+  llm-whisper-summary: # merges the uploaded per-run CSVs and renders an HTML report on the self-hosted runner
+    if: ${{github.event_name == 'schedule' || github.event_name == 'pull_request'}}
+    needs: [set-matrix, llm-whisper-evaluation]
+    runs-on: ["self-hosted", "llm", "perf"]
+    steps:
+      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
+      - name: Set up Python 3.9
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+
+      - name: Set output path # exports OUTPUT_PATH, NIGHTLY_FOLDER and PR_FOLDER for the steps below
+        shell: bash
+        run: |
+          DATE=$(date +%Y-%m-%d)
+          OUTPUT_PATH="results_$DATE"
+          echo "OUTPUT_PATH=$OUTPUT_PATH" >> $GITHUB_ENV
+          NIGHTLY_FOLDER="/mnt/disk1/whisper_nightly_gpu"
+          echo "NIGHTLY_FOLDER=$NIGHTLY_FOLDER" >> $GITHUB_ENV
+          PR_FOLDER="/mnt/disk1/whisper_pr_gpu"
+          echo "PR_FOLDER=$PR_FOLDER" >> $GITHUB_ENV
+
+      - name: Download all results for nightly run
+        if: github.event_name == 'schedule'
+        uses: actions/download-artifact@v3
+        with:
+          name: whisper_results
+          path: ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }}
+
+      - name: Download all results for pr run
+        if: github.event_name == 'pull_request'
+        uses: actions/download-artifact@v3
+        with:
+          name: whisper_results
+          path: ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }}
+
+      - name: Summarize the results for nightly run # copies the stored whisper_fp16_results files in, then merges and renders
+        if: github.event_name == 'schedule'
+        shell: bash
+        run: |
+          cp -r /mnt/disk1/datasets/whisper_fp16_results/* ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }}
+          pip install pandas==1.5.3
+          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_concat_csv.py -i ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }} -o ${{ env.NIGHTLY_FOLDER}}
+          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py -f ${{ env.NIGHTLY_FOLDER}}
+
+      - name: Summarize the results for pull request # same flow as the nightly step, rooted at PR_FOLDER
+        if: github.event_name == 'pull_request'
+        shell: bash
+        run: |
+          cp -r /mnt/disk1/datasets/whisper_fp16_results/* ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }}
+          pip install pandas==1.5.3
+          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_concat_csv.py -i ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }} -o ${{ env.PR_FOLDER}}
+          python ${{ github.workspace }}/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py -f ${{ env.PR_FOLDER}}
\ No newline at end of file
diff --git a/python/llm/dev/benchmark/whisper/run_whisper.py b/python/llm/dev/benchmark/whisper/run_whisper.py
index 29339ca8764..286025d5b01 100644
--- a/python/llm/dev/benchmark/whisper/run_whisper.py
+++ b/python/llm/dev/benchmark/whisper/run_whisper.py
@@ -21,12 +21,20 @@
 from evaluate import load
 import time
 import argparse
+import pandas as pd
+import os
+import csv
+from datetime import date
+
+current_dir = os.path.dirname(os.path.realpath(__file__))
  
 def get_args():
     parser = argparse.ArgumentParser(description="Evaluate Whisper performance and accuracy")
     parser.add_argument('--model_path', required=True, help='pretrained model path')
     parser.add_argument('--data_type', required=True, help='clean, other')
     parser.add_argument('--device', required=False, help='cpu, xpu')
+    parser.add_argument('--load_in_low_bit', default='sym_int4', help='Specify whether to load data in low bit format (e.g., 4-bit)')
+    parser.add_argument('--save_result', action='store_true', help='Save the results to a CSV file')
  
     args = parser.parse_args()
     return args
@@ -40,7 +48,7 @@ def get_args():
     processor = WhisperProcessor.from_pretrained(args.model_path)
     forced_decoder_ids = processor.get_decoder_prompt_ids(language='en', task='transcribe')
    
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_path, load_in_low_bit="sym_int4", optimize_model=True).eval().to(args.device)
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(args.model_path, load_in_low_bit=args.load_in_low_bit, optimize_model=True).eval().to(args.device)
     model.config.forced_decoder_ids = None
    
     def map_to_pred(batch):
@@ -67,6 +75,24 @@ def map_to_pred(batch):
     wer = load("./wer")
     speech_length = sum(result["length"][1:])
     prc_time = sum(result["time"][1:])
-    print("Realtime Factor(RTF) is : %.4f" % (prc_time/speech_length))
-    print("Realtime X(RTX) is : %.2f" % (speech_length/prc_time))
-    print(f'WER is {100 * wer.compute(references=result["reference"], predictions=result["prediction"])}')
\ No newline at end of file
+
+    MODEL = args.model_path.split('/')[-2]
+    RTF = prc_time/speech_length
+    RTX = speech_length/prc_time
+    WER = 100 * wer.compute(references=result["reference"], predictions=result["prediction"])
+
+    today = date.today()
+    if args.save_result:
+        csv_name = f'{current_dir}/results/{MODEL}-{args.data_type}-{args.device}-{args.load_in_low_bit}-{today}.csv'
+        os.makedirs(os.path.dirname(csv_name), exist_ok=True)
+        with open(csv_name, mode='a', newline='') as file:
+            csv_writer = csv.writer(file)
+            file.seek(0, os.SEEK_END)
+            if file.tell() == 0:
+                csv_writer.writerow(["models","precision","WER","RTF"])
+            csv_writer.writerow([MODEL, args.load_in_low_bit, WER, RTF])
+        print(f'Results saved to {csv_name}')
+
+    print("Realtime Factor(RTF) is : %.4f" % RTF)
+    print("Realtime X(RTX) is : %.2f" % RTX)
+    print(f'WER is {WER}')
\ No newline at end of file
diff --git a/python/llm/dev/benchmark/whisper/whisper_concat_csv.py b/python/llm/dev/benchmark/whisper/whisper_concat_csv.py
new file mode 100644
index 00000000000..ceca9c94b09
--- /dev/null
+++ b/python/llm/dev/benchmark/whisper/whisper_concat_csv.py
@@ -0,0 +1,50 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Python program to concat CSVs
+
+import os
+import sys
+import argparse
+import pandas as pd
+from datetime import date
+
+def main():
+    """Concatenate every .csv file directly under --input_path into <output_path>/whisper-<today>.csv."""
+    parser = argparse.ArgumentParser(description="concat .csv files")
+    parser.add_argument("-i", "--input_path", type=str, dest="input_path",
+                        help="The directory which stores the original CSV files", default="./")
+    parser.add_argument("-o", "--output_path", type=str, dest="output_path",
+                        help="The directory which stores the concated CSV file", default="./")
+    args = parser.parse_args()
+
+    csv_files = []
+    for file_name in os.listdir(args.input_path):
+        file_path = os.path.join(args.input_path, file_name)
+        if os.path.isfile(file_path) and file_name.endswith(".csv"):
+            csv_files.append(file_path)
+    if not csv_files:
+        sys.exit(f"No .csv files found in {args.input_path}")  # pd.concat([]) would raise an opaque ValueError
+    csv_files.sort()
+
+    # ignore_index=True already renumbers the rows 0..n-1, so no separate reset_index is needed.
+    merged_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
+
+    output_file_path = os.path.join(args.output_path, f'whisper-{date.today()}.csv')
+    merged_df.to_csv(output_file_path)
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py b/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py
new file mode 100644
index 00000000000..10ae19d6e9c
--- /dev/null
+++ b/python/llm/dev/benchmark/whisper/whisper_csv_to_html.py
@@ -0,0 +1,113 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Python program to convert CSV to HTML Table
+
+import os
+import sys
+import argparse
+import pandas as pd
+
+def highlight_vals(val, max=3.0, color1='red', color2='green'):
+    # CSS for one cell: color2 if val > max, color1 if val <= -max, otherwise '' (never None).
+    if isinstance(val, float):
+        if val > max:
+            return 'background-color: %s' % color2
+        if val <= -max:
+            return 'background-color: %s' % color1
+    return ''
+
+def main():
+    """Convert the last-sorting CSV in --folder_path to HTML, adding deltas against the one before it."""
+    parser = argparse.ArgumentParser(description="convert .csv file to .html file")
+    parser.add_argument("-f", "--folder_path", type=str, dest="folder_path",
+                        help="The directory which stores the .csv file", default="/mnt/disk1/whisper_pr_gpu/")
+    parser.add_argument("-t", "--threshold", type=float, dest="threshold",
+                        help="the threshold of highlight values", default=1.0)
+    args = parser.parse_args()
+
+    csv_files = []
+    for file_name in os.listdir(args.folder_path):
+        file_path = os.path.join(args.folder_path, file_name)
+        if os.path.isfile(file_path) and file_name.endswith(".csv"):
+            csv_files.append(file_path)
+    if not csv_files:
+        sys.exit(f"No .csv files found in {args.folder_path}")
+    # Descending name order: index 0 is treated as the latest file, index 1 as the previous one.
+    csv_files.sort(reverse=True)
+
+    latest_csv = pd.read_csv(csv_files[0], index_col=0)
+    # splitext removes only the final extension, so dots elsewhere in the path are preserved.
+    daily_html = os.path.splitext(csv_files[0])[0] + ".html"
+
+    if len(csv_files) > 1:
+        previous_csv = pd.read_csv(csv_files[1], index_col=0)
+        # Filled by index label below, so the CSV index is assumed to be 0..n-1.
+        last1 = [''] * len(latest_csv.index)
+        diff1 = [''] * len(latest_csv.index)
+        last2 = [''] * len(latest_csv.index)
+        diff2 = [''] * len(latest_csv.index)
+        WER = 'WER'
+        RTF = 'RTF'
+
+        for latest_csv_ind, latest_csv_row in latest_csv.iterrows():
+            latest_csv_model = latest_csv_row['models'].strip()
+            latest_csv_precision = latest_csv_row['precision'].strip()
+            latest_WER = latest_csv_row[WER]
+            latest_RTF = latest_csv_row[RTF]
+            in_previous_flag = False
+
+            # Look for the same (model, precision) pair in the previous CSV.
+            for _, previous_csv_row in previous_csv.iterrows():
+                previous_csv_model = previous_csv_row['models'].strip()
+                previous_csv_precision = previous_csv_row['precision'].strip()
+                if latest_csv_model == previous_csv_model and latest_csv_precision == previous_csv_precision:
+                    previous_WER = previous_csv_row[WER]
+                    previous_RTF = previous_csv_row[RTF]
+                    # Both previous values are divisors below, so they must be positive.
+                    if previous_WER > 0.0 and previous_RTF > 0.0:
+                        last1[latest_csv_ind] = previous_WER
+                        diff1[latest_csv_ind] = round((previous_WER - latest_WER) * 100 / previous_WER, 2)
+                        last2[latest_csv_ind] = previous_RTF
+                        diff2[latest_csv_ind] = round((previous_RTF - latest_RTF) * 100 / previous_RTF, 2)
+                        in_previous_flag = True
+
+            if not in_previous_flag:
+                last1[latest_csv_ind] = pd.NA
+                diff1[latest_csv_ind] = pd.NA
+                last2[latest_csv_ind] = pd.NA
+                diff2[latest_csv_ind] = pd.NA
+
+        latest_csv.insert(loc=4, column='last1', value=last1)
+        latest_csv.insert(loc=5, column='diff1(%)', value=diff1)
+        latest_csv.insert(loc=6, column='last2', value=last2)
+        latest_csv.insert(loc=7, column='diff2(%)', value=diff2)
+
+        subset1 = ['diff1(%)', 'diff2(%)']
+        columns = {'WER': '{:.6f}', 'RTF': '{:.6f}', 'last1': '{:.6f}', 'diff1(%)': '{:.6f}', 'last2': '{:.6f}', 'diff2(%)': '{:.6f}'}
+        # The highlight cut-off comes from --threshold (default 1.0).
+        styled_df = latest_csv.style.format(columns).applymap(lambda val: highlight_vals(val, max=args.threshold, color1='red', color2='green'), subset=subset1)
+        html_output = styled_df.set_table_attributes("border=1").to_html()
+
+        with open(daily_html, 'w') as f:
+            f.write(html_output)
+    else:
+        latest_csv.to_html(daily_html)
+
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file