Support benchmark history summary v2.1 (#3312)
* Update perf benchmark summary up to releases/2.0.0

* Fix scheduler overrides

* Refine check message

* Remove unused reference file

* Fix default model category to all

* Allow benchmark run failure for summary job

* Add openpyxl dependency

* Fix workflow step order

* Fix v1.x data normalization

* Update 2.0 data
goodsong81 authored Apr 17, 2024
1 parent c411844 commit cd03d83
Showing 18 changed files with 13,345 additions and 272 deletions.
39 changes: 35 additions & 4 deletions .github/workflows/perf_benchmark.yaml
@@ -12,7 +12,7 @@ on:
- accuracy
- default # speed, balance, accuracy models only
- all # default + other models
default: default
default: all
data-group:
type: choice
description: Data group to run benchmark
@@ -97,7 +97,7 @@ on:
permissions: read-all

jobs:
Perf-Benchmark:
Perf-Benchmark-Run:
strategy:
fail-fast: false
matrix:
@@ -114,6 +114,8 @@ jobs:
task: "semantic_segmentation"
- task-short: "vsp"
task: "visual_prompting"
- task-short: "act"
task: "action"
name: Perf-Benchmark-${{ matrix.task-short }}
runs-on: [self-hosted, linux, x64, dmount-v2]
timeout-minutes: 8640
@@ -139,14 +141,43 @@ jobs:
--num-repeat ${{ inputs.num-repeat }}
--num-epoch ${{ inputs.num-epoch }}
--eval-upto ${{ inputs.eval-upto }}
--summary-csv .tox/perf-benchmark-summary.csv
--summary-file .tox/perf-benchmark-summary.xlsx
--mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }}
--user-name ${{ github.triggering_actor }}
--otx-ref ${{ inputs.otx-ref }}
- name: Upload test results
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-${{ matrix.task-short }}
path: .tox/perf-*.csv
path: .tox/perf-benchmark-*.*
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}

Perf-Benchmark-Summary:
if: ${{ always() }}
needs: Perf-Benchmark-Run
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Install Python
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
with:
python-version: "3.10"
- name: Install dependencies
run: python -m pip install --upgrade pip pandas matplotlib nbconvert ipython ipykernel openpyxl
- name: Download benchmark results
uses: actions/download-artifact@v4
with:
path: tests/perf/history/latest
- name: Summarize benchmark results
run: |
python tests/perf/history/summary.py tests/perf/history ./perf-benchmark-summary --pattern "*raw*.csv" --normalize
jupyter nbconvert --execute --to html --no-input tests/perf/history/summary.ipynb --output-dir ./perf-benchmark-summary --output perf-benchmark-summary
- name: Upload benchmark summary
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-summary
path: perf-benchmark-summary
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}
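The new `Perf-Benchmark-Summary` job downloads every per-task artifact into `tests/perf/history/latest`, aggregates the raw CSVs with `summary.py`, and renders `summary.ipynb` to a standalone HTML report. The aggregation script itself is not shown in this diff; the sketch below only illustrates the general idea of globbing the `*raw*.csv` files and averaging them with pandas, and every name in it beyond the CLI arguments above is an assumption.

```python
# Minimal sketch of the aggregation idea; the real logic lives in tests/perf/history/summary.py,
# which is not part of this diff, so function and column names here are assumptions.
from pathlib import Path

import pandas as pd


def summarize(history_dir: str, pattern: str = "*raw*.csv") -> pd.DataFrame:
    """Concatenate raw benchmark CSVs and average numeric metrics per task/model/data."""
    frames = [pd.read_csv(path) for path in Path(history_dir).rglob(pattern)]
    raw = pd.concat(frames, ignore_index=True)
    # Index keys mirror those used in tests/perf/benchmark.py.
    return raw.groupby(["task", "model", "data_group", "data"]).mean(numeric_only=True)


if __name__ == "__main__":
    print(summarize("tests/perf/history/latest"))
```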
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -50,8 +50,9 @@ dev = [
"pytest-mock",
"pytest-csv",
"pytest-cov",
"mlflow==2.11.1", # For regression test
"py-cpuinfo==9.0.0", # For regression test
"mlflow==2.11.1", # For perf benchmark
"py-cpuinfo==9.0.0", # For perf benchmark
"openpyxl", # For perf benchmark
]
docs = [
"furo",
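The new `openpyxl` dependency pairs with the workflow's switch from a `.csv` to an `.xlsx` summary file: pandas can only write Excel workbooks through an installed engine such as openpyxl. A minimal, hypothetical example of the call this enables (the frame contents are invented for illustration):

```python
import pandas as pd

# Hypothetical summary frame; in the real tests it comes from the benchmark run.
summary = pd.DataFrame(
    {"task": ["classification"], "model": ["efficientnet_b0"], "avg_iter_time": [0.12]}
)
# openpyxl is the engine pandas uses to write .xlsx workbooks.
summary.to_excel(".tox/perf-benchmark-summary.xlsx", engine="openpyxl", index=False)
```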
4 changes: 2 additions & 2 deletions tests/conftest.py
@@ -72,9 +72,9 @@ def pytest_addoption(parser: pytest.Parser):
help="Output root directory. Defaults to temp directory.",
)
parser.addoption(
"--summary-csv",
"--summary-file",
action="store",
help="Path to output summary cvs file. Defaults to {output-root}/benchmark-summary.csv",
help="Path to output summary file. Defaults to {output-root}/benchmark-summary.csv",
)
parser.addoption(
"--dry-run",
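The renamed `--summary-file` option is consumed elsewhere in the test suite rather than in this hunk. For reference, a typical way such an option is read back is through `request.config.getoption` inside a fixture; the fixture below is only a sketch, and its name and fallback path are assumptions:

```python
import pytest


@pytest.fixture(scope="session")
def fxt_summary_file(request: pytest.FixtureRequest, tmp_path_factory: pytest.TempPathFactory) -> str:
    """Resolve --summary-file, falling back to a temporary default location."""
    summary_file = request.config.getoption("--summary-file")
    if summary_file is None:  # option was registered without a default
        summary_file = str(tmp_path_factory.mktemp("benchmark") / "benchmark-summary.csv")
    return summary_file
```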
2 changes: 1 addition & 1 deletion tests/perf/__init__.py
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchamrk tests."""
"""OTX perfomance benchmark tests."""
143 changes: 0 additions & 143 deletions tests/perf/benchmark-reference.csv

This file was deleted.

87 changes: 44 additions & 43 deletions tests/perf/benchmark.py
@@ -7,7 +7,6 @@

import gc
import logging
import os
import subprocess
from dataclasses import dataclass
from pathlib import Path
@@ -17,6 +16,8 @@
import numpy as np
import pandas as pd

from .history import summary

log = logging.getLogger(__name__)


@@ -71,14 +72,25 @@ class Criterion:
def __call__(self, result_entry: pd.Series, target_entry: pd.Series) -> None:
"""Check result against given target."""
if self.name not in result_entry or result_entry[self.name] is None or np.isnan(result_entry[self.name]):
print(f"[Check] {self.name} not in result")
return
if self.name not in target_entry or target_entry[self.name] is None or np.isnan(target_entry[self.name]):
print(f"[Check] {self.name} not in target")
return
if self.compare == "==":
print(
f"[Check] abs({self.name}:{result_entry[self.name]} - {self.name}:{target_entry[self.name]}) < {self.name}:{target_entry[self.name]} * {self.margin}",
)
assert abs(result_entry[self.name] - target_entry[self.name]) < target_entry[self.name] * self.margin
elif self.compare == "<":
print(
f"[Check] {self.name}:{result_entry[self.name]} < {self.name}:{target_entry[self.name]} * (1.0 + {self.margin})",
)
assert result_entry[self.name] < target_entry[self.name] * (1.0 + self.margin)
elif self.compare == ">":
print(
f"[Check] {self.name}:{result_entry[self.name]} > {self.name}:{target_entry[self.name]} * (1.0 - {self.margin})",
)
assert result_entry[self.name] > target_entry[self.name] * (1.0 - self.margin)

def __init__(
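The expanded `Criterion.__call__` now prints the exact comparison it is about to assert, which makes the margin semantics visible in the logs: with `compare="<"` and `margin=0.1`, for instance, a result may exceed the reference by up to 10% before the check fails. A usage sketch, assuming the constructor (truncated above) takes at least `name`, `compare`, and `margin`, with values invented for illustration:

```python
import pandas as pd

# Hypothetical "lower is better" criterion allowing a 10% regression margin.
criterion = Criterion(name="avg_iter_time", compare="<", margin=0.1)

result_entry = pd.Series({"avg_iter_time": 0.105})
target_entry = pd.Series({"avg_iter_time": 0.100})
criterion(result_entry, target_entry)  # passes: 0.105 < 0.100 * (1.0 + 0.1)
```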
@@ -156,7 +168,7 @@ def run(
"--engine.device",
self.accelerator,
]
for key, value in dataset.extra_overrides.items():
for key, value in dataset.extra_overrides.get("train", {}).items():
command.append(f"--{key}")
command.append(str(value))
command.extend(["--seed", str(seed)])
@@ -183,6 +195,9 @@ def run(
"--work_dir",
str(sub_work_dir),
]
for key, value in dataset.extra_overrides.get("test", {}).items():
command.append(f"--{key}")
command.append(str(value))
self._run_command(command)
self._rename_raw_data(
work_dir=sub_work_dir / ".latest" / "test",
@@ -198,6 +213,9 @@ def run(
"--work_dir",
str(sub_work_dir),
]
for key, value in dataset.extra_overrides.get("export", {}).items():
command.append(f"--{key}")
command.append(str(value))
self._run_command(command)

exported_model_path = sub_work_dir / ".latest" / "export" / "exported_model.xml"
@@ -214,6 +232,9 @@ def run(
"--work_dir",
str(sub_work_dir),
]
for key, value in dataset.extra_overrides.get("test", {}).items():
command.append(f"--{key}")
command.append(str(value))
self._run_command(command)

self._rename_raw_data(
@@ -235,6 +256,9 @@ def run(
"--work_dir",
str(sub_work_dir),
]
for key, value in dataset.extra_overrides.get("optimize", {}).items():
command.append(f"--{key}")
command.append(str(value))
self._run_command(command)

optimized_model_path = sub_work_dir / ".latest" / "optimize" / "optimized_model.xml"
@@ -252,6 +276,9 @@ def run(
"--work_dir",
str(sub_work_dir),
]
for key, value in dataset.extra_overrides.get("test", {}).items():
command.append(f"--{key}")
command.append(str(value))
self._run_command(command)

self._rename_raw_data(
@@ -264,12 +291,14 @@ def run(
gc.collect()

result = self.load_result(work_dir)
return self.average_result(result, keys=["task", "model", "data_group", "data"])
if result is None:
return None
result = summary.average(result, keys=["task", "model", "data_group", "data"]) # Average out seeds
return result.set_index(["task", "model", "data_group", "data"])

def _run_command(self, command: list[str]) -> None:
if self.dry_run:
print(" ".join(command))
else:
print(" ".join(command))
if not self.dry_run:
subprocess.run(command, check=True) # noqa: S603

def _log_metrics(
@@ -356,40 +385,7 @@ def load_result(result_path: Path) -> pd.DataFrame | None:
if len(results) == 0:
return None

return pd.concat(results, ignore_index=True).set_index(["task", "model", "data_group", "data"])

@staticmethod
def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame | None:
"""Average result w.r.t. given keys
Args:
result (pd.DataFrame): Result data frame
keys (list[str]): Keys to summarize whole data
Retruns:
pd.DataFrame: Averaged result table
"""
if data is None:
return None

# Flatten index
index_names = data.index.names
column_names = data.columns
data = data.reset_index()
# Average by keys
grouped = data.groupby(keys)
aggregated = grouped.mean(numeric_only=True)
# Merge index columns
idx_columns = set(index_names) - set(keys)
for col in idx_columns:
aggregated[col] = "all"
# Merge tag columns (non-numeric & non-index)
tag_columns = set(column_names) - set(aggregated.columns) - set(keys)
for col in tag_columns:
# Take common string prefix such as: ["data/1", "data/2", "data/3"] -> "data/"
aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
# Recover index
return aggregated.reset_index().set_index(index_names)
return pd.concat(results, ignore_index=True)

def check(self, result: pd.DataFrame, criteria: list[Criterion]):
"""Check result w.r.t. reference data.
@@ -399,19 +395,24 @@ def check(self, result: pd.DataFrame, criteria: list[Criterion]):
criteria (list[Criterion]): Criteria to check results
"""
if result is None:
print("[Check] No results loaded. Skipping result checking.")
return

if self.reference_results is None:
print("No benchmark references loaded. Skipping result checking.")
print("[Check] No benchmark references loaded. Skipping result checking.")
return

for key, result_entry in result.iterrows():
if key not in self.reference_results.index:
print(f"No benchmark reference for {key} loaded. Skipping result checking.")
print(f"[Check] No benchmark reference for {key} loaded. Skipping result checking.")
continue
target_entry = self.reference_results.loc[key]
if isinstance(target_entry, pd.DataFrame):
target_entry = target_entry.iloc[0] # 1-row pd.DataFrame to pd.Series
# Match num_repeat of result and target
result_seed_average = result_entry["seed"]
result_num_repeat = 2 * result_seed_average + 1 # (0+1+2+3+4)/5 = 2.0 -> 2*2.0+1 = 5
target_entry = target_entry.query(f"seed < {result_num_repeat}")
target_entry = target_entry.mean(numeric_only=True) # N-row pd.DataFrame to pd.Series

for criterion in criteria:
criterion(result_entry, target_entry)
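Because results are averaged over seeds before checking, the number of repeats has to be reconstructed from the mean of the `seed` column: seeds run from 0 to N-1, so their mean is (N-1)/2 and N = 2 * mean + 1. The reference entries are then filtered to the same seed range so both sides average over an equal number of repeats. A quick numeric illustration:

```python
# Seeds 0..4 (5 repeats) average to 2.0; the check recovers the repeat count from that mean.
seeds = [0, 1, 2, 3, 4]
seed_mean = sum(seeds) / len(seeds)  # 2.0
num_repeat = 2 * seed_mean + 1       # 5.0 -> matches the original 5 repeats
print(seed_mean, num_repeat)
```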