Support benchmark history summary v1 #3307

Merged
merged 5 commits on Apr 15, 2024
35 changes: 32 additions & 3 deletions .github/workflows/perf_benchmark.yaml
@@ -88,12 +88,13 @@ on:
Target OTX ref (tag / branch name / commit hash) on main repo to test. Defaults to the current branch.
`pip install otx[full]@https://github.com/openvinotoolkit/training_extensions.git@{otx_ref}` will be executed before run,
and reverted after run. Works only for v1.x assuming CLI compatibility.
default: __CURRENT_BRANCH_COMMIT__

# Declare default permissions as read only.
permissions: read-all

jobs:
Perf-Benchmark:
Perf-Benchmark-Run:
strategy:
fail-fast: false
matrix:
@@ -142,14 +143,42 @@ jobs:
--num-repeat ${{ inputs.num-repeat }}
--num-epoch ${{ inputs.num-epoch }}
--eval-upto ${{ inputs.eval-upto }}
--summary-csv .tox/perf-benchmark-summary.csv
--summary-file .tox/perf-benchmark-summary.xlsx
--mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }}
--user-name ${{ github.triggering_actor }}
--otx-ref ${{ inputs.otx-ref }}
- name: Upload test results
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-${{ matrix.task-short }}
path: .tox/perf-*.csv
path: .tox/perf-benchmark-*.*
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}

Perf-Benchmark-Summary:
needs: Perf-Benchmark-Run
runs-on: ubuntu-latest
steps:
- name: Download benchmark results
uses: actions/download-artifact@v4
with:
path: tests/perf/history/latest
- name: Checkout repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Install Python
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
with:
python-version: "3.10"
- name: Install dependencies
run: python -m pip install --upgrade pip pandas matplotlib nbconvert ipython ipykernel openpyxl
- name: Summarize benchmark results
run: |
python tests/perf/history/summary.py tests/perf/history ./perf-benchmark-summary --pattern "*raw*.csv"
jupyter nbconvert --execute --to html --no-input tests/perf/history/summary.ipynb --output-dir ./perf-benchmark-summary --output perf-benchmark-summary
- name: Upload benchmark summary
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-summary
path: perf-benchmark-summary
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}
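The new Perf-Benchmark-Summary job aggregates the per-task artifacts with tests/perf/history/summary.py and then renders tests/perf/history/summary.ipynb to an HTML report. For readers less familiar with the nbconvert CLI flags, the notebook-rendering half of that step corresponds roughly to the following Python sketch (paths mirror the workflow; this is an illustration, not part of the PR):

# Rough Python equivalent of the `jupyter nbconvert --execute --to html --no-input` call above.
from pathlib import Path

import nbformat
from nbconvert import HTMLExporter
from nbconvert.preprocessors import ExecutePreprocessor

nb = nbformat.read("tests/perf/history/summary.ipynb", as_version=4)
ExecutePreprocessor(timeout=600).preprocess(nb, {"metadata": {"path": "tests/perf/history"}})

exporter = HTMLExporter(exclude_input=True)  # --no-input: drop code cells from the report
body, _resources = exporter.from_notebook_node(nb)

out_dir = Path("perf-benchmark-summary")
out_dir.mkdir(parents=True, exist_ok=True)
(out_dir / "perf-benchmark-summary.html").write_text(body, encoding="utf-8")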
1 change: 1 addition & 0 deletions requirements/dev.txt
@@ -12,3 +12,4 @@ pytest-csv==3.0.*
tox==4.11.*
mlflow==2.10.2
py-cpuinfo==9.0.0
openpyxl==3.1.2
143 changes: 0 additions & 143 deletions tests/perf/benchmark-reference.csv

This file was deleted.

54 changes: 21 additions & 33 deletions tests/perf/benchmark.py
@@ -11,6 +11,7 @@
import subprocess # nosec B404
import yaml
from pathlib import Path
from .history import summary


class Benchmark:
@@ -107,6 +108,7 @@ def run(
subprocess.run(cmd, check=True)
# Load result
result = self.load_result(cfg_dir)
result = summary.average(result, ["task", "model", "data_group", "data"])
return result

@staticmethod
@@ -145,35 +147,7 @@ def load_result(result_path: str) -> pd.DataFrame | None:
if "train_e2e_time" in data:
data["train_e2e_time"] = pd.to_timedelta(data["train_e2e_time"]).dt.total_seconds() # H:M:S str -> seconds
data = data.rename(columns={"repeat": "seed"})
return data.set_index(["task", "model", "data_group", "data"])

@staticmethod
def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
"""Average result w.r.t. given keys
Args:
result (pd.DataFrame): Result data frame
keys (list[str]): Keys to summarize whole data
Returns:
pd.DataFrame: Averaged result table
"""
# Flatten index
index_names = data.index.names
column_names = data.columns
data = data.reset_index()
# Average by keys
grouped = data.groupby(keys)
aggregated = grouped.mean(numeric_only=True)
# Merge index columns
idx_columns = set(index_names) - set(keys)
for col in idx_columns:
aggregated[col] = "all"
# Merge tag columns (non-numeric & non-index)
tag_columns = set(column_names) - set(aggregated.columns) - set(keys)
for col in tag_columns:
# Take common string prefix such as: ["data/1", "data/2", "data/3"] -> "data/"
aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
# Recover index
return aggregated.reset_index().set_index(index_names)
return data
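The tests/perf/history/summary.py module referenced here is not part of this diff. Assuming summary.average keeps the semantics of the removed Benchmark.average_result above (numeric mean per group, common string prefix for leftover tag columns), a minimal sketch would be:

# Hypothetical sketch of summary.average, mirroring the removed Benchmark.average_result.
import os

import pandas as pd


def average(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
    """Average numeric columns per group defined by `keys`."""
    grouped = data.groupby(keys)
    aggregated = grouped.mean(numeric_only=True)
    # Collapse non-numeric, non-key columns to their common string prefix,
    # e.g. ["data/1", "data/2", "data/3"] -> "data/".
    tag_columns = set(data.columns) - set(aggregated.columns) - set(keys)
    for col in tag_columns:
        aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
    return aggregated.reset_index()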

def _build_config(
self,
@@ -189,7 +163,7 @@ def _build_config(

cfg = {}
cfg["tags"] = all_tags # metadata
cfg["output_path"] = os.path.abspath(self.output_root)
cfg["output_path"] = os.path.abspath(f"{self.output_root}/{model_id}")
cfg["constants"] = {
"dataroot": os.path.abspath(self.data_root),
}
@@ -246,30 +220,44 @@ def check(self, result: pd.DataFrame, criteria: list[dict]):
criteria (list[dict]): Criteria to check results
"""
if result is None:
print("[Check] No results loaded. Skipping result checking.")
return

if self.reference_results is None:
print("No benchmark references loaded. Skipping result checking.")
print("[Check] No benchmark references loaded. Skipping result checking.")
return

result = result.set_index(["task", "model", "data_group", "data"])

for key, result_entry in result.iterrows():
if key not in self.reference_results.index:
print(f"No benchmark reference for {key} loaded. Skipping result checking.")
print(f"[Check] No benchmark reference for {key} loaded. Skipping result checking.")
continue
target_entry = self.reference_results.loc[key]
if isinstance(target_entry, pd.DataFrame):
target_entry = target_entry.iloc[0] # 1-row pd.DataFrame to pd.Series
# Match num_repeat & seeds of result and target
result_seed_average = result_entry["seed"]
result_num_repeat = 2 * result_seed_average + 1 # (0+1+2+3+4)/5 = 2.0 -> 2*2.0+1 = 5
target_entry = target_entry.query(f"seed < {result_num_repeat}")
target_entry = target_entry.mean(numeric_only=True) # N-row pd.DataFrame to pd.Series

def compare(name: str, op: str, margin: float):
if name not in result_entry or result_entry[name] is None or np.isnan(result_entry[name]):
print(f"[Check] {name} not in result")
return
if name not in target_entry or target_entry[name] is None or np.isnan(target_entry[name]):
print(f"[Check] {name} not in target")
return
if op == "==":
print(
f"[Check] abs({name}:{result_entry[name]} - {name}:{target_entry[name]}) < {name}:{target_entry[name]} * {margin}",
)
assert abs(result_entry[name] - target_entry[name]) < target_entry[name] * margin
elif op == "<":
print(f"[Check] {name}:{result_entry[name]} < {name}:{target_entry[name]} * (1.0 + {margin})")
assert result_entry[name] < target_entry[name] * (1.0 + margin)
elif op == ">":
print(f"[Check] {name}:{result_entry[name]} > {name}:{target_entry[name]} * (1.0 - {margin})")
assert result_entry[name] > target_entry[name] * (1.0 - margin)

for criterion in criteria:
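As a quick illustration of the tolerance checks logged above (toy numbers, not real benchmark output), a reference value of 0.80 with a margin of 0.05 behaves as follows:

# Toy walk-through of compare()'s margin logic with hypothetical numbers.
target, margin = 0.80, 0.05
result = 0.78

assert abs(result - target) < target * margin   # "==" : |0.78 - 0.80| = 0.02 < 0.04
assert result < target * (1.0 + margin)         # "<"  : 0.78 < 0.84
assert result > target * (1.0 - margin)         # ">"  : 0.78 > 0.76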
54 changes: 29 additions & 25 deletions tests/perf/conftest.py
@@ -20,6 +20,7 @@
import yaml

from .benchmark import Benchmark
from .history import summary


def pytest_addoption(parser):
@@ -72,9 +73,9 @@ def pytest_addoption(parser):
help="Output root directory. Defaults to temp directory.",
)
parser.addoption(
"--summary-csv",
"--summary-file",
action="store",
help="Path to output summary cvs file. Defaults to {output-root}/benchmark-summary.csv",
help="Path to output summary file. Defaults to {output-root}/benchmark-summary.csv",
)
parser.addoption(
"--dry-run",
@@ -248,12 +249,15 @@ def fxt_benchmark(


def _log_benchmark_results_to_mlflow(results: pd.DataFrame, tags: dict[str, str], client: MlflowClient):
results = summary.normalize(results) # Standardize names for comparison
results = summary.average(results, keys=["task", "model", "data_group", "data"]) # Average out seeds
results = results.set_index(["task", "data_group", "data"])
for index, result in results.iterrows():
task, model, data_group, data = index
exp_name = f"[Benchmark] {task} | {model} | {data_group} | {data}"
task, data_group, data = index
model = result["model"]
exp_name = f"[Benchmark] {task} | {data_group} | {data}"
exp_tags = {
"task": task,
"model": model,
"data_group": data_group,
"data": data,
}
@@ -264,7 +268,7 @@ def _log_benchmark_results_to_mlflow(results: pd.DataFrame, tags: dict[str, str]
exp_id = exp.experiment_id
if exp.lifecycle_stage != "active":
client.restore_experiment(exp_id)
run_name = f"[{tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_name = f"[{model}] {tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_tags = {k: v for k, v in result.items() if isinstance(v, str)}
run_tags.update(**exp_tags, **tags)
run = client.create_run(exp_id, run_name=run_name, tags=run_tags)
@@ -282,32 +286,32 @@ def fxt_benchmark_summary(
):
"""Summarize all results at the end of test session."""
yield

raw_results = Benchmark.load_result(fxt_output_root)
if raw_results is None:
if raw_results is None or len(raw_results) == 0:
print("No benchmark results loaded in ", fxt_output_root)
return

print("=" * 20, "[Benchmark summary]")
summary_results = [
Benchmark.average_result(raw_results, ["task", "model", "data_group", "data"]),
Benchmark.average_result(raw_results, ["task", "model", "data_group"]),
Benchmark.average_result(raw_results, ["task", "model"]),
Benchmark.average_result(raw_results, ["task"]),
]
summary_results = pd.concat(summary_results)
summary_results = summary.summarize(raw_results)

print("=" * 20, "[Benchmark summary]")
print(summary_results)

summary_csv = request.config.getoption("--summary-csv")
if not summary_csv:
summary_csv = fxt_output_root / "perf-benchmark-summary.csv"
summary_file = request.config.getoption("--summary-file")
if not summary_file:
summary_file = fxt_output_root / "perf-benchmark-summary.csv"
else:
summary_file = Path(summary_file)
summary_file.parent.mkdir(parents=True, exist_ok=True)
raw_results.to_csv(summary_file.parent / "perf-benchmark-raw.csv", index=False)
if summary_file.suffix == ".xlsx":
summary_results.to_excel(summary_file)
else:
summary_csv = Path(summary_csv)
summary_csv.parent.mkdir(parents=True, exist_ok=True)
summary_results.to_csv(summary_csv)
raw_results.to_csv(summary_csv.parent / "perf-benchmark-raw.csv")
print(f" -> Saved to {summary_csv}.")
if summary_file.suffix != ".csv":
print(f"{summary_file.suffix} output is not supported.")
summary_file = summary_file.with_suffix(".csv")
summary_results.to_csv(summary_file)
print(f" -> Saved to {summary_file}.")

if fxt_mlflow_client is None:
print(
@@ -320,7 +324,7 @@ def fxt_benchmark_summary(
# test_branch = fxt_tags["test_branch"]
# if test_branch == "develop" or bool(re.match("^releases/[0-9]+\.[0-9]+\.[0-9]+$", test_branch)):
try:
_log_benchmark_results_to_mlflow(summary_results, fxt_tags, fxt_mlflow_client)
_log_benchmark_results_to_mlflow(raw_results, fxt_tags, fxt_mlflow_client)
except Exception as e:
print("MLFlow loging failed: ", e)

@@ -331,7 +335,7 @@ def fxt_benchmark_summary(
@pytest.fixture(scope="session")
def fxt_benchmark_reference() -> pd.DataFrame | None:
"""Load reference benchmark results with index."""
ref = pd.read_csv(Path(__file__).parent.resolve() / "benchmark-reference.csv")
ref = summary.load(Path(__file__).parent.resolve() / "history/v1.5.2")
if ref is not None:
ref = ref.set_index(["task", "model", "data_group", "data"])
return ref
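summary.load is likewise only referenced in this diff. Given that it is pointed at a directory of historical result files and may return None, a plausible minimal sketch (an assumption, not the actual implementation) is:

# Hypothetical sketch of summary.load: concatenate every CSV found under `path`,
# returning None when the directory holds no results.
from pathlib import Path

import pandas as pd


def load(path: Path, pattern: str = "*.csv") -> pd.DataFrame | None:
    csv_files = sorted(Path(path).rglob(pattern))
    if not csv_files:
        return None
    return pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)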
4 changes: 4 additions & 0 deletions tests/perf/history/__init__.py
@@ -0,0 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchmark history."""