From c4db3af79f2faad4b4ab78875303094128c630f8 Mon Sep 17 00:00:00 2001
From: Songki Choi
Date: Fri, 5 Apr 2024 09:13:28 +0900
Subject: [PATCH] Add action benchmark v2 (#3268)

* Add action benchmark test template
* Fix vsp perf test
* Fix act perf test
* Add action to perf benchmark workflow
* Fix raw data path
* Fix benchmark options
* Fix vsp dataset format
---
 .github/workflows/perf_benchmark.yaml    |   2 +
 tests/perf/benchmark.py                  |  22 ++-
 tests/perf/conftest.py                   |   2 +-
 tests/perf/test_action.py                | 182 +++++++++++++++++++++++
 tests/perf/test_classification.py        |  10 +-
 tests/perf/test_detection.py             |  39 +++--
 tests/perf/test_instance_segmentation.py |  65 +++++---
 tests/perf/test_visual_prompting.py      |  26 ++--
 8 files changed, 292 insertions(+), 56 deletions(-)
 create mode 100644 tests/perf/test_action.py

diff --git a/.github/workflows/perf_benchmark.yaml b/.github/workflows/perf_benchmark.yaml
index d7d13bb7e68..36d16f3812f 100644
--- a/.github/workflows/perf_benchmark.yaml
+++ b/.github/workflows/perf_benchmark.yaml
@@ -114,6 +114,8 @@ jobs:
             task: "semantic_segmentation"
           - task-short: "vsp"
             task: "visual_prompting"
+          - task-short: "act"
+            task: "action"
     name: Perf-Benchmark-${{ matrix.task-short }}
     runs-on: [self-hosted, linux, x64, dmount-v2]
     timeout-minutes: 8640
diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py
index d2810f4bd4c..a6538fdf38d 100644
--- a/tests/perf/benchmark.py
+++ b/tests/perf/benchmark.py
@@ -156,7 +156,7 @@ def run(
                 "--engine.device",
                 self.accelerator,
             ]
-            for key, value in dataset.extra_overrides.items():
+            for key, value in dataset.extra_overrides.get("train", {}).items():
                 command.append(f"--{key}")
                 command.append(str(value))
             command.extend(["--seed", str(seed)])
@@ -183,6 +183,9 @@ def run(
                 "--work_dir",
                 str(sub_work_dir),
             ]
+            for key, value in dataset.extra_overrides.get("test", {}).items():
+                command.append(f"--{key}")
+                command.append(str(value))
             self._run_command(command)
             self._rename_raw_data(
                 work_dir=sub_work_dir / ".latest" / "test",
@@ -198,6 +201,9 @@ def run(
                 "--work_dir",
                 str(sub_work_dir),
             ]
+            for key, value in dataset.extra_overrides.get("export", {}).items():
+                command.append(f"--{key}")
+                command.append(str(value))
             self._run_command(command)
 
             exported_model_path = sub_work_dir / ".latest" / "export" / "exported_model.xml"
@@ -214,6 +220,9 @@ def run(
                 "--work_dir",
                 str(sub_work_dir),
             ]
+            for key, value in dataset.extra_overrides.get("test", {}).items():
+                command.append(f"--{key}")
+                command.append(str(value))
             self._run_command(command)
 
             self._rename_raw_data(
@@ -235,6 +244,9 @@ def run(
                 "--work_dir",
                 str(sub_work_dir),
             ]
+            for key, value in dataset.extra_overrides.get("optimize", {}).items():
+                command.append(f"--{key}")
+                command.append(str(value))
             self._run_command(command)
 
             optimized_model_path = sub_work_dir / ".latest" / "optimize" / "optimized_model.xml"
@@ -252,6 +264,9 @@ def run(
                 "--work_dir",
                 str(sub_work_dir),
             ]
+            for key, value in dataset.extra_overrides.get("test", {}).items():
+                command.append(f"--{key}")
+                command.append(str(value))
             self._run_command(command)
 
             self._rename_raw_data(
@@ -267,9 +282,8 @@ def run(
         return self.average_result(result, keys=["task", "model", "data_group", "data"])
 
     def _run_command(self, command: list[str]) -> None:
-        if self.dry_run:
-            print(" ".join(command))
-        else:
+        print(" ".join(command))
+        if not self.dry_run:
             subprocess.run(command, check=True)  # noqa: S603
 
     def _log_metrics(
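Note on the benchmark.py changes above: extra_overrides is now a nested mapping keyed
by subcommand ("train", "test", "export", "optimize") rather than a flat dict, and
run() expands the per-subcommand entries into CLI flags at each stage. A minimal
standalone sketch of that expansion, assuming only the behavior visible in the diff
(the helper name expand_overrides is hypothetical, introduced here for illustration):

    from __future__ import annotations

    def expand_overrides(extra_overrides: dict[str, dict[str, str]], subcommand: str) -> list[str]:
        """Flatten per-subcommand overrides into `--key value` CLI arguments."""
        args: list[str] = []
        for key, value in extra_overrides.get(subcommand, {}).items():
            args.append(f"--{key}")
            args.append(str(value))  # values are already strings in the test cases
        return args

    # expand_overrides({"train": {"max_epochs": "10"}}, "train") -> ["--max_epochs", "10"]
    # expand_overrides({"train": {"max_epochs": "10"}}, "test")  -> []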
diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py
index 2953a9f7d70..ab50a09fa22 100644
--- a/tests/perf/conftest.py
+++ b/tests/perf/conftest.py
@@ -380,7 +380,7 @@ def fxt_benchmark_summary(
     print(summary_results)
     fxt_summary_csv.parent.mkdir(parents=True, exist_ok=True)
     summary_results.to_csv(fxt_summary_csv)
-    raw_results.to_csv(fxt_summary_csv.parent / "benchmark-raw.csv")
+    raw_results.to_csv(fxt_summary_csv.parent / "perf-benchmark-raw.csv")
     print(f" -> Saved to {fxt_summary_csv}.")
 
     if fxt_mlflow_client:
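The raw per-run results now land beside the summary CSV under the renamed file
perf-benchmark-raw.csv. A small sketch of the resulting path derivation (the helper
name raw_csv_path is hypothetical, for illustration only):

    from pathlib import Path

    def raw_csv_path(summary_csv: Path) -> Path:
        """Raw per-run results are written next to the benchmark summary CSV."""
        return summary_csv.parent / "perf-benchmark-raw.csv"

    # raw_csv_path(Path("out/benchmark-summary.csv")) -> Path("out/perf-benchmark-raw.csv")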
diff --git a/tests/perf/test_action.py b/tests/perf/test_action.py
new file mode 100644
index 00000000000..96c6595cbef
--- /dev/null
+++ b/tests/perf/test_action.py
@@ -0,0 +1,182 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""OTX action performance benchmark tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from .benchmark import Benchmark
+from .conftest import PerfTestBase
+
+
+class TestPerfActionClassification(PerfTestBase):
+    """Benchmark action classification."""
+
+    MODEL_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Model(task="action/action_classification", name="movinet", category="speed"),
+        Benchmark.Model(task="action/action_classification", name="x3d", category="accuracy"),
+    ]
+
+    DATASET_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Dataset(
+            name="ucf-5percent-small",
+            path=Path("action/action_classification/ucf_kinetics_5percent_small"),
+            group="small",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "10",
+                    "deterministic": "True",
+                },
+            },
+        ),
+        Benchmark.Dataset(
+            name="ucf-30percent-medium",
+            path=Path("action/action_classification/ucf_kinetics_30percent_medium"),
+            group="medium",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "10",
+                    "deterministic": "True",
+                },
+            },
+        ),
+        Benchmark.Dataset(
+            name="ucf-large",
+            path=Path("action/action_classification/ucf_kinetics_large"),
+            group="large",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "3",
+                    "deterministic": "True",
+                },
+            },
+        ),
+    ]
+
+    BENCHMARK_CRITERIA = [  # noqa: RUF012
+        Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+    ]
+
+    @pytest.mark.parametrize(
+        "fxt_model",
+        MODEL_TEST_CASES,
+        ids=lambda model: model.name,
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "fxt_dataset",
+        DATASET_TEST_CASES,
+        ids=lambda dataset: dataset.name,
+        indirect=True,
+    )
+    def test_perf(
+        self,
+        fxt_model: Benchmark.Model,
+        fxt_dataset: Benchmark.Dataset,
+        fxt_benchmark: Benchmark,
+    ):
+        self._test_perf(
+            model=fxt_model,
+            dataset=fxt_dataset,
+            benchmark=fxt_benchmark,
+            criteria=self.BENCHMARK_CRITERIA,
+        )
+
+
+class TestPerfActionDetection(PerfTestBase):
+    """Benchmark action detection."""
+
+    MODEL_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Model(task="action/action_detection", name="x3d_fastrcnn", category="accuracy"),
+    ]
+
+    DATASET_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Dataset(
+            name="ucf-5percent-small",
+            path=Path("action/action_detection/UCF101_ava_5percent"),
+            group="small",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "3",
+                    "deterministic": "True",
+                },
+            },
+        ),
+        Benchmark.Dataset(
+            name="ucf-30percent-medium",
+            path=Path("action/action_detection/UCF101_ava_30percent"),
+            group="medium",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "3",
+                    "deterministic": "True",
+                },
+            },
+        ),
+        Benchmark.Dataset(
+            name="ucf-large",
+            path=Path("action/action_detection/UCF101_ava"),
+            group="large",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "1",
+                    "deterministic": "True",
+                },
+            },
+        ),
+    ]
+
+    BENCHMARK_CRITERIA = [  # noqa: RUF012
+        Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/map_50", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/map_50", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/map_50", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+    ]
+
+    @pytest.mark.parametrize(
+        "fxt_model",
+        MODEL_TEST_CASES,
+        ids=lambda model: model.name,
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "fxt_dataset",
+        DATASET_TEST_CASES,
+        ids=lambda dataset: dataset.name,
+        indirect=True,
+    )
+    def test_perf(
+        self,
+        fxt_model: Benchmark.Model,
+        fxt_dataset: Benchmark.Dataset,
+        fxt_benchmark: Benchmark,
+    ):
+        self._test_perf(
+            model=fxt_model,
+            dataset=fxt_dataset,
+            benchmark=fxt_benchmark,
+            criteria=self.BENCHMARK_CRITERIA,
+        )
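The BENCHMARK_CRITERIA above pin per-stage metrics (accuracy and iteration time for
action classification, map_50 for action detection) with a 10% margin. One plausible
way such a criterion could be checked against a reference value is sketched below;
the margin semantics are an assumption for illustration and are not defined in this
patch:

    from dataclasses import dataclass

    @dataclass
    class Criterion:
        name: str      # e.g. "test/accuracy"
        summary: str   # statistic over repeats: "max" or "mean"
        compare: str   # "<" means lower is better, ">" means higher is better
        margin: float  # tolerated relative deviation from the reference value

        def passes(self, value: float, reference: float) -> bool:
            """Check a summarized metric against a reference within the margin."""
            if self.compare == "<":
                return value < reference * (1.0 + self.margin)
            if self.compare == ">":
                return value > reference * (1.0 - self.margin)
            return True

    # Criterion("test/accuracy", "max", ">", 0.1).passes(0.95, reference=1.0) -> True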
"otx.core.metrics.fmeasure.FMeasure", + }, }, ) for idx in (1, 2, 3) @@ -47,10 +52,15 @@ class TestPerfObjectDetection(PerfTestBase): group="medium", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ), Benchmark.Dataset( @@ -59,10 +69,15 @@ class TestPerfObjectDetection(PerfTestBase): group="large", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ), ] diff --git a/tests/perf/test_instance_segmentation.py b/tests/perf/test_instance_segmentation.py index 2711ea961f4..8d76683ad1d 100644 --- a/tests/perf/test_instance_segmentation.py +++ b/tests/perf/test_instance_segmentation.py @@ -29,10 +29,15 @@ class TestPerfInstanceSegmentation(PerfTestBase): group="small", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ) for idx in (1, 2, 3) @@ -43,10 +48,15 @@ class TestPerfInstanceSegmentation(PerfTestBase): group="medium", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ), Benchmark.Dataset( @@ -55,10 +65,15 @@ class TestPerfInstanceSegmentation(PerfTestBase): group="large", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ), ] @@ -118,10 +133,15 @@ class TestPerfTilingInstanceSegmentation(PerfTestBase): group="small", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ) for idx in (1, 2, 3) @@ -132,10 +152,15 @@ class TestPerfTilingInstanceSegmentation(PerfTestBase): 
group="medium", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ), # Add large dataset diff --git a/tests/perf/test_visual_prompting.py b/tests/perf/test_visual_prompting.py index 64da6d6e5aa..c2d0a2f4766 100644 --- a/tests/perf/test_visual_prompting.py +++ b/tests/perf/test_visual_prompting.py @@ -50,10 +50,10 @@ class TestPerfVisualPrompting(PerfTestBase): BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/dice", summary="max", compare=">", margin=0.1), Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), @@ -96,21 +96,25 @@ class TestPerfZeroShotVisualPrompting(PerfTestBase): DATASET_TEST_CASES = [ # noqa: RUF012 Benchmark.Dataset( - name="coco_car_person_medium_datumaro", - path=Path("zero_shot_visual_prompting/coco_car_person_medium_datumaro"), + name="coco_car_person_medium", + path=Path("zero_shot_visual_prompting/coco_car_person_medium"), group="medium", num_repeat=5, - extra_overrides={"max_epochs": "1"}, + extra_overrides={ + "train": { + "max_epochs": "1", + }, + }, ), ] BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/dice", summary="max", compare=">", margin=0.1), Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),