From c4db3af79f2faad4b4ab78875303094128c630f8 Mon Sep 17 00:00:00 2001
From: Songki Choi
Date: Fri, 5 Apr 2024 09:13:28 +0900
Subject: [PATCH] Add action benchmark v2 (#3268)

* Add action benchmark test template
* Fix vsp perf test
* Fix act perf test
* Add action to perf benchmark workflow
* Fix raw data path
* Fix benchmark options
* Fix vsp dataset format
---
 .github/workflows/perf_benchmark.yaml    |   2 +
 tests/perf/benchmark.py                  |  22 ++-
 tests/perf/conftest.py                   |   2 +-
 tests/perf/test_action.py                | 182 +++++++++++++++++++++++
 tests/perf/test_classification.py        |  10 +-
 tests/perf/test_detection.py             |  39 +++--
 tests/perf/test_instance_segmentation.py |  65 +++++---
 tests/perf/test_visual_prompting.py      |  26 ++--
 8 files changed, 292 insertions(+), 56 deletions(-)
 create mode 100644 tests/perf/test_action.py

diff --git a/.github/workflows/perf_benchmark.yaml b/.github/workflows/perf_benchmark.yaml
index d7d13bb7e68..36d16f3812f 100644
--- a/.github/workflows/perf_benchmark.yaml
+++ b/.github/workflows/perf_benchmark.yaml
@@ -114,6 +114,8 @@ jobs:
             task: "semantic_segmentation"
           - task-short: "vsp"
             task: "visual_prompting"
+          - task-short: "act"
+            task: "action"
     name: Perf-Benchmark-${{ matrix.task-short }}
     runs-on: [self-hosted, linux, x64, dmount-v2]
     timeout-minutes: 8640
diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py
index d2810f4bd4c..a6538fdf38d 100644
--- a/tests/perf/benchmark.py
+++ b/tests/perf/benchmark.py
@@ -156,7 +156,7 @@ def run(
                 "--engine.device",
                 self.accelerator,
             ]
-            for key, value in dataset.extra_overrides.items():
+            for key, value in dataset.extra_overrides.get("train", {}).items():
                 command.append(f"--{key}")
                 command.append(str(value))
             command.extend(["--seed", str(seed)])
@@ -183,6 +183,9 @@ def run(
                 "--work_dir",
                 str(sub_work_dir),
             ]
+            for key, value in dataset.extra_overrides.get("test", {}).items():
+                command.append(f"--{key}")
+                command.append(str(value))
             self._run_command(command)
             self._rename_raw_data(
                 work_dir=sub_work_dir / ".latest" / "test",
@@ -198,6 +201,9 @@ def run(
                 "--work_dir",
                 str(sub_work_dir),
             ]
+            for key, value in dataset.extra_overrides.get("export", {}).items():
+                command.append(f"--{key}")
+                command.append(str(value))
             self._run_command(command)
 
             exported_model_path = sub_work_dir / ".latest" / "export" / "exported_model.xml"
@@ -214,6 +220,9 @@ def run(
                 "--work_dir",
                 str(sub_work_dir),
             ]
+            for key, value in dataset.extra_overrides.get("test", {}).items():
+                command.append(f"--{key}")
+                command.append(str(value))
             self._run_command(command)
 
             self._rename_raw_data(
@@ -235,6 +244,9 @@ def run(
                 "--work_dir",
                 str(sub_work_dir),
             ]
+            for key, value in dataset.extra_overrides.get("optimize", {}).items():
+                command.append(f"--{key}")
+                command.append(str(value))
             self._run_command(command)
 
             optimized_model_path = sub_work_dir / ".latest" / "optimize" / "optimized_model.xml"
@@ -252,6 +264,9 @@ def run(
                 "--work_dir",
                 str(sub_work_dir),
             ]
+            for key, value in dataset.extra_overrides.get("test", {}).items():
+                command.append(f"--{key}")
+                command.append(str(value))
             self._run_command(command)
 
             self._rename_raw_data(
@@ -267,9 +282,8 @@ def run(
         return self.average_result(result, keys=["task", "model", "data_group", "data"])
 
     def _run_command(self, command: list[str]) -> None:
-        if self.dry_run:
-            print(" ".join(command))
-        else:
+        print(" ".join(command))
+        if not self.dry_run:
             subprocess.run(command, check=True)  # noqa: S603
 
     def _log_metrics(
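Note on the benchmark.py changes above: extra_overrides is now a nested mapping keyed
by subcommand ("train", "test", "export", "optimize") rather than a flat dict, and
run() expands the per-subcommand entries into CLI flags at each stage. A minimal
standalone sketch of that expansion, assuming only the behavior visible in the diff
(the helper name expand_overrides is hypothetical, introduced here for illustration):

    from __future__ import annotations

    def expand_overrides(extra_overrides: dict[str, dict[str, str]], subcommand: str) -> list[str]:
        """Flatten per-subcommand overrides into `--key value` CLI arguments."""
        args: list[str] = []
        for key, value in extra_overrides.get(subcommand, {}).items():
            args.append(f"--{key}")
            args.append(str(value))  # values are already strings in the test cases
        return args

    # expand_overrides({"train": {"max_epochs": "10"}}, "train") -> ["--max_epochs", "10"]
    # expand_overrides({"train": {"max_epochs": "10"}}, "test")  -> []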
diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py
index 2953a9f7d70..ab50a09fa22 100644
--- a/tests/perf/conftest.py
+++ b/tests/perf/conftest.py
@@ -380,7 +380,7 @@ def fxt_benchmark_summary(
     print(summary_results)
     fxt_summary_csv.parent.mkdir(parents=True, exist_ok=True)
     summary_results.to_csv(fxt_summary_csv)
-    raw_results.to_csv(fxt_summary_csv.parent / "benchmark-raw.csv")
+    raw_results.to_csv(fxt_summary_csv.parent / "perf-benchmark-raw.csv")
     print(f" -> Saved to {fxt_summary_csv}.")
 
     if fxt_mlflow_client:
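The raw per-run results now land beside the summary CSV under the renamed file
perf-benchmark-raw.csv. A small sketch of the resulting path derivation (the helper
name raw_csv_path is hypothetical, for illustration only):

    from pathlib import Path

    def raw_csv_path(summary_csv: Path) -> Path:
        """Raw per-run results are written next to the benchmark summary CSV."""
        return summary_csv.parent / "perf-benchmark-raw.csv"

    # raw_csv_path(Path("out/benchmark-summary.csv")) -> Path("out/perf-benchmark-raw.csv")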
diff --git a/tests/perf/test_action.py b/tests/perf/test_action.py
new file mode 100644
index 00000000000..96c6595cbef
--- /dev/null
+++ b/tests/perf/test_action.py
@@ -0,0 +1,182 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""OTX action performance benchmark tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from .benchmark import Benchmark
+from .conftest import PerfTestBase
+
+
+class TestPerfActionClassification(PerfTestBase):
+    """Benchmark action classification."""
+
+    MODEL_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Model(task="action/action_classification", name="movinet", category="speed"),
+        Benchmark.Model(task="action/action_classification", name="x3d", category="accuracy"),
+    ]
+
+    DATASET_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Dataset(
+            name="ucf-5percent-small",
+            path=Path("action/action_classification/ucf_kinetics_5percent_small"),
+            group="small",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "10",
+                    "deterministic": "True",
+                },
+            },
+        ),
+        Benchmark.Dataset(
+            name="ucf-30percent-medium",
+            path=Path("action/action_classification/ucf_kinetics_30percent_medium"),
+            group="medium",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "10",
+                    "deterministic": "True",
+                },
+            },
+        ),
+        Benchmark.Dataset(
+            name="ucf-large",
+            path=Path("action/action_classification/ucf_kinetics_large"),
+            group="large",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "3",
+                    "deterministic": "True",
+                },
+            },
+        ),
+    ]
+
+    BENCHMARK_CRITERIA = [  # noqa: RUF012
+        Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+    ]
+
+    @pytest.mark.parametrize(
+        "fxt_model",
+        MODEL_TEST_CASES,
+        ids=lambda model: model.name,
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "fxt_dataset",
+        DATASET_TEST_CASES,
+        ids=lambda dataset: dataset.name,
+        indirect=True,
+    )
+    def test_perf(
+        self,
+        fxt_model: Benchmark.Model,
+        fxt_dataset: Benchmark.Dataset,
+        fxt_benchmark: Benchmark,
+    ):
+        self._test_perf(
+            model=fxt_model,
+            dataset=fxt_dataset,
+            benchmark=fxt_benchmark,
+            criteria=self.BENCHMARK_CRITERIA,
+        )
+
+
+class TestPerfActionDetection(PerfTestBase):
+    """Benchmark action detection."""
+
+    MODEL_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Model(task="action/action_detection", name="x3d_fastrcnn", category="accuracy"),
+    ]
+
+    DATASET_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Dataset(
+            name="ucf-5percent-small",
+            path=Path("action/action_detection/UCF101_ava_5percent"),
+            group="small",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "3",
+                    "deterministic": "True",
+                },
+            },
+        ),
+        Benchmark.Dataset(
+            name="ucf-30percent-medium",
+            path=Path("action/action_detection/UCF101_ava_30percent"),
+            group="medium",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "3",
+                    "deterministic": "True",
+                },
+            },
+        ),
+        Benchmark.Dataset(
+            name="ucf-large",
+            path=Path("action/action_detection/UCF101_ava"),
+            group="large",
+            num_repeat=5,
+            extra_overrides={
+                "train": {
+                    "max_epochs": "1",
+                    "deterministic": "True",
+                },
+            },
+        ),
+    ]
+
+    BENCHMARK_CRITERIA = [  # noqa: RUF012
+        Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/map_50", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/map_50", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/map_50", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+    ]
+
+    @pytest.mark.parametrize(
+        "fxt_model",
+        MODEL_TEST_CASES,
+        ids=lambda model: model.name,
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "fxt_dataset",
+        DATASET_TEST_CASES,
+        ids=lambda dataset: dataset.name,
+        indirect=True,
+    )
+    def test_perf(
+        self,
+        fxt_model: Benchmark.Model,
+        fxt_dataset: Benchmark.Dataset,
+        fxt_benchmark: Benchmark,
+    ):
+        self._test_perf(
+            model=fxt_model,
+            dataset=fxt_dataset,
+            benchmark=fxt_benchmark,
+            criteria=self.BENCHMARK_CRITERIA,
+        )
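The BENCHMARK_CRITERIA above pin per-stage metrics (accuracy and iteration time for
action classification, map_50 for action detection) with a 10% margin. One plausible
way such a criterion could be checked against a reference value is sketched below;
the margin semantics are an assumption for illustration and are not defined in this
patch:

    from dataclasses import dataclass

    @dataclass
    class Criterion:
        name: str      # e.g. "test/accuracy"
        summary: str   # statistic over repeats: "max" or "mean"
        compare: str   # "<" means lower is better, ">" means higher is better
        margin: float  # tolerated relative deviation from the reference value

        def passes(self, value: float, reference: float) -> bool:
            """Check a summarized metric against a reference within the margin."""
            if self.compare == "<":
                return value < reference * (1.0 + self.margin)
            if self.compare == ">":
                return value > reference * (1.0 - self.margin)
            return True

    # Criterion("test/accuracy", "max", ">", 0.1).passes(0.95, reference=1.0) -> True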
"otx.core.metrics.fmeasure.FMeasure", + }, }, ) for idx in (1, 2, 3) @@ -47,10 +52,15 @@ class TestPerfObjectDetection(PerfTestBase): group="medium", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ), Benchmark.Dataset( @@ -59,10 +69,15 @@ class TestPerfObjectDetection(PerfTestBase): group="large", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ), ] diff --git a/tests/perf/test_instance_segmentation.py b/tests/perf/test_instance_segmentation.py index 2711ea961f4..8d76683ad1d 100644 --- a/tests/perf/test_instance_segmentation.py +++ b/tests/perf/test_instance_segmentation.py @@ -29,10 +29,15 @@ class TestPerfInstanceSegmentation(PerfTestBase): group="small", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ) for idx in (1, 2, 3) @@ -43,10 +48,15 @@ class TestPerfInstanceSegmentation(PerfTestBase): group="medium", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ), Benchmark.Dataset( @@ -55,10 +65,15 @@ class TestPerfInstanceSegmentation(PerfTestBase): group="large", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ), ] @@ -118,10 +133,15 @@ class TestPerfTilingInstanceSegmentation(PerfTestBase): group="small", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ) for idx in (1, 2, 3) @@ -132,10 +152,15 @@ class TestPerfTilingInstanceSegmentation(PerfTestBase): 
group="medium", num_repeat=5, extra_overrides={ - "deterministic": "True", - "metric": "otx.core.metrics.fmeasure.FMeasure", - "callback_monitor": "val/f1-score", - "scheduler.monitor": "val/f1-score", + "train": { + "deterministic": "True", + "metric": "otx.core.metrics.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + "test": { + "metric": "otx.core.metrics.fmeasure.FMeasure", + }, }, ), # Add large dataset diff --git a/tests/perf/test_visual_prompting.py b/tests/perf/test_visual_prompting.py index 64da6d6e5aa..c2d0a2f4766 100644 --- a/tests/perf/test_visual_prompting.py +++ b/tests/perf/test_visual_prompting.py @@ -50,10 +50,10 @@ class TestPerfVisualPrompting(PerfTestBase): BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/dice", summary="max", compare=">", margin=0.1), Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), @@ -96,21 +96,25 @@ class TestPerfZeroShotVisualPrompting(PerfTestBase): DATASET_TEST_CASES = [ # noqa: RUF012 Benchmark.Dataset( - name="coco_car_person_medium_datumaro", - path=Path("zero_shot_visual_prompting/coco_car_person_medium_datumaro"), + name="coco_car_person_medium", + path=Path("zero_shot_visual_prompting/coco_car_person_medium"), group="medium", num_repeat=5, - extra_overrides={"max_epochs": "1"}, + extra_overrides={ + "train": { + "max_epochs": "1", + }, + }, ), ] BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/Dice", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/dice", summary="max", compare=">", margin=0.1), Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),