Experiments adjustments

eth-easl · Sep 25, 2024 · a41ef3d · a41ef3d
1 parent e16f05a
commit a41ef3d
Show file tree

Hide file tree

Showing 3 changed files with 93 additions and 95 deletions.
diff --git a/experiments/arxiv/compare_trigger_policies/run.py b/experiments/arxiv/compare_trigger_policies/run.py
@@ -161,6 +161,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     # -------------------------------------------------------------------------------- #
     #                                2X: Drift triggers                                #
     # -------------------------------------------------------------------------------- #
+    # 20: static thresholds are very hard to find, especially with such long timeline durations.
+    # We therefore focus on dynamic thresholds.
     # TODO
     # Dynamic threshold drift
     21: Experiment(

diff --git a/experiments/huffpost/compare_trigger_policies/run.py b/experiments/huffpost/compare_trigger_policies/run.py
@@ -158,6 +158,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     # -------------------------------------------------------------------------------- #
     #                                2X: Drift triggers                                #
     # -------------------------------------------------------------------------------- #
+    # 20: static thresholds are very hard to find, especially with such long timeline durations.
+    # We therefore focus on dynamic thresholds.
     21: Experiment(
         name="hp-datadrift-dynamic",
         eval_handlers=(

diff --git a/experiments/yearbook/compare_trigger_policies/run.py b/experiments/yearbook/compare_trigger_policies/run.py
@@ -25,12 +25,9 @@
     AvoidableMisclassificationCostTriggerConfig,
     DataIncorporationLatencyCostTriggerConfig,
 )
-from modyn.config.schema.pipeline.trigger.performance.criterion import StaticNumberAvoidableMisclassificationCriterion
 from modyn.config.schema.pipeline.trigger.performance.performance import (
-    PerformanceTriggerConfig,
     PerformanceTriggerEvaluationConfig,
 )
-from modyn.config.schema.pipeline.trigger.simple.data_amount import DataAmountTriggerConfig
 from modyn.utils.utils import SECONDS_PER_UNIT
 from modynclient.config.schema.client_config import ModynClientConfig, Supervisor
 
@@ -192,20 +189,18 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     #     gpu_device="cuda:1",
     # ),
     # # data amount baselines
-    11: Experiment(
-        name="yb-baseline-dataamount",
-        eval_handlers=(
-            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual") +
-            construct_between_trigger_eval_handler("manual")
-        ),
-        data_amount_triggers={
-            f"{num_samples}": DataAmountTriggerConfig(num_samples=num_samples)
-            # for num_samples in ([250, 500, 1_000, 2_500, 5_000, 10_000, 15_000, 30_000])
-            # for num_samples in ([1_000, 2_500, 5_000, 10_000])
-            for num_samples in ([250, 500, 15_000, 30_000])
-        },
-        gpu_device="cuda:2",
-    ),
+    # 11: Experiment(
+    #     name="yb-baseline-dataamount",
+    #     eval_handlers=(
+    #         construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual") +
+    #         construct_between_trigger_eval_handler("manual")
+    #     ),
+    #     data_amount_triggers={
+    #         f"{num_samples}": DataAmountTriggerConfig(num_samples=num_samples)
+    #         for num_samples in ([250, 500, 1_000, 2_500, 5_000, 10_000, 15_000, 30_000])
+    #     },
+    #     gpu_device="cuda:2",
+    # ),
     # -------------------------------------------------------------------------------- #
     #                                2X: Drift triggers                                #
     # -------------------------------------------------------------------------------- #
@@ -306,84 +301,83 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]:
     # # -------------------------------------------------------------------------------- #
     # #                             3X:  Performance triggers                            #
     # # -------------------------------------------------------------------------------- #
-    30: Experiment(
-        name="yb-performancetrigger",
-        eval_handlers=(
-            construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual")
-            + construct_between_trigger_eval_handler("manual")
-        ),
-        performance_triggers={
-            f"{criterion_name}-int{detection_interval}y": PerformanceTriggerConfig(
-                evaluation_interval_data_points=detection_interval,
-                data_density_window_size=20,  # performed well for drift, only used for #avoidable misclass
-                performance_triggers_window_size=20,  # performed well for drift, only used for #avoidable misclass
-                warmup_intervals=3500 // detection_interval,  # same as in drift case
-                warmup_policy=TimeTriggerConfig(every="3d", start_timestamp=_FIRST_TIMESTAMP),
-                evaluation=PerformanceTriggerEvaluationConfig(
-                    device="cuda:2",
-                    dataset=EvalDataConfig(
-                        dataset_id="yearbook_train",  # optional: extra holdout split
-                        bytes_parser_function=yb_bytes_parser_function,
-                        batch_size=512,
-                        dataloader_workers=1,
-                        metrics=[
-                            AccuracyMetricConfig(evaluation_transformer_function=yb_evaluation_transformer_function),
-                        ],
-                    ),
-                ),
-                mode="hindsight",
-                forecasting_method="ridge_regression",
-                decision_criteria={criterion_name: criterion},
-            )
-            # for detection_interval in [100, 250, 500]
-            for detection_interval in [250]  # Solid choice
-            for criterion_name, criterion in (
-                # {
-                #     f"static-{perf_threshold}": StaticPerformanceThresholdCriterion(
-                #         metric="Accuracy", metric_threshold=perf_threshold
-                #     )
-                #     for perf_threshold in [0.7, 0.75, 0.8, 0.85, 0.875, 0.9, 0.925, 0.95]
-                # }
-                # | {
-                #     f"dynamic-quant-{quantile}-{decision_window_size}": DynamicQuantilePerformanceThresholdCriterion(
-                #         metric="Accuracy",
-                #         quantile=quantile,
-                #         window_size=decision_window_size,
-                #     )
-                #     for quantile in [0.05, 0.15, 0.3]
-                #     for decision_window_size in [10, 20, 30]
-                # }
-                # |
-                # {  # TODO: not completed
-                #     f"dynamic-rollavg-{deviation}-{decision_window_size}": DynamicRollingAveragePerformanceThresholdCriterion(
-                #         metric="Accuracy",
-                #         deviation=deviation,
-                #         absolute=False,
-                #         window_size=decision_window_size,
-                #     )
-                #     for deviation in reversed([0.05, 0.1, 0.2, 0.3])
-                #     for decision_window_size in [10, 20, 30]
-                # }
-                # |
-                {
-                    f"num_misclass-{num_misclassifications}-exp-{expected_accuracy}-red-{allow_reduction}-": StaticNumberAvoidableMisclassificationCriterion(
-                        expected_accuracy=expected_accuracy,
-                        allow_reduction=allow_reduction,
-                        avoidable_misclassification_threshold=num_misclassifications,
-                    )
-                    # for num_misclassifications, expected_accuracy, allow_reduction in [
-                    #     (1500, 0.95, False),
-                    # ]
-                    # cuda1: 100, 200, 500, 1000
-                    # cuda2: 50
-                    for num_misclassifications in reversed([50])  # TODO: 50, 100, 200, 500, 1000, 1500
-                    for expected_accuracy in [0.85, 0.9, 0.95]
-                    for allow_reduction in [True, False]
-                }
-            ).items()
-        },
-        gpu_device="cuda:2",
-    ),
+    # 30: Experiment(
+    #     name="yb-performancetrigger",
+    #     eval_handlers=(
+    #         construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual")
+    #         + construct_between_trigger_eval_handler("manual")
+    #     ),
+    #     performance_triggers={
+    #         f"{criterion_name}-int{detection_interval}y": PerformanceTriggerConfig(
+    #             evaluation_interval_data_points=detection_interval,
+    #             data_density_window_size=20,  # performed well for drift, only used for #avoidable misclass
+    #             performance_triggers_window_size=20,  # performed well for drift, only used for #avoidable misclass
+    #             warmup_intervals=3500 // detection_interval,  # same as in drift case
+    #             warmup_policy=TimeTriggerConfig(every="3d", start_timestamp=_FIRST_TIMESTAMP),
+    #             evaluation=PerformanceTriggerEvaluationConfig(
+    #                 device="cuda:2",
+    #                 dataset=EvalDataConfig(
+    #                     dataset_id="yearbook_train",  # optional: extra holdout split
+    #                     bytes_parser_function=yb_bytes_parser_function,
+    #                     batch_size=512,
+    #                     dataloader_workers=1,
+    #                     metrics=[
+    #                         AccuracyMetricConfig(evaluation_transformer_function=yb_evaluation_transformer_function),
+    #                     ],
+    #                 ),
+    #             ),
+    #             mode="hindsight",
+    #             forecasting_method="ridge_regression",
+    #             decision_criteria={criterion_name: criterion},
+    #         )
+    #         # for detection_interval in [100, 250, 500]
+    #         for detection_interval in [250]  # Solid choice
+    #         for criterion_name, criterion in (
+    #             {
+    #                 f"static-{perf_threshold}": StaticPerformanceThresholdCriterion(
+    #                     metric="Accuracy", metric_threshold=perf_threshold
+    #                 )
+    #                 for perf_threshold in [0.7, 0.75, 0.8, 0.85, 0.875, 0.9, 0.925, 0.95]
+    #             }
+    #             | {
+    #                 f"dynamic-quant-{quantile}-{decision_window_size}": DynamicQuantilePerformanceThresholdCriterion(
+    #                     metric="Accuracy",
+    #                     quantile=quantile,
+    #                     window_size=decision_window_size,
+    #                 )
+    #                 for quantile in [0.05, 0.15, 0.3]
+    #                 for decision_window_size in [10, 20, 30]
+    #             }
+    #             |
+    #             {   # only executed for 250 and 500 detection intervals
+    #                 f"dynamic-rollavg-{deviation}-{decision_window_size}": DynamicRollingAveragePerformanceThresholdCriterion(
+    #                     metric="Accuracy",
+    #                     deviation=deviation,
+    #                     absolute=False,
+    #                     window_size=decision_window_size,
+    #                 )
+    #                 for deviation in reversed([0.05, 0.1, 0.2, 0.3])
+    #                 for decision_window_size in [10, 20, 30]
+    #             }
+    #             |
+    #             {
+    #                 # only executed for 250 detection interval
+    #                 f"num_misclass-{num_misclassifications}-exp-{expected_accuracy}-red-{allow_reduction}-": StaticNumberAvoidableMisclassificationCriterion(
+    #                     expected_accuracy=expected_accuracy,
+    #                     allow_reduction=allow_reduction,
+    #                     avoidable_misclassification_threshold=num_misclassifications,
+    #                 )
+    #                 # for num_misclassifications, expected_accuracy, allow_reduction in [
+    #                 #     (1500, 0.95, False),
+    #                 # ]
+    #                 for num_misclassifications in reversed([50, 100, 200, 500, 1000, 1500])
+    #                 for expected_accuracy in [0.85, 0.9, 0.95]
+    #                 for allow_reduction in [True, False]
+    #             }
+    #         ).items()
+    #     },
+    #     gpu_device="cuda:2",
+    # ),
     # -------------------------------------------------------------------------------- #
     #                              4X: Cost aware triggers                             #
     # -------------------------------------------------------------------------------- #