diff --git a/experiments/arxiv/compare_trigger_policies/run.py b/experiments/arxiv/compare_trigger_policies/run.py index bbc4f984e..429c4018e 100644 --- a/experiments/arxiv/compare_trigger_policies/run.py +++ b/experiments/arxiv/compare_trigger_policies/run.py @@ -28,14 +28,20 @@ DynamicRollingAverageThresholdCriterion, ) from modyn.config.schema.pipeline.trigger.drift.detection_window.time_ import TimeWindowingStrategy -from modyn.config.schema.pipeline.trigger.performance.criterion import StaticNumberAvoidableMisclassificationCriterion -from modyn.config.schema.pipeline.trigger.performance.performance import PerformanceTriggerConfig, PerformanceTriggerEvaluationConfig -from modyn.config.schema.pipeline.trigger.simple.data_amount import DataAmountTriggerConfig +from modyn.config.schema.pipeline.trigger.performance.criterion import ( + DynamicQuantilePerformanceThresholdCriterion, + DynamicRollingAveragePerformanceThresholdCriterion, + StaticNumberAvoidableMisclassificationCriterion, + StaticPerformanceThresholdCriterion, +) +from modyn.config.schema.pipeline.trigger.performance.performance import ( + PerformanceTriggerConfig, + PerformanceTriggerEvaluationConfig, +) from modyn.config.schema.pipeline.trigger.simple.time import TimeTriggerConfig from modyn.utils.utils import SECONDS_PER_UNIT from modynclient.config.schema.client_config import ModynClientConfig, Supervisor - from .pipeline_config import ( arxiv_bytes_parser_function, arxiv_evaluation_transformer_function, @@ -220,7 +226,7 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]: + construct_between_trigger_eval_handler("manual") ), performance_triggers={ - f"{criterion_name}-int{detection_interval}y": PerformanceTriggerConfig( + f"{criterion_name}-int{detection_interval}": PerformanceTriggerConfig( evaluation_interval_data_points=detection_interval, data_density_window_size=20, # performed well for drift, only used for #avoidable misclass performance_triggers_window_size=20, # performed well for drift, only used for #avoidable misclass @@ -228,7 +234,7 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]: # triggering every 3 years during the warmup phase seems reasonable. warmup_policy=TimeTriggerConfig(every="2y", start_timestamp=_FIRST_TIMESTAMP), evaluation=PerformanceTriggerEvaluationConfig( - device="cuda:2", + device="cuda:3", dataset=EvalDataConfig( dataset_id="arxiv_kaggle_train", # optional: extra holdout split bytes_parser_function=arxiv_bytes_parser_function, @@ -245,11 +251,12 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]: ) for detection_interval in [20_000] for criterion_name, criterion in ( + # peak accuracy 0.6-0.65 # { # f"static-{perf_threshold}": StaticPerformanceThresholdCriterion( # metric="Accuracy", metric_threshold=perf_threshold # ) - # for perf_threshold in [0.45, 0.5, 0.55, 0.6] + # for perf_threshold in [0.45, 0.5, 0.55] # 0.6 --> too many triggers # } # | # { @@ -258,8 +265,8 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]: # quantile=quantile, # window_size=decision_window_size, # ) - # for quantile in [0.05, 0.15, 0.3] - # for decision_window_size in [15, 30] + # for quantile in [0.05, 0.15] + # for decision_window_size in [20] # } # | # { @@ -270,22 +277,22 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]: # window_size=decision_window_size, # ) # for deviation in reversed([0.1, 0.2, 0.3]) - # for decision_window_size in [15, 30] + # for decision_window_size in [20] # } # | - { - f"num_misclass-{num_misclassifications}-exp-{expected_accuracy}-red-{allow_reduction}-": StaticNumberAvoidableMisclassificationCriterion( - expected_accuracy=expected_accuracy, - allow_reduction=allow_reduction, - avoidable_misclassification_threshold=num_misclassifications, - ) - for num_misclassifications in reversed([10000]) # 1000, 2000, 5000, 7500, 10000 - for expected_accuracy in [0.5, 0.55, 0.6] - for allow_reduction in [False] - } + # { + # f"num_misclass-{num_misclassifications}-exp-{expected_accuracy}-red-{allow_reduction}-": StaticNumberAvoidableMisclassificationCriterion( + # expected_accuracy=expected_accuracy, + # allow_reduction=allow_reduction, + # avoidable_misclassification_threshold=num_misclassifications, + # ) + # for num_misclassifications in reversed([10_000, 15_000, 30_000, 50_000, 100_000]) + # for expected_accuracy in [0.6] + # for allow_reduction in [False] + # } ).items() }, - gpu_device="cuda:2", + gpu_device="cuda:3", ), } diff --git a/experiments/huffpost/compare_trigger_policies/run.py b/experiments/huffpost/compare_trigger_policies/run.py index 11e4ec8f6..0d3358dc7 100644 --- a/experiments/huffpost/compare_trigger_policies/run.py +++ b/experiments/huffpost/compare_trigger_policies/run.py @@ -24,6 +24,7 @@ from modyn.config.schema.pipeline.trigger.drift.config import DataDriftTriggerConfig from modyn.config.schema.pipeline.trigger.drift.criterion import ( DynamicQuantileThresholdCriterion, + DynamicRollingAverageThresholdCriterion, ) from modyn.config.schema.pipeline.trigger.drift.detection_window.time_ import TimeWindowingStrategy from modyn.config.schema.pipeline.trigger.performance.criterion import ( @@ -33,6 +34,7 @@ PerformanceTriggerConfig, PerformanceTriggerEvaluationConfig, ) +from modyn.config.schema.pipeline.trigger.simple.data_amount import DataAmountTriggerConfig from modyn.config.schema.pipeline.trigger.simple.time import TimeTriggerConfig from modynclient.config.schema.client_config import ModynClientConfig, Supervisor @@ -141,29 +143,26 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]: # gpu_device="cuda:2", # ), # # data amount baselines - # 11: Experiment( - # name="hp-baseline-dataamount", - # eval_handlers=( - # construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual") - # + construct_between_trigger_eval_handler("manual") - # ), - # data_amount_triggers={ - # f"{num_samples}": DataAmountTriggerConfig(num_samples=num_samples) - # for num_samples in ([5_000, 80_000]) - # # 2: 10_000, 20_000, 40_000 - # # 3: 5_000, 80_000 - # }, - # gpu_device="cuda:3", - # ), + 11: Experiment( + name="hp-baseline-dataamount", + eval_handlers=( + construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual") + + construct_between_trigger_eval_handler("manual") + ), + data_amount_triggers={ + f"{num_samples}": DataAmountTriggerConfig(num_samples=num_samples) + for num_samples in ([15_000, 30_000]) # 5_000, 10_000, 15_000, 20_000, 30_000, 40_000, 80_000 + }, + gpu_device="cuda:2", + ), # -------------------------------------------------------------------------------- # # 2X: Drift triggers # # -------------------------------------------------------------------------------- # - # TODO: rerun huffpost with different eval set 21: Experiment( name="hp-datadrift-dynamic", eval_handlers=( construct_periodic_eval_handlers(intervals=PERIODIC_EVAL_INTERVAL, execution_time="manual") - + construct_between_trigger_eval_handler("manual") + # + construct_between_trigger_eval_handler("manual") # not executed to speed things up ), drift_detection_triggers={ f"{criterion_name}_int{detection_interval}_win{window_size}": DataDriftTriggerConfig( @@ -185,28 +184,26 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]: # multiprocessing across gpus for detection_interval in [1500] for window_size in ["1y"] # dataset specific - for decision_window_size in [15, 30] # TODO: check + for decision_window_size in [20] # more values for criterion_name, criterion in ( { f"mmd-quant-{quantile}-{decision_window_size}": DynamicQuantileThresholdCriterion( window_size=decision_window_size, quantile=quantile ) - for quantile in [0.02, 0.05, 0.10, 0.15] # TODO: 0.3 + for quantile in [0.05, 0.10, 0.15] # TODO: 0.3 + # cuda3 + } + | + { + f"mmd-rollavg-{deviation}-{decision_window_size}": DynamicRollingAverageThresholdCriterion( + window_size=decision_window_size, deviation=deviation, absolute=False + ) + for deviation in reversed([0.5, 1.0, 2.0, 5.0]) # TODO: 0.05, 0.2, + # cuda3 } - # | - # { - # f"mmd-rollavg-{deviation}-{decision_window_size}": DynamicRollingAverageThresholdCriterion( - # window_size=decision_window_size, deviation=deviation, absolute=False - # ) - # for deviation in [0.5, 1.0, 2.0, 5.0] # TODO: 0.05, 0.2, - # # 0: - # # 1: - # # 2: 0.5 - # # 3: 1.0, 2.0 - # } ).items() }, - gpu_device="cuda:0", + gpu_device="cuda:3", ), # -------------------------------------------------------------------------------- # # 3X: Performance triggers # @@ -277,9 +274,15 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]: allow_reduction=allow_reduction, avoidable_misclassification_threshold=num_misclassifications, ) - for num_misclassifications in reversed([10000]) # 1000, 2000, 5000, 7500, 10000 - for expected_accuracy in [0.5, 0.55, 0.6] - for allow_reduction in [False] # TODO: test with [False] + # for num_misclassifications in reversed([250, 500, 1000, 4000, 8000]) # 250, 500, 1000, 4000 + # for expected_accuracy in [0.5, 0.6] + # for allow_reduction in [False] + for num_misclassifications, expected_accuracy, allow_reduction in [ + (500, 0.5, False), # TODO: + (500, 0.6, False), + (250, 0.5, False), + (250, 0.6, False), + ] } ).items() }, diff --git a/experiments/yearbook/compare_trigger_policies/run.py b/experiments/yearbook/compare_trigger_policies/run.py index 1a4d4db80..150839daa 100644 --- a/experiments/yearbook/compare_trigger_policies/run.py +++ b/experiments/yearbook/compare_trigger_policies/run.py @@ -25,7 +25,9 @@ AvoidableMisclassificationCostTriggerConfig, DataIncorporationLatencyCostTriggerConfig, ) +from modyn.config.schema.pipeline.trigger.performance.criterion import StaticNumberAvoidableMisclassificationCriterion from modyn.config.schema.pipeline.trigger.performance.performance import ( + PerformanceTriggerConfig, PerformanceTriggerEvaluationConfig, ) from modyn.config.schema.pipeline.trigger.simple.data_amount import DataAmountTriggerConfig @@ -304,83 +306,84 @@ def construct_pipelines(experiment: Experiment) -> list[ModynPipelineConfig]: # # -------------------------------------------------------------------------------- # # # 3X: Performance triggers # # # -------------------------------------------------------------------------------- # - # 30: Experiment( - # name="yb-performancetrigger", - # eval_handlers=( - # construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual") - # + construct_between_trigger_eval_handler("manual") - # ), - # performance_triggers={ - # f"{criterion_name}-int{detection_interval}y": PerformanceTriggerConfig( - # evaluation_interval_data_points=detection_interval, - # data_density_window_size=20, # performed well for drift, only used for #avoidable misclass - # performance_triggers_window_size=20, # performed well for drift, only used for #avoidable misclass - # warmup_intervals=3500 // detection_interval, # same as in drift case - # warmup_policy=TimeTriggerConfig(every="3d", start_timestamp=_FIRST_TIMESTAMP), - # evaluation=PerformanceTriggerEvaluationConfig( - # device="cuda:0", - # dataset=EvalDataConfig( - # dataset_id="yearbook_train", # optional: extra holdout split - # bytes_parser_function=yb_bytes_parser_function, - # batch_size=512, - # dataloader_workers=1, - # metrics=[ - # AccuracyMetricConfig(evaluation_transformer_function=yb_evaluation_transformer_function), - # ], - # ), - # ), - # mode="hindsight", - # forecasting_method="ridge_regression", - # decision_criteria={criterion_name: criterion}, - # ) - # # for detection_interval in [100, 250, 500] - # for detection_interval in [100] - # # cuda1: 100 - # # cuda2: 250 - # # cuda3: 500 - # # cuda0: 100, 250, 500 - num_misscl - 100, 200, 500 - # for criterion_name, criterion in ( - # # { - # # f"static-{perf_threshold}": StaticPerformanceThresholdCriterion( - # # metric="Accuracy", metric_threshold=perf_threshold - # # ) - # # for perf_threshold in [0.7, 0.75, 0.8, 0.85, 0.875, 0.9, 0.925, 0.95] - # # } - # # | { - # # f"dynamic-quant-{quantile}-{decision_window_size}": DynamicQuantilePerformanceThresholdCriterion( - # # metric="Accuracy", - # # quantile=quantile, - # # window_size=decision_window_size, - # # ) - # # for quantile in [0.05, 0.15, 0.3] - # # for decision_window_size in [10, 20, 30] - # # } - # # | - # { - # f"dynamic-rollavg-{deviation}-{decision_window_size}": DynamicRollingAveragePerformanceThresholdCriterion( - # metric="Accuracy", - # deviation=deviation, - # absolute=False, - # window_size=decision_window_size, - # ) - # for deviation in reversed([0.05, 0.1, 0.2, 0.3]) # TODO: delete: 0.025 - # for decision_window_size in [10, 20, 30] - # } - # # | - # # { - # # f"num_misclass-{num_misclassifications}-exp-{expected_accuracy}-red-{allow_reduction}-": StaticNumberAvoidableMisclassificationCriterion( - # # expected_accuracy=expected_accuracy, - # # allow_reduction=allow_reduction, - # # avoidable_misclassification_threshold=num_misclassifications, - # # ) - # # for num_misclassifications in reversed([100, 200, 500]) # TODO: 100, 200, 500, 1000, 2000, 5000 - # # for expected_accuracy in [0.85, 0.9, 0.95] # TODO last successful: yearbook_performancetrigger_num_misclass-200-exp-0.85-red-False--int500y --> mind the reversed - # # for allow_reduction in [True, False] - # # } - # ).items() - # }, - # gpu_device="cuda:0", - # ), + 30: Experiment( + name="yb-performancetrigger", + eval_handlers=( + construct_periodic_eval_handlers(intervals=BEST_PERIODIC_EVAL_INTERVAL, execution_time="manual") + + construct_between_trigger_eval_handler("manual") + ), + performance_triggers={ + f"{criterion_name}-int{detection_interval}y": PerformanceTriggerConfig( + evaluation_interval_data_points=detection_interval, + data_density_window_size=20, # performed well for drift, only used for #avoidable misclass + performance_triggers_window_size=20, # performed well for drift, only used for #avoidable misclass + warmup_intervals=3500 // detection_interval, # same as in drift case + warmup_policy=TimeTriggerConfig(every="3d", start_timestamp=_FIRST_TIMESTAMP), + evaluation=PerformanceTriggerEvaluationConfig( + device="cuda:2", + dataset=EvalDataConfig( + dataset_id="yearbook_train", # optional: extra holdout split + bytes_parser_function=yb_bytes_parser_function, + batch_size=512, + dataloader_workers=1, + metrics=[ + AccuracyMetricConfig(evaluation_transformer_function=yb_evaluation_transformer_function), + ], + ), + ), + mode="hindsight", + forecasting_method="ridge_regression", + decision_criteria={criterion_name: criterion}, + ) + # for detection_interval in [100, 250, 500] + for detection_interval in [250] # Solid choice + for criterion_name, criterion in ( + # { + # f"static-{perf_threshold}": StaticPerformanceThresholdCriterion( + # metric="Accuracy", metric_threshold=perf_threshold + # ) + # for perf_threshold in [0.7, 0.75, 0.8, 0.85, 0.875, 0.9, 0.925, 0.95] + # } + # | { + # f"dynamic-quant-{quantile}-{decision_window_size}": DynamicQuantilePerformanceThresholdCriterion( + # metric="Accuracy", + # quantile=quantile, + # window_size=decision_window_size, + # ) + # for quantile in [0.05, 0.15, 0.3] + # for decision_window_size in [10, 20, 30] + # } + # | + # { # TODO: not completed + # f"dynamic-rollavg-{deviation}-{decision_window_size}": DynamicRollingAveragePerformanceThresholdCriterion( + # metric="Accuracy", + # deviation=deviation, + # absolute=False, + # window_size=decision_window_size, + # ) + # for deviation in reversed([0.05, 0.1, 0.2, 0.3]) + # for decision_window_size in [10, 20, 30] + # } + # | + { + f"num_misclass-{num_misclassifications}-exp-{expected_accuracy}-red-{allow_reduction}-": StaticNumberAvoidableMisclassificationCriterion( + expected_accuracy=expected_accuracy, + allow_reduction=allow_reduction, + avoidable_misclassification_threshold=num_misclassifications, + ) + # for num_misclassifications, expected_accuracy, allow_reduction in [ + # (1500, 0.95, False), + # ] + # cuda1: 100, 200, 500, 1000 + # cuda2: 50 + for num_misclassifications in reversed([50]) # TODO: 50, 100, 200, 500, 1000, 1500 + for expected_accuracy in [0.85, 0.9, 0.95] + for allow_reduction in [True, False] + } + ).items() + }, + gpu_device="cuda:2", + ), # -------------------------------------------------------------------------------- # # 4X: Cost aware triggers # # -------------------------------------------------------------------------------- #