diff --git a/cleanrl_utils/tuner.py b/cleanrl_utils/tuner.py
index 2d2f5b81..6c97299c 100644
--- a/cleanrl_utils/tuner.py
+++ b/cleanrl_utils/tuner.py
@@ -28,7 +28,7 @@ def __init__(
         metric: str,
         target_scores: Dict[str, Optional[List[float]]],
         params_fn: Callable[[optuna.Trial], Dict],
-        direction: str = "maximize",
+        directions: List[str] = ["maximize"],
         aggregation_type: str = "average",
         metric_last_n_average_window: int = 50,
         sampler: Optional[optuna.samplers.BaseSampler] = None,
@@ -47,7 +47,7 @@ def __init__(
         )
         self.params_fn = params_fn
-        self.direction = direction
+        self.directions = directions
         self.aggregation_type = aggregation_type
         if self.aggregation_type == "average":
             self.aggregation_fn = np.average
@@ -84,8 +84,10 @@ def objective(trial: optuna.Trial):
             algo_command = [f"--{key}={value}" for key, value in params.items()]
             normalized_scoress = []
+            relative_timess = []
             for seed in range(num_seeds):
                 normalized_scores = []
+                relative_times = []
                 for env_id in self.target_scores.keys():
                     sys.argv = algo_command + [f"--env-id={env_id}", f"--seed={seed}", "--track=False"]
                     with HiddenPrints():
@@ -99,6 +101,8 @@ def objective(trial: optuna.Trial):
                     metric_values = [
                         scalar_event.value for scalar_event in ea.Scalars(self.metric)[-self.metric_last_n_average_window :]
                     ]
+                    relative_time = ea.Scalars(self.metric)[-1].wall_time - ea.Scalars(self.metric)[0].wall_time
+                    relative_times += [relative_time]
                     print(
                         f"The average episodic return on {env_id} is {np.average(metric_values)} averaged over the last {self.metric_last_n_average_window} episodes."
                     )
@@ -112,26 +116,28 @@ def objective(trial: optuna.Trial):
                     if run:
                         run.log({f"{env_id}_return": np.average(metric_values)})
 
+                relative_timess += [relative_times]
                 normalized_scoress += [normalized_scores]
                 aggregated_normalized_score = self.aggregation_fn(normalized_scores)
-                print(f"The {self.aggregation_type} normalized score is {aggregated_normalized_score} with num_seeds={seed}")
-                trial.report(aggregated_normalized_score, step=seed)
+                aggregated_relative_time = self.aggregation_fn(relative_times)
+                print(f"The {self.aggregation_type} normalized score is {aggregated_normalized_score} and relative time is {aggregated_relative_time} with num_seeds={seed}")
+                # trial.report(aggregated_normalized_score, step=seed)
                 if run:
                     run.log({"aggregated_normalized_score": aggregated_normalized_score})
-                if trial.should_prune():
-                    if run:
-                        run.finish(quiet=True)
-                    raise optuna.TrialPruned()
+                # if trial.should_prune():
+                #     if run:
+                #         run.finish(quiet=True)
+                #     raise optuna.TrialPruned()
             if run:
                 run.finish(quiet=True)
-            return np.average(
+            return np.average(  # we always return the average of the aggregated normalized scores
                 self.aggregation_fn(normalized_scoress, axis=1)
-            )  # we alaways return the average of the aggregated normalized scores
+            ), np.average(self.aggregation_fn(relative_timess, axis=1))
 
         study = optuna.create_study(
             study_name=self.study_name,
-            direction=self.direction,
+            directions=self.directions,
             storage=self.storage,
             pruner=self.pruner,
             sampler=self.sampler,
diff --git a/tuner_example.py b/tuner_example.py
index 9e01a604..98e900fb 100644
--- a/tuner_example.py
+++ b/tuner_example.py
@@ -6,7 +6,7 @@
     script="cleanrl/ppo.py",
     metric="charts/episodic_return",
     metric_last_n_average_window=50,
-    direction="maximize",
+    directions=["maximize", "minimize"],
     aggregation_type="average",
     target_scores={
         "CartPole-v1": [0, 500],
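
Note on the API change above: passing `directions=["maximize", "minimize"]` makes the Optuna study multi-objective, and Optuna's multi-objective mode does not support pruning (`trial.report`/`trial.should_prune` raise errors there), which is why those calls are commented out and `objective` now returns a tuple of values, one per direction. A minimal standalone sketch of the same pattern, with toy stand-in objectives (the parameter name and formulas below are illustrative, not CleanRL's):

```python
import optuna


def objective(trial: optuna.Trial):
    # Toy stand-ins for the tuner's two objectives: a score to maximize
    # and a wall-clock cost to minimize (both hypothetical formulas).
    lr = trial.suggest_float("learning-rate", 1e-5, 1e-2, log=True)
    score = 1.0 - (lr - 3e-4) ** 2
    runtime = 100.0 * lr
    # Multi-objective trials return one value per entry in `directions`;
    # trial.report()/trial.should_prune() are unsupported in this mode.
    return score, runtime


study = optuna.create_study(directions=["maximize", "minimize"])
study.optimize(objective, n_trials=10)
# A multi-objective study yields a Pareto front rather than a single best trial.
print([t.values for t in study.best_trials])
```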
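The new `relative_time` objective is read from the run's existing TensorBoard log: the wall-clock gap between the first and last scalar event recorded for the tracked metric. A sketch of that measurement in isolation (the run directory below is a hypothetical placeholder):

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

ea = EventAccumulator("runs/CartPole-v1__ppo__1")  # hypothetical run directory
ea.Reload()  # parse the event file from disk
events = ea.Scalars("charts/episodic_return")
# wall_time is the POSIX timestamp at which each scalar was logged, so the
# difference below approximates the training duration in seconds.
relative_time = events[-1].wall_time - events[0].wall_time
print(f"~{relative_time:.1f}s between first and last logged return")
```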