diff --git a/docs/source/user_guide/config/training_settings.rst b/docs/source/user_guide/config/training_settings.rst
index 08d66fc76..9e03c965d 100644
--- a/docs/source/user_guide/config/training_settings.rst
+++ b/docs/source/user_guide/config/training_settings.rst
@@ -22,6 +22,8 @@ Training settings are designed to set parameters about model training.
   evaluated on the valid dataset. Defaults to ``1``.
 - ``stopping_step (int)`` : The threshold for validation-based early stopping.
   Defaults to ``10``.
+- ``min_delta (float)`` : The minimum change in the validation metric that counts as an improvement; a new value must differ from the best value so far by at least this amount to reset early stopping, so fractional fluctuations below it are ignored.
+  Defaults to ``0.001``.
 - ``clip_grad_norm (dict)`` : The args of `clip_grad_norm_ <https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html>`_
   which will clip gradient norm of model. Defaults to ``None``.
 - ``loss_decimal_place(int)``: The decimal place of training loss. Defaults to ``4``.
diff --git a/recbole/properties/overall.yaml b/recbole/properties/overall.yaml
index 577e4c0b4..3c7308f39 100644
--- a/recbole/properties/overall.yaml
+++ b/recbole/properties/overall.yaml
@@ -29,6 +29,7 @@ train_neg_sample_args:          # (dict) Negative sampling configuration for model training.
     candidate_num: 0            # (int) The number of candidate negative items when dynamic negative sampling.
 eval_step: 1                    # (int) The number of training epochs before an evaluation on the valid dataset.
 stopping_step: 10               # (int) The threshold for validation-based early stopping.
+min_delta: 0.001                # (float) The minimum change in the validation metric that counts as an improvement for early stopping.
 clip_grad_norm: ~               # (dict) The args of clip_grad_norm_ which will clip gradient norm of model.
 weight_decay: 0.0               # (float) The weight decay value (L2 penalty) for optimizers.
 loss_decimal_place: 4           # (int) The decimal place of training loss.
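For a bigger-is-better metric, the semantics documented above are: a new validation score counts as an improvement only when it beats the best score so far by at least ``min_delta``. A minimal illustration (the scores are invented, not from the patch):

```python
# Illustration only: how min_delta gates what counts as an improvement
# for a bigger-is-better validation metric.
min_delta = 0.001
best = 0.3500
for value in [0.3505, 0.3512, 0.3600]:
    improved = value >= best and abs(value - best) >= min_delta
    print(f"{value:.4f} vs best {best:.4f} -> improvement: {improved}")
    if improved:
        best = value
# 0.3505 -> False (a change of 0.0005 is below min_delta)
# 0.3512 -> True  (a change of 0.0012 clears min_delta)
# 0.3600 -> True
```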
diff --git a/recbole/trainer/trainer.py b/recbole/trainer/trainer.py
index b5863b67d..ebe647e07 100644
--- a/recbole/trainer/trainer.py
+++ b/recbole/trainer/trainer.py
@@ -122,12 +122,13 @@ def __init__(self, config, model):
         self.clip_grad_norm = config["clip_grad_norm"]
         self.valid_metric = config["valid_metric"].lower()
         self.valid_metric_bigger = config["valid_metric_bigger"]
+        self.min_delta = config["min_delta"]
         self.test_batch_size = config["eval_batch_size"]
         self.gpu_available = torch.cuda.is_available() and config["use_gpu"]
         self.device = config["device"]
         self.checkpoint_dir = config["checkpoint_dir"]
         self.enable_amp = config["enable_amp"]
         self.enable_scaler = torch.cuda.is_available() and config["enable_scaler"]
         ensure_dir(self.checkpoint_dir)
         saved_model_file = "{}-{}.pth".format(self.config["model"], get_local_time())
         self.saved_model_file = os.path.join(self.checkpoint_dir, saved_model_file)
@@ -474,6 +475,7 @@ def fit(
                     self.cur_step,
                     max_step=self.stopping_step,
                     bigger=self.valid_metric_bigger,
+                    min_delta=self.min_delta,
                 )
                 valid_end_time = time()
                 valid_score_output = (
@@ -910,6 +912,7 @@ def __init__(self, config, model):
         self.stopping_step = config["stopping_step"]
         self.valid_metric_bigger = config["valid_metric_bigger"]
+        self.min_delta = config["min_delta"]
         self.cur_step = 0
         self.best_valid_score = -np.inf if self.valid_metric_bigger else np.inf
         self.best_valid_result = None

@@ -1018,6 +1021,7 @@ def fit(
                     self.cur_step,
                     max_step=self.stopping_step,
                     bigger=self.valid_metric_bigger,
+                    min_delta=self.min_delta,
                 )
                 valid_end_time = time()
                 valid_score_output = (
@@ -1383,6 +1387,7 @@ def fit(
                     self.cur_step,
                     max_step=self.stopping_step,
                     bigger=self.valid_metric_bigger,
+                    min_delta=self.min_delta,
                 )
                 valid_end_time = time()
                 valid_score_output = (
diff --git a/recbole/utils/argument_list.py b/recbole/utils/argument_list.py
index 669c1c88e..afb624f45 100644
--- a/recbole/utils/argument_list.py
+++ b/recbole/utils/argument_list.py
@@ -23,8 +23,8 @@
 training_arguments = [
     'epochs', 'train_batch_size',
     'learner', 'learning_rate',
     'train_neg_sample_args',
-    'eval_step', 'stopping_step',
+    'eval_step', 'stopping_step', 'min_delta',
     'clip_grad_norm',
     'weight_decay',
     'loss_decimal_place',
diff --git a/recbole/utils/utils.py b/recbole/utils/utils.py
index ed202448c..604db7399 100644
--- a/recbole/utils/utils.py
+++ b/recbole/utils/utils.py
@@ -110,9 +110,9 @@ def get_trainer(model_type, model_name):
             return getattr(importlib.import_module("recbole.trainer"), "Trainer")


-def early_stopping(value, best, cur_step, max_step, bigger=True):
+def early_stopping(value, best, cur_step, max_step, bigger=True, min_delta=0.001):
     r"""validation-based early stopping

     Args:
         value (float): current result
         best (float): best result
@@ -134,7 +134,7 @@
     stop_flag = False
     update_flag = False
     if bigger:
-        if value >= best:
+        if value >= best and abs(value - best) >= min_delta:
             cur_step = 0
             best = value
             update_flag = True
@@ -143,7 +143,7 @@
         if cur_step > max_step:
             stop_flag = True
     else:
-        if value <= best:
+        if value <= best and abs(best - value) >= min_delta:
             cur_step = 0
             best = value
             update_flag = True
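Taken together, the patched helper behaves as below. This standalone sketch restates `early_stopping` from the hunks above, with the unchanged branches filled in from the surrounding source, plus an invented driving loop showing that fluctuations smaller than `min_delta` no longer reset the early-stopping counter:

```python
# Standalone sketch of the patched helper (unchanged lines filled in from
# the surrounding source), for trying out min_delta outside RecBole.
def early_stopping(value, best, cur_step, max_step, bigger=True, min_delta=0.001):
    stop_flag = False
    update_flag = False
    if bigger:
        # A new high only counts if it beats the best score by >= min_delta.
        if value >= best and abs(value - best) >= min_delta:
            cur_step = 0
            best = value
            update_flag = True
        else:
            cur_step += 1
            if cur_step > max_step:
                stop_flag = True
    else:
        # Mirror case for smaller-is-better metrics (e.g. a loss).
        if value <= best and abs(best - value) >= min_delta:
            cur_step = 0
            best = value
            update_flag = True
        else:
            cur_step += 1
            if cur_step > max_step:
                stop_flag = True
    return best, cur_step, stop_flag, update_flag


# Invented scores: each gain is below min_delta, so the counter keeps
# climbing and training stops after max_step non-improving evaluations.
best, cur_step = 0.25, 0
for score in [0.2504, 0.2506, 0.2507]:
    best, cur_step, stop_flag, update_flag = early_stopping(
        score, best, cur_step, max_step=2
    )
    print(f"score={score:.4f} best={best:.4f} cur_step={cur_step} stop={stop_flag}")
# cur_step climbs to 1, 2, 3; stop_flag is True on the third call.
```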
diff --git a/tests/config/test_config.py b/tests/config/test_config.py
index 571593b0f..063646cab 100644
--- a/tests/config/test_config.py
+++ b/tests/config/test_config.py
@@ -44,6 +44,7 @@ def test_default_settings(self):
         self.assertIsInstance(config["train_neg_sample_args"], dict)
         self.assertIsInstance(config["eval_step"], int)
         self.assertIsInstance(config["stopping_step"], int)
+        self.assertIsInstance(config["min_delta"], float)
         self.assertIsInstance(config["checkpoint_dir"], str)
         self.assertIsInstance(config["eval_args"], dict)

diff --git a/tests/config/test_overall.py b/tests/config/test_overall.py
index 665ae5b98..f14cc84bf 100644
--- a/tests/config/test_overall.py
+++ b/tests/config/test_overall.py
@@ -91,6 +91,10 @@ def test_stopping_step(self):
         settings = {"epochs": 100}
         self.assertTrue(run_parms({"stopping_step": [0, 1, 2]}))

+    def test_min_delta(self):
+        settings = {"epochs": 100}
+        self.assertTrue(run_parms({"min_delta": [0.01, 0.001, 0.0001]}))
+
     def test_checkpoint_dir(self):
         self.assertTrue(run_parms({"checkpoint_dir": ["saved_1/", "./saved_2"]}))

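Beyond the default-value tests above, the new option is overridden like any other training setting. A hypothetical end-to-end call, assuming RecBole's standard quick-start entry point (the model, dataset, and values here are placeholders, not from the patch):

```python
# Hypothetical usage of the new option via RecBole's quick-start API.
from recbole.quick_start import run_recbole

run_recbole(
    model="BPR",
    dataset="ml-100k",
    # Require a 0.0005 gain in the validation metric before the
    # early-stopping counter (stopping_step) is reset.
    config_dict={"stopping_step": 10, "min_delta": 0.0005},
)
```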