From 5d6c5e10ab7956576f5b55c568546530d14fbc67 Mon Sep 17 00:00:00 2001
From: Shahar Bar
Date: Sun, 28 Jul 2024 09:53:13 +0300
Subject: [PATCH] Epsilon Greedy Strategy Wrapper

Change log:
1. Added pytest_mock for mocking in pytest tests in pyproject.toml
2. Added _select_epsilon_greedy_action to BaseMab in base.py. The method wraps the strategy's native select_action with an epsilon-greedy approach.
3. Added epsilon and default_action to all smab.py and cmab.py classes and cold start methods
4. Added test suite for the epsilon-greedy functionality
---
 pybandits/base.py     |  61 ++++++++++++++++++++
 pybandits/cmab.py     |  66 +++++++++++++++++----
 pybandits/smab.py     | 109 ++++++++++++++++++++++++++++-------
 pybandits/strategy.py |   1 +
 pyproject.toml        |   1 +
 tests/test_base.py    |  65 +++++++++++++++++++--
 tests/test_cmab.py    |  71 ++++++++++++++++++++++-
 tests/test_smab.py    | 129 +++++++++++++++++++++++++++++++++++++++---
 8 files changed, 457 insertions(+), 46 deletions(-)

diff --git a/pybandits/base.py b/pybandits/base.py
index 818599f..bd40524 100644
--- a/pybandits/base.py
+++ b/pybandits/base.py
@@ -24,6 +24,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, NewType, Optional, Set, Tuple, Union
 
+import numpy as np
 from pydantic import (
     BaseModel,
     Extra,
@@ -31,6 +32,7 @@
     confloat,
     conint,
     constr,
+    root_validator,
     validate_arguments,
     validator,
 )
@@ -91,10 +93,14 @@ class BaseMab(PyBanditsBaseModel, ABC):
         The list of possible actions, and their associated Model.
     strategy: Strategy
         The strategy used to select actions.
+    epsilon: Optional[Float01], defaults to None
+        The probability of selecting a random action.
     """

     actions: Dict[ActionId, Model]
     strategy: Strategy
+    epsilon: Optional[Float01]
+    default_action: Optional[ActionId]

     @validator("actions", pre=True)
     @classmethod
@@ -103,6 +109,14 @@ def at_least_2_actions_are_defined(cls, v):
             raise AttributeError("At least 2 actions should be defined.")
         return v

+    @root_validator
+    def check_default_action(cls, values):
+        if not values["epsilon"] and values["default_action"]:
+            raise AttributeError("A default action should only be defined when epsilon is defined.")
+        if values["default_action"] and values["default_action"] not in values["actions"]:
+            raise AttributeError("The default action should be defined in the actions.")
+        return values
+
     def _get_valid_actions(self, forbidden_actions: Optional[Set[ActionId]]) -> Set[ActionId]:
         """
         Given a set of forbidden action IDs, return a set of valid action IDs.
@@ -125,6 +139,8 @@ def _get_valid_actions(self, forbidden_actions: Optional[Set[ActionId]]) -> Set[
         valid_actions = set(self.actions.keys()) - forbidden_actions
         if len(valid_actions) == 0:
             raise ValueError("All actions are forbidden. You must allow at least 1 action.")
You must allow at least 1 action.") + if self.default_action and self.default_action not in valid_actions: + raise ValueError("The default action is forbidden.") return valid_actions @@ -201,3 +217,48 @@ def get_state(self) -> (str, dict): model_name = self.__class__.__name__ state: dict = self.dict() return model_name, state + + @validate_arguments + def _select_epsilon_greedy_action( + self, + p: Union[Dict[ActionId, float], Dict[ActionId, Probability], Dict[ActionId, List[Probability]]], + actions: Optional[Dict[ActionId, Model]] = None, + ) -> ActionId: + """ + Wraps self.strategy.select_action function with epsilon-greedy strategy, + such that with probability epsilon a default_action is selected, + and with probability 1-epsilon the select_action function is triggered to choose action. + If no default_action is provided, a random action is selected. + + Reference: Reinforcement Learning: An Introduction, Ch. 2 (Sutton and Burto, 2018) + https://web.stanford.edu/class/psych209/Readings/SuttonBartoIPRLBook2ndEd.pdf&ved=2ahUKEwjMy8WV9N2HAxVe0gIHHVjjG5sQFnoECEMQAQ&usg=AOvVaw3bKK-Y_1kf6XQVwR-UYrBY + + Parameters + ---------- + p: Union[Dict[ActionId, float], Dict[ActionId, Probability], Dict[ActionId, List[Probability]]] + The dictionary or actions and their sampled probability of getting a positive reward. + For MO strategy, the sampled probability is a list with elements corresponding to the objectives. + actions: Optional[Dict[ActionId, Model]] + The dictionary of actions and their associated Model. + + Returns + ------- + selected_action: ActionId + The selected action. + + Raises + ------ + KeyError + If self.default_action is not present as a key in the probabilities dictionary. + """ + + if self.epsilon: + if self.default_action and self.default_action not in p.keys(): + raise KeyError(f"Default action {self.default_action} not in actions.") + if np.random.binomial(1, self.epsilon): + selected_action = self.default_action if self.default_action else np.random.choice(list(p.keys())) + else: + selected_action = self.strategy.select_action(p=p, actions=actions) + else: + selected_action = self.strategy.select_action(p=p, actions=actions) + return selected_action diff --git a/pybandits/cmab.py b/pybandits/cmab.py index 1268a11..83a7fc4 100644 --- a/pybandits/cmab.py +++ b/pybandits/cmab.py @@ -138,7 +138,7 @@ def predict( p_to_select_action = prob if self.predict_with_proba else ws # predict actions, probs, weighted_sums - selected_actions.append(self.strategy.select_action(p=p_to_select_action, actions=self.actions)) + selected_actions.append(self._select_epsilon_greedy_action(p=p_to_select_action, actions=self.actions)) probs.append(prob) weighted_sums.append(ws) @@ -212,8 +212,13 @@ class CmabBernoulli(BaseCmabBernoulli): predict_with_proba: bool = False predict_actions_randomly: bool = False - def __init__(self, actions: Dict[ActionId, BaseBayesianLogisticRegression]): - super().__init__(actions=actions, strategy=ClassicBandit()) + def __init__( + self, + actions: Dict[ActionId, BaseBayesianLogisticRegression], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + ): + super().__init__(actions=actions, strategy=ClassicBandit(), epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "CmabBernoulli": @@ -249,9 +254,15 @@ class CmabBernoulliBAI(BaseCmabBernoulli): predict_with_proba: bool = False predict_actions_randomly: bool = False - def __init__(self, actions: Dict[ActionId, BayesianLogisticRegression], 
exploit_p: Optional[Float01] = None): + def __init__( + self, + actions: Dict[ActionId, BayesianLogisticRegression], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + exploit_p: Optional[Float01] = None, + ): strategy = BestActionIdentification() if exploit_p is None else BestActionIdentification(exploit_p=exploit_p) - super().__init__(actions=actions, strategy=strategy) + super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "CmabBernoulliBAI": @@ -296,9 +307,15 @@ class CmabBernoulliCC(BaseCmabBernoulli): predict_with_proba: bool = True predict_actions_randomly: bool = False - def __init__(self, actions: Dict[ActionId, BayesianLogisticRegressionCC], subsidy_factor: Optional[Float01] = None): + def __init__( + self, + actions: Dict[ActionId, BayesianLogisticRegressionCC], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + subsidy_factor: Optional[Float01] = None, + ): strategy = CostControlBandit() if subsidy_factor is None else CostControlBandit(subsidy_factor=subsidy_factor) - super().__init__(actions=actions, strategy=strategy) + super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "CmabBernoulliCC": @@ -310,7 +327,12 @@ def update(self, context: ArrayLike, actions: List[ActionId], rewards: List[Bina @validate_arguments -def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: PositiveInt) -> CmabBernoulli: +def create_cmab_bernoulli_cold_start( + action_ids: Set[ActionId], + n_features: PositiveInt, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, +) -> CmabBernoulli: """ Utility function to create a Contextual Bernoulli Multi-Armed Bandit with Thompson Sampling, with default parameters. Until the very first update the model will predict actions randomly, where each action has equal @@ -323,6 +345,10 @@ def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: Posi n_features: PositiveInt The number of features expected after in the context matrix. This is also the number of betas of the Bayesian Logistic Regression model. + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- cmab: CmabBernoulli @@ -331,14 +357,18 @@ def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: Posi actions = {} for a in set(action_ids): actions[a] = create_bayesian_logistic_regression_cold_start(n_betas=n_features) - mab = CmabBernoulli(actions=actions) + mab = CmabBernoulli(actions=actions, epsilon=epsilon, default_action=default_action) mab.predict_actions_randomly = True return mab @validate_arguments def create_cmab_bernoulli_bai_cold_start( - action_ids: Set[ActionId], n_features: PositiveInt, exploit_p: Optional[Float01] = None + action_ids: Set[ActionId], + n_features: PositiveInt, + exploit_p: Optional[Float01] = None, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, ) -> CmabBernoulliBAI: """ Utility function to create a Contextual Bernoulli Multi-Armed Bandit with Thompson Sampling, and Best Action @@ -361,6 +391,10 @@ def create_cmab_bernoulli_bai_cold_start( (it behaves as a Greedy strategy). 
If exploit_p is 0, the bandits always select the action with 2nd highest probability of getting a positive reward. + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- @@ -370,7 +404,7 @@ def create_cmab_bernoulli_bai_cold_start( actions = {} for a in set(action_ids): actions[a] = create_bayesian_logistic_regression_cold_start(n_betas=n_features) - mab = CmabBernoulliBAI(actions=actions, exploit_p=exploit_p) + mab = CmabBernoulliBAI(actions=actions, exploit_p=exploit_p, epsilon=epsilon, default_action=default_action) mab.predict_actions_randomly = True return mab @@ -380,6 +414,8 @@ def create_cmab_bernoulli_cc_cold_start( action_ids_cost: Dict[ActionId, NonNegativeFloat], n_features: PositiveInt, subsidy_factor: Optional[Float01] = None, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, ) -> CmabBernoulliCC: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, and Cost Control @@ -408,6 +444,10 @@ def create_cmab_bernoulli_cc_cold_start( If subsidy_factor is 1, the bandits always selects the action with the minimum cost. If subsidy_factor is 0, the bandits always selects the action with highest probability of getting a positive reward (it behaves as a classic Bernoulli bandit). + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- @@ -417,6 +457,8 @@ def create_cmab_bernoulli_cc_cold_start( actions = {} for a, cost in action_ids_cost.items(): actions[a] = create_bayesian_logistic_regression_cc_cold_start(n_betas=n_features, cost=cost) - mab = CmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor) + mab = CmabBernoulliCC( + actions=actions, subsidy_factor=subsidy_factor, epsilon=epsilon, default_action=default_action + ) mab.predict_actions_randomly = True return mab diff --git a/pybandits/smab.py b/pybandits/smab.py index b72a319..3dddd37 100644 --- a/pybandits/smab.py +++ b/pybandits/smab.py @@ -89,7 +89,7 @@ def predict( for _ in range(n_samples): p = {action: model.sample_proba() for action, model in self.actions.items() if action in valid_actions} - selected_actions.append(self.strategy.select_action(p=p, actions=self.actions)) + selected_actions.append(self._select_epsilon_greedy_action(p=p, actions=self.actions)) probs.append(p) return selected_actions, probs @@ -144,8 +144,13 @@ class SmabBernoulli(BaseSmabBernoulli): actions: Dict[ActionId, Beta] strategy: ClassicBandit - def __init__(self, actions: Dict[ActionId, Beta]): - super().__init__(actions=actions, strategy=ClassicBandit()) + def __init__( + self, + actions: Dict[ActionId, Beta], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + ): + super().__init__(actions=actions, strategy=ClassicBandit(), epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "SmabBernoulli": @@ -174,9 +179,15 @@ class SmabBernoulliBAI(BaseSmabBernoulli): actions: Dict[ActionId, Beta] strategy: BestActionIdentification - def __init__(self, actions: Dict[ActionId, Beta], exploit_p: Optional[Float01] = None): + def __init__( + self, + actions: Dict[ActionId, Beta], + epsilon: Optional[Float01] = None, + 
default_action: Optional[ActionId] = None, + exploit_p: Optional[Float01] = None, + ): strategy = BestActionIdentification() if exploit_p is None else BestActionIdentification(exploit_p=exploit_p) - super().__init__(actions=actions, strategy=strategy) + super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "SmabBernoulliBAI": @@ -213,9 +224,15 @@ class SmabBernoulliCC(BaseSmabBernoulli): actions: Dict[ActionId, BetaCC] strategy: CostControlBandit - def __init__(self, actions: Dict[ActionId, BetaCC], subsidy_factor: Optional[Float01] = None): + def __init__( + self, + actions: Dict[ActionId, BetaCC], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + subsidy_factor: Optional[Float01] = None, + ): strategy = CostControlBandit() if subsidy_factor is None else CostControlBandit(subsidy_factor=subsidy_factor) - super().__init__(actions=actions, strategy=strategy) + super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "SmabBernoulliCC": @@ -278,8 +295,15 @@ class SmabBernoulliMO(BaseSmabBernoulliMO): actions: Dict[ActionId, BetaMO] strategy: MultiObjectiveBandit - def __init__(self, actions: Dict[ActionId, Beta]): - super().__init__(actions=actions, strategy=MultiObjectiveBandit()) + def __init__( + self, + actions: Dict[ActionId, Beta], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + ): + super().__init__( + actions=actions, strategy=MultiObjectiveBandit(), epsilon=epsilon, default_action=default_action + ) @classmethod def from_state(cls, state: dict) -> "SmabBernoulliMO": @@ -305,8 +329,15 @@ class SmabBernoulliMOCC(BaseSmabBernoulliMO): actions: Dict[ActionId, BetaMOCC] strategy: MultiObjectiveCostControlBandit - def __init__(self, actions: Dict[ActionId, Beta]): - super().__init__(actions=actions, strategy=MultiObjectiveCostControlBandit()) + def __init__( + self, + actions: Dict[ActionId, Beta], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + ): + super().__init__( + actions=actions, strategy=MultiObjectiveCostControlBandit(), epsilon=epsilon, default_action=default_action + ) @classmethod def from_state(cls, state: dict) -> "SmabBernoulliMOCC": @@ -314,7 +345,9 @@ def from_state(cls, state: dict) -> "SmabBernoulliMOCC": @validate_arguments -def create_smab_bernoulli_cold_start(action_ids: Set[ActionId]) -> SmabBernoulli: +def create_smab_bernoulli_cold_start( + action_ids: Set[ActionId], epsilon: Optional[Float01] = None, default_action: Optional[ActionId] = None +) -> SmabBernoulli: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, with default parameters. @@ -323,6 +356,10 @@ def create_smab_bernoulli_cold_start(action_ids: Set[ActionId]) -> SmabBernoulli ---------- action_ids: Set[ActionId] The list of possible actions. + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. 
Returns ------- @@ -332,12 +369,15 @@ def create_smab_bernoulli_cold_start(action_ids: Set[ActionId]) -> SmabBernoulli actions = {} for a in set(action_ids): actions[a] = Beta() - return SmabBernoulli(actions=actions) + return SmabBernoulli(actions=actions, epsilon=epsilon, default_action=default_action) @validate_arguments def create_smab_bernoulli_bai_cold_start( - action_ids: Set[ActionId], exploit_p: Optional[Float01] = None + action_ids: Set[ActionId], + exploit_p: Optional[Float01] = None, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, ) -> SmabBernoulliBAI: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, and Best Action @@ -356,6 +396,10 @@ def create_smab_bernoulli_bai_cold_start( (it behaves as a Greedy strategy). If exploit_p is 0, the bandits always select the action with 2nd highest probability of getting a positive reward. + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- @@ -365,13 +409,15 @@ def create_smab_bernoulli_bai_cold_start( actions = {} for a in set(action_ids): actions[a] = Beta() - return SmabBernoulliBAI(actions=actions, exploit_p=exploit_p) + return SmabBernoulliBAI(actions=actions, epsilon=epsilon, default_action=default_action, exploit_p=exploit_p) @validate_arguments def create_smab_bernoulli_cc_cold_start( action_ids_cost: Dict[ActionId, NonNegativeFloat], subsidy_factor: Optional[Float01] = None, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, ) -> SmabBernoulliCC: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, and Cost Control @@ -397,6 +443,10 @@ def create_smab_bernoulli_cc_cold_start( If subsidy_factor is 1, the bandits always selects the action with the minimum cost. If subsidy_factor is 0, the bandits always selects the action with highest probability of getting a positive reward (it behaves as a classic Bernoulli bandit). + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- @@ -406,11 +456,18 @@ def create_smab_bernoulli_cc_cold_start( actions = {} for a, cost in action_ids_cost.items(): actions[a] = BetaCC(cost=cost) - return SmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor) + return SmabBernoulliCC( + actions=actions, epsilon=epsilon, default_action=default_action, subsidy_factor=subsidy_factor + ) @validate_arguments -def create_smab_bernoulli_mo_cold_start(action_ids: Set[ActionId], n_objectives: PositiveInt) -> SmabBernoulliMO: +def create_smab_bernoulli_mo_cold_start( + action_ids: Set[ActionId], + n_objectives: PositiveInt, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, +) -> SmabBernoulliMO: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, and Multi-Objectives strategy, with default parameters. @@ -429,6 +486,10 @@ def create_smab_bernoulli_mo_cold_start(action_ids: Set[ActionId], n_objectives: The list of possible actions. n_objectives: PositiveInt The number of objectives to optimize. The bandit assumes the same number of objectives for all actions. 
+ epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- @@ -438,12 +499,15 @@ def create_smab_bernoulli_mo_cold_start(action_ids: Set[ActionId], n_objectives: actions = {} for a in set(action_ids): actions[a] = BetaMO(counters=n_objectives * [Beta()]) - return SmabBernoulliMO(actions=actions) + return SmabBernoulliMO(actions=actions, epsilon=epsilon, default_action=default_action) @validate_arguments def create_smab_bernoulli_mo_cc_cold_start( - action_ids_cost: Dict[ActionId, NonNegativeFloat], n_objectives: PositiveInt + action_ids_cost: Dict[ActionId, NonNegativeFloat], + n_objectives: PositiveInt, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, ) -> SmabBernoulliMOCC: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling implementation for @@ -458,6 +522,11 @@ def create_smab_bernoulli_mo_cc_cold_start( The list of possible actions, and their cost. n_objectives: PositiveInt The number of objectives to optimize. The bandit assumes the same number of objectives for all actions. + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. + Returns ------- @@ -467,4 +536,4 @@ def create_smab_bernoulli_mo_cc_cold_start( actions = {} for a, cost in action_ids_cost.items(): actions[a] = BetaMOCC(counters=n_objectives * [Beta()], cost=cost) - return SmabBernoulliMOCC(actions=actions) + return SmabBernoulliMOCC(actions=actions, epsilon=epsilon, default_action=default_action) diff --git a/pybandits/strategy.py b/pybandits/strategy.py index b18567d..ac59f9b 100644 --- a/pybandits/strategy.py +++ b/pybandits/strategy.py @@ -292,6 +292,7 @@ class MultiObjectiveBandit(Strategy): Reference: Thompson Sampling for Multi-Objective Multi-Armed Bandits Problem (Yahyaa and Manderick, 2015) https://www.researchgate.net/publication/272823659_Thompson_Sampling_for_Multi-Objective_Multi-Armed_Bandits_Problem + Parameters ---------- n_objectives: int diff --git a/pyproject.toml b/pyproject.toml index 15bb053..7d47381 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ ipykernel = "^6.21.3" jupyterlab = "^3.6.1" flake8-pyproject = "^1.2.2" pytest-cov = "^4.0.0" +pytest_mock = "^3.14.0" [build-system] requires = ["poetry-core"] diff --git a/tests/test_base.py b/tests/test_base.py index ecb6372..30f6424 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -20,14 +20,16 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from typing import List
+from typing import Dict, List, Optional, Set

 import hypothesis.strategies as st
+import numpy as np
 import pytest
 from hypothesis import given
 from pydantic import NonNegativeInt, ValidationError
+from pytest_mock import MockerFixture

-from pybandits.base import ActionId, BaseMab
+from pybandits.base import ActionId, BaseMab, Probability
 from pybandits.model import Beta
 from pybandits.strategy import ClassicBandit

@@ -41,8 +43,12 @@ def update(
         super().update(actions=actions, rewards=rewards)
         pass

-    def predict():
-        pass
+    def predict(
+        self,
+        forbidden_actions: Optional[Set[ActionId]] = None,
+    ):
+        valid_actions = self._get_valid_actions(forbidden_actions)
+        return np.random.choice(list(valid_actions))

     def get_state(self) -> (str, dict):
         model_name = self.__class__.__name__
@@ -82,3 +88,54 @@ def test_base_mab_update_ok(r1, r2):
     dummy_mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit())
     dummy_mab.update(actions=["a1", "a2"], rewards=[r1, r2])
     dummy_mab.update(actions=["a1", "a1"], rewards=[r1, r2])
+
+
+########################################################################################################################
+
+
+# Epsilon-greedy functionality tests
+
+
+@pytest.fixture
+def p() -> Dict[ActionId, Probability]:
+    return {"a1": 0.5, "a2": 0.5}
+
+
+def test_valid_epsilon_value(mocker: MockerFixture, p: Dict[ActionId, Probability]):
+    mocker.patch.object(ClassicBandit, "select_action", return_value="a2")
+    mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=0.1, default_action="a1")
+    selected_action = mab._select_epsilon_greedy_action(p)
+    assert selected_action in p.keys()
+
+
+def test_epsilon_boundary_values(mocker: MockerFixture, p: Dict[ActionId, Probability]):
+    mocker.patch.object(ClassicBandit, "select_action", return_value="a2")
+
+    mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=0.0)
+    selected_action = mab._select_epsilon_greedy_action(p)
+    assert selected_action == "a2"
+
+    mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=1.0, default_action="a1")
+    selected_action = mab._select_epsilon_greedy_action(p)
+    assert selected_action == "a1"
+
+
+def test_default_action_not_in_actions(p: Dict[ActionId, Probability]):
+    with pytest.raises(AttributeError):
+        DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=1.0, default_action="a3")
+
+
+def test_select_action_raises_exception(mocker: MockerFixture, p: Dict[ActionId, Probability]):
+    mocker.patch.object(ClassicBandit, "select_action", side_effect=Exception("Test Exception"))
+    mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=0.1, default_action="a1")
+
+    with pytest.raises(Exception) as excinfo:
+        mab._select_epsilon_greedy_action(p)
+
+    assert str(excinfo.value) == "Test Exception"
+
+
+def test_default_action_in_forbidden_actions():
+    mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=0.1, default_action="a1")
+    with pytest.raises(ValueError):
+        mab.predict(forbidden_actions={"a1"})
diff --git a/tests/test_cmab.py b/tests/test_cmab.py
index d2d25f4..cc97ad7 100644
--- a/tests/test_cmab.py
+++ b/tests/test_cmab.py
@@ -297,7 +297,8 @@ def run_predict(mab):
     with pytest.raises(ValueError):  # all actions forbidden
         assert set(mab.predict(context=context, forbidden_actions=["a1", "a2", "a3", "a4", "a5"])[0])
     with pytest.raises(ValueError):  # all actions forbidden
(unordered) - assert set(mab.predict(n_samples=1000, forbidden_actions=["a5", "a4", "a2", "a3", "a1"])[0]) + n_samples = 1000 + assert set(mab.predict(n_samples=n_samples, forbidden_actions=["a5", "a4", "a2", "a3", "a1"])[0]) # cold start mab mab = create_cmab_bernoulli_cold_start(action_ids=["a1", "a2", "a3", "a4", "a5"], n_features=n_features) @@ -328,7 +329,14 @@ def test_cmab_get_state(mu, sigma, n_features): cmab = CmabBernoulli(actions=actions) expected_state = json.loads( json.dumps( - {"actions": actions, "strategy": {}, "predict_with_proba": False, "predict_actions_randomly": False}, + { + "actions": actions, + "strategy": {}, + "predict_with_proba": False, + "predict_actions_randomly": False, + "epsilon": None, + "default_action": None, + }, default=dict, ) ) @@ -547,6 +555,8 @@ def test_cmab_bai_get_state(mu, sigma, n_features, exploit_p: Float01): "strategy": {"exploit_p": exploit_p}, "predict_with_proba": False, "predict_actions_randomly": False, + "epsilon": None, + "default_action": None, }, default=dict, ) @@ -790,6 +800,8 @@ def test_cmab_cc_get_state( "strategy": {"subsidy_factor": subsidy_factor}, "predict_with_proba": True, "predict_actions_randomly": False, + "epsilon": None, + "default_action": None, }, default=dict, ) @@ -857,3 +869,58 @@ def test_cmab_cc_from_state(state): # Ensure get_state and from_state compatibility new_cmab = globals()[cmab.get_state()[0]].from_state(state=cmab.get_state()[1]) assert new_cmab == cmab + + +######################################################################################################################## + + +# Cmab with epsilon-greedy super strategy + + +@settings(deadline=500) +@given(st.integers(min_value=1, max_value=1000), st.integers(min_value=1, max_value=100)) +def test_epsilon_greedy_cmab_predict_cold_start(n_samples, n_features): + context = np.random.uniform(low=-1.0, high=1.0, size=(n_samples, n_features)) + + mab = create_cmab_bernoulli_cold_start( + action_ids=["a1", "a2"], n_features=n_features, epsilon=0.1, default_action="a1" + ) + selected_actions, probs, weighted_sums = mab.predict(context=context) + assert mab.predict_actions_randomly + assert all([a in ["a1", "a2"] for a in selected_actions]) + assert len(selected_actions) == n_samples + assert probs == n_samples * [{"a1": 0.5, "a2": 0.5}] + assert weighted_sums == n_samples * [{"a1": 0, "a2": 0}] + + +@settings(deadline=500) +@given(st.integers(min_value=1, max_value=100), st.integers(min_value=1, max_value=3)) +def test_epsilon_greedy_cmab_bai_predict(n_samples, n_features): + context = np.random.uniform(low=-1.0, high=1.0, size=(n_samples, n_features)) + + mab = create_cmab_bernoulli_bai_cold_start( + action_ids=["a1", "a2"], n_features=n_features, epsilon=0.1, default_action="a1" + ) + selected_actions, probs, weighted_sums = mab.predict(context=context) + assert mab.predict_actions_randomly + assert all([a in ["a1", "a2"] for a in selected_actions]) + assert len(selected_actions) == n_samples + assert probs == n_samples * [{"a1": 0.5, "a2": 0.5}] + assert weighted_sums == n_samples * [{"a1": 0, "a2": 0}] + + +@settings(deadline=500) +@given(st.integers(min_value=1, max_value=100), st.integers(min_value=1, max_value=3)) +def test_epsilon_greedy_cmab_cc_predict(n_samples, n_features): + context = np.random.uniform(low=-1.0, high=1.0, size=(n_samples, n_features)) + + # cold start + mab = create_cmab_bernoulli_cc_cold_start( + action_ids_cost={"a1": 10, "a2": 20.5}, n_features=n_features, epsilon=0.1, default_action="a1" + ) + selected_actions, probs, 
weighted_sums = mab.predict(context=context) + assert mab.predict_actions_randomly + assert all([a in ["a1", "a2"] for a in selected_actions]) + assert len(selected_actions) == n_samples + assert probs == n_samples * [{"a1": 0.5, "a2": 0.5}] + assert weighted_sums == n_samples * [{"a1": 0, "a2": 0}] diff --git a/tests/test_smab.py b/tests/test_smab.py index 52347de..39c03e0 100644 --- a/tests/test_smab.py +++ b/tests/test_smab.py @@ -131,6 +131,7 @@ def test_smab_predict_raise_when_all_actions_forbidden(): def test_smab_predict(): + n_samples = 1000 s = SmabBernoulli( actions={ "a0": Beta(), @@ -143,7 +144,7 @@ def test_smab_predict(): ) forbidden_actions = set(["forb_1", "forb_2"]) - best_actions, probs = s.predict(n_samples=1000, forbidden_actions=forbidden_actions) + best_actions, probs = s.predict(n_samples=n_samples, forbidden_actions=forbidden_actions) assert ["forb1" not in p.keys() for p in probs], "forbidden actions weren't removed from the output" valid_actions = set(s.actions.keys()) - forbidden_actions @@ -206,7 +207,12 @@ def test_smab_get_state(a, b, c, d): actions = {"action1": Beta(n_successes=a, n_failures=b), "action2": Beta(n_successes=c, n_failures=d)} smab = SmabBernoulli(actions=actions) - expected_state = {"actions": actions, "strategy": {}} + expected_state = { + "actions": actions, + "strategy": {}, + "epsilon": None, + "default_action": None, + } class_name, smab_state = smab.get_state() assert class_name == "SmabBernoulli" @@ -288,8 +294,9 @@ def test_can_init_smabbai(): def test_smabbai_predict(): + n_samples = 1000 s = SmabBernoulliBAI(actions={"a1": Beta(), "a2": Beta()}) - _, _ = s.predict(n_samples=1000) + _, _ = s.predict(n_samples=n_samples) def test_smabbai_update(): @@ -318,7 +325,12 @@ def test_smabbai_with_betacc(): def test_smab_bai_get_state(a, b, c, d, exploit_p: Float01): actions = {"action1": Beta(n_successes=a, n_failures=b), "action2": Beta(n_successes=c, n_failures=d)} smab = SmabBernoulliBAI(actions=actions, exploit_p=exploit_p) - expected_state = {"actions": actions, "strategy": {"exploit_p": exploit_p}} + expected_state = { + "actions": actions, + "strategy": {"exploit_p": exploit_p}, + "epsilon": None, + "default_action": None, + } class_name, smab_state = smab.get_state() assert class_name == "SmabBernoulliBAI" @@ -413,6 +425,7 @@ def test_can_init_smabcc(): def test_smabcc_predict(): + n_samples = 1000 s = SmabBernoulliCC( actions={ "a1": BetaCC(n_successes=1, n_failures=2, cost=10), @@ -420,7 +433,7 @@ def test_smabcc_predict(): }, subsidy_factor=0.7, ) - _, _ = s.predict(n_samples=1000) + _, _ = s.predict(n_samples=n_samples) def test_smabcc_update(): @@ -443,7 +456,14 @@ def test_smab_cc_get_state(a, b, c, d, cost1: NonNegativeFloat, cost2: NonNegati "action2": BetaCC(n_successes=c, n_failures=d, cost=cost2), } smab = SmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor) - expected_state = {"actions": actions, "strategy": {"subsidy_factor": subsidy_factor}} + expected_state = { + "actions": actions, + "strategy": { + "subsidy_factor": subsidy_factor, + }, + "epsilon": None, + "default_action": None, + } class_name, smab_state = smab.get_state() assert class_name == "SmabBernoulliCC" @@ -600,7 +620,12 @@ def test_smab_mo_get_state(a_list): ), } smab = SmabBernoulliMO(actions=actions) - expected_state = {"actions": actions, "strategy": {}} + expected_state = { + "actions": actions, + "strategy": {}, + "epsilon": None, + "default_action": None, + } class_name, smab_state = smab.get_state() assert class_name == "SmabBernoulliMO" 
@@ -756,7 +781,12 @@ def test_smab_mocc_get_state(a_list):
         ),
     }
     smab = SmabBernoulliMOCC(actions=actions)
-    expected_state = {"actions": actions, "strategy": {}}
+    expected_state = {
+        "actions": actions,
+        "strategy": {},
+        "epsilon": None,
+        "default_action": None,
+    }

     class_name, smab_state = smab.get_state()
     assert class_name == "SmabBernoulliMOCC"
@@ -802,3 +832,86 @@ def test_smab_mo_cc_from_state(state):
     # Ensure get_state and from_state compatibility
     new_smab = globals()[smab.get_state()[0]].from_state(state=smab.get_state()[1])
     assert new_smab == smab
+
+
+########################################################################################################################
+
+
+# Smab with epsilon-greedy super strategy
+
+
+@given(
+    st.integers(min_value=1),
+    st.integers(min_value=1),
+)
+def test_can_instantiate_epsilon_greedy_smab_with_params(a, b):
+    s = SmabBernoulli(
+        actions={
+            "action1": Beta(n_successes=a, n_failures=b),
+            "action2": Beta(n_successes=a, n_failures=b),
+        },
+        epsilon=0.1,
+        default_action="action1",
+    )
+    assert (s.actions["action1"].n_successes == a) and (s.actions["action1"].n_failures == b)
+    assert s.actions["action1"] == s.actions["action2"]
+
+
+def test_epsilon_greedy_smab_predict():
+    n_samples = 1000
+
+    s = SmabBernoulli(
+        actions={
+            "a0": Beta(),
+            "a1": Beta(n_successes=5, n_failures=5),
+            "forb_1": Beta(n_successes=10, n_failures=1),
+            "best": Beta(n_successes=10, n_failures=5),
+            "forb_2": Beta(n_successes=100, n_failures=4),
+            "a5": Beta(),
+        },
+        epsilon=0.1,
+        default_action="a1",
+    )
+    forbidden_actions = set(["forb_1", "forb_2"])
+
+    _, _ = s.predict(n_samples=n_samples, forbidden_actions=forbidden_actions)
+
+
+def test_epsilon_greedy_smabbai_predict():
+    n_samples = 1000
+    s = SmabBernoulliBAI(actions={"a1": Beta(), "a2": Beta()}, epsilon=0.1, default_action="a1")
+    _, _ = s.predict(n_samples=n_samples)
+
+
+def test_epsilon_greedy_smabcc_predict():
+    n_samples = 1000
+    s = SmabBernoulliCC(
+        actions={
+            "a1": BetaCC(n_successes=1, n_failures=2, cost=10),
+            "a2": BetaCC(n_successes=3, n_failures=4, cost=20),
+        },
+        subsidy_factor=0.7,
+        epsilon=0.1,
+        default_action="a1",
+    )
+    _, _ = s.predict(n_samples=n_samples)
+
+
+def test_epsilon_greedy_smab_mo_predict():
+    n_samples = 1000
+
+    s = create_smab_bernoulli_mo_cold_start(action_ids=["a1", "a2"], n_objectives=3, epsilon=0.1, default_action="a1")
+
+    forbidden = None
+    s.predict(n_samples=n_samples, forbidden_actions=forbidden)
+
+
+def test_epsilon_greedy_smab_mo_cc_predict():
+    n_samples = 1000
+
+    s = create_smab_bernoulli_mo_cc_cold_start(
+        action_ids_cost={"a1": 1, "a2": 2}, n_objectives=2, epsilon=0.1, default_action="a1"
+    )
+
+    forbidden = None
+    s.predict(n_samples=n_samples, forbidden_actions=forbidden)
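
Reviewer note (not part of the diff): the core of this change is the new BaseMab._select_epsilon_greedy_action in base.py. Below is a minimal, self-contained sketch of the rule it implements, assuming only numpy; epsilon_greedy_select and select_action are illustrative stand-ins, with select_action playing the role of self.strategy.select_action. With probability epsilon the wrapper returns default_action (or a uniformly random action when no default is set), otherwise it defers to the wrapped strategy.

    import numpy as np
    from typing import Callable, Dict, Optional

    def epsilon_greedy_select(
        p: Dict[str, float],
        select_action: Callable[[Dict[str, float]], str],
        epsilon: Optional[float] = None,
        default_action: Optional[str] = None,
    ) -> str:
        # No epsilon configured: plain delegation to the wrapped strategy.
        if not epsilon:
            return select_action(p)
        if default_action and default_action not in p:
            raise KeyError(f"Default action {default_action} not in actions.")
        # Exploration branch, taken with probability epsilon.
        if np.random.binomial(1, epsilon):
            return default_action if default_action else np.random.choice(list(p.keys()))
        # Exploitation branch: the native strategy picks the action.
        return select_action(p)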
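And a short usage sketch of the new public parameters (illustrative values only; per the new check_default_action validator, default_action requires epsilon and must be one of the action ids):

    from pybandits.model import Beta
    from pybandits.smab import SmabBernoulli, create_smab_bernoulli_cold_start

    # Cold-start sMAB: with probability 0.1 the default action "a1" is returned,
    # otherwise Thompson Sampling selects the action.
    mab = create_smab_bernoulli_cold_start(action_ids={"a1", "a2"}, epsilon=0.1, default_action="a1")
    actions, probs = mab.predict(n_samples=3)

    # Same parameters on an already-initialized bandit; omitting default_action
    # makes the epsilon branch pick a uniformly random action instead.
    mab = SmabBernoulli(actions={"a1": Beta(), "a2": Beta()}, epsilon=0.1)
    actions, probs = mab.predict(n_samples=3)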