From 5d6c5e10ab7956576f5b55c568546530d14fbc67 Mon Sep 17 00:00:00 2001
From: Shahar Bar
Date: Sun, 28 Jul 2024 09:53:13 +0300
Subject: [PATCH] Epsilon Greedy Strategy Wrapper

Change log:
1. Added pytest_mock for mocking in pytest tests in pyproject.toml
2. Added _select_epsilon_greedy_action to BaseMab in base.py. The method wraps the strategy's native select_action with an epsilon-greedy approach.
3. Added epsilon and default_action to all smab.py and cmab.py classes and cold start methods
4. Added test suite for the epsilon-greedy functionality
---
 pybandits/base.py     |  61 ++++++++++++++++++++
 pybandits/cmab.py     |  66 +++++++++++++++++----
 pybandits/smab.py     | 109 ++++++++++++++++++++++++++++-------
 pybandits/strategy.py |   1 +
 pyproject.toml        |   1 +
 tests/test_base.py    |  65 +++++++++++++++++++--
 tests/test_cmab.py    |  71 ++++++++++++++++++++++-
 tests/test_smab.py    | 129 +++++++++++++++++++++++++++++++++++++++---
 8 files changed, 457 insertions(+), 46 deletions(-)

diff --git a/pybandits/base.py b/pybandits/base.py
index 818599f..bd40524 100644
--- a/pybandits/base.py
+++ b/pybandits/base.py
@@ -24,6 +24,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, NewType, Optional, Set, Tuple, Union
 
+import numpy as np
 from pydantic import (
     BaseModel,
     Extra,
@@ -31,6 +32,7 @@
     confloat,
     conint,
     constr,
+    root_validator,
     validate_arguments,
     validator,
 )
@@ -91,10 +93,14 @@ class BaseMab(PyBanditsBaseModel, ABC):
         The list of possible actions, and their associated Model.
     strategy: Strategy
         The strategy used to select actions.
+    epsilon: Optional[Float01], defaults to None
+        The probability of selecting a random action.
     """

     actions: Dict[ActionId, Model]
     strategy: Strategy
+    epsilon: Optional[Float01]
+    default_action: Optional[ActionId]

     @validator("actions", pre=True)
     @classmethod
@@ -103,6 +109,14 @@ def at_least_2_actions_are_defined(cls, v):
             raise AttributeError("At least 2 actions should be defined.")
         return v

+    @root_validator
+    def check_default_action(cls, values):
+        if not values["epsilon"] and values["default_action"]:
+            raise AttributeError("A default action should only be defined when epsilon is defined.")
+        if values["default_action"] and values["default_action"] not in values["actions"]:
+            raise AttributeError("The default action should be defined in the actions.")
+        return values
+
     def _get_valid_actions(self, forbidden_actions: Optional[Set[ActionId]]) -> Set[ActionId]:
         """
         Given a set of forbidden action IDs, return a set of valid action IDs.
@@ -125,6 +139,8 @@ def _get_valid_actions(self, forbidden_actions: Optional[Set[ActionId]]) -> Set[
         valid_actions = set(self.actions.keys()) - forbidden_actions
         if len(valid_actions) == 0:
             raise ValueError("All actions are forbidden. You must allow at least 1 action.")
You must allow at least 1 action.") + if self.default_action and self.default_action not in valid_actions: + raise ValueError("The default action is forbidden.") return valid_actions @@ -201,3 +217,48 @@ def get_state(self) -> (str, dict): model_name = self.__class__.__name__ state: dict = self.dict() return model_name, state + + @validate_arguments + def _select_epsilon_greedy_action( + self, + p: Union[Dict[ActionId, float], Dict[ActionId, Probability], Dict[ActionId, List[Probability]]], + actions: Optional[Dict[ActionId, Model]] = None, + ) -> ActionId: + """ + Wraps self.strategy.select_action function with epsilon-greedy strategy, + such that with probability epsilon a default_action is selected, + and with probability 1-epsilon the select_action function is triggered to choose action. + If no default_action is provided, a random action is selected. + + Reference: Reinforcement Learning: An Introduction, Ch. 2 (Sutton and Burto, 2018) + https://web.stanford.edu/class/psych209/Readings/SuttonBartoIPRLBook2ndEd.pdf&ved=2ahUKEwjMy8WV9N2HAxVe0gIHHVjjG5sQFnoECEMQAQ&usg=AOvVaw3bKK-Y_1kf6XQVwR-UYrBY + + Parameters + ---------- + p: Union[Dict[ActionId, float], Dict[ActionId, Probability], Dict[ActionId, List[Probability]]] + The dictionary or actions and their sampled probability of getting a positive reward. + For MO strategy, the sampled probability is a list with elements corresponding to the objectives. + actions: Optional[Dict[ActionId, Model]] + The dictionary of actions and their associated Model. + + Returns + ------- + selected_action: ActionId + The selected action. + + Raises + ------ + KeyError + If self.default_action is not present as a key in the probabilities dictionary. + """ + + if self.epsilon: + if self.default_action and self.default_action not in p.keys(): + raise KeyError(f"Default action {self.default_action} not in actions.") + if np.random.binomial(1, self.epsilon): + selected_action = self.default_action if self.default_action else np.random.choice(list(p.keys())) + else: + selected_action = self.strategy.select_action(p=p, actions=actions) + else: + selected_action = self.strategy.select_action(p=p, actions=actions) + return selected_action diff --git a/pybandits/cmab.py b/pybandits/cmab.py index 1268a11..83a7fc4 100644 --- a/pybandits/cmab.py +++ b/pybandits/cmab.py @@ -138,7 +138,7 @@ def predict( p_to_select_action = prob if self.predict_with_proba else ws # predict actions, probs, weighted_sums - selected_actions.append(self.strategy.select_action(p=p_to_select_action, actions=self.actions)) + selected_actions.append(self._select_epsilon_greedy_action(p=p_to_select_action, actions=self.actions)) probs.append(prob) weighted_sums.append(ws) @@ -212,8 +212,13 @@ class CmabBernoulli(BaseCmabBernoulli): predict_with_proba: bool = False predict_actions_randomly: bool = False - def __init__(self, actions: Dict[ActionId, BaseBayesianLogisticRegression]): - super().__init__(actions=actions, strategy=ClassicBandit()) + def __init__( + self, + actions: Dict[ActionId, BaseBayesianLogisticRegression], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + ): + super().__init__(actions=actions, strategy=ClassicBandit(), epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "CmabBernoulli": @@ -249,9 +254,15 @@ class CmabBernoulliBAI(BaseCmabBernoulli): predict_with_proba: bool = False predict_actions_randomly: bool = False - def __init__(self, actions: Dict[ActionId, BayesianLogisticRegression], 
exploit_p: Optional[Float01] = None): + def __init__( + self, + actions: Dict[ActionId, BayesianLogisticRegression], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + exploit_p: Optional[Float01] = None, + ): strategy = BestActionIdentification() if exploit_p is None else BestActionIdentification(exploit_p=exploit_p) - super().__init__(actions=actions, strategy=strategy) + super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "CmabBernoulliBAI": @@ -296,9 +307,15 @@ class CmabBernoulliCC(BaseCmabBernoulli): predict_with_proba: bool = True predict_actions_randomly: bool = False - def __init__(self, actions: Dict[ActionId, BayesianLogisticRegressionCC], subsidy_factor: Optional[Float01] = None): + def __init__( + self, + actions: Dict[ActionId, BayesianLogisticRegressionCC], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + subsidy_factor: Optional[Float01] = None, + ): strategy = CostControlBandit() if subsidy_factor is None else CostControlBandit(subsidy_factor=subsidy_factor) - super().__init__(actions=actions, strategy=strategy) + super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "CmabBernoulliCC": @@ -310,7 +327,12 @@ def update(self, context: ArrayLike, actions: List[ActionId], rewards: List[Bina @validate_arguments -def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: PositiveInt) -> CmabBernoulli: +def create_cmab_bernoulli_cold_start( + action_ids: Set[ActionId], + n_features: PositiveInt, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, +) -> CmabBernoulli: """ Utility function to create a Contextual Bernoulli Multi-Armed Bandit with Thompson Sampling, with default parameters. Until the very first update the model will predict actions randomly, where each action has equal @@ -323,6 +345,10 @@ def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: Posi n_features: PositiveInt The number of features expected after in the context matrix. This is also the number of betas of the Bayesian Logistic Regression model. + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- cmab: CmabBernoulli @@ -331,14 +357,18 @@ def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: Posi actions = {} for a in set(action_ids): actions[a] = create_bayesian_logistic_regression_cold_start(n_betas=n_features) - mab = CmabBernoulli(actions=actions) + mab = CmabBernoulli(actions=actions, epsilon=epsilon, default_action=default_action) mab.predict_actions_randomly = True return mab @validate_arguments def create_cmab_bernoulli_bai_cold_start( - action_ids: Set[ActionId], n_features: PositiveInt, exploit_p: Optional[Float01] = None + action_ids: Set[ActionId], + n_features: PositiveInt, + exploit_p: Optional[Float01] = None, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, ) -> CmabBernoulliBAI: """ Utility function to create a Contextual Bernoulli Multi-Armed Bandit with Thompson Sampling, and Best Action @@ -361,6 +391,10 @@ def create_cmab_bernoulli_bai_cold_start( (it behaves as a Greedy strategy). 
If exploit_p is 0, the bandits always select the action with 2nd highest probability of getting a positive reward. + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- @@ -370,7 +404,7 @@ def create_cmab_bernoulli_bai_cold_start( actions = {} for a in set(action_ids): actions[a] = create_bayesian_logistic_regression_cold_start(n_betas=n_features) - mab = CmabBernoulliBAI(actions=actions, exploit_p=exploit_p) + mab = CmabBernoulliBAI(actions=actions, exploit_p=exploit_p, epsilon=epsilon, default_action=default_action) mab.predict_actions_randomly = True return mab @@ -380,6 +414,8 @@ def create_cmab_bernoulli_cc_cold_start( action_ids_cost: Dict[ActionId, NonNegativeFloat], n_features: PositiveInt, subsidy_factor: Optional[Float01] = None, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, ) -> CmabBernoulliCC: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, and Cost Control @@ -408,6 +444,10 @@ def create_cmab_bernoulli_cc_cold_start( If subsidy_factor is 1, the bandits always selects the action with the minimum cost. If subsidy_factor is 0, the bandits always selects the action with highest probability of getting a positive reward (it behaves as a classic Bernoulli bandit). + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- @@ -417,6 +457,8 @@ def create_cmab_bernoulli_cc_cold_start( actions = {} for a, cost in action_ids_cost.items(): actions[a] = create_bayesian_logistic_regression_cc_cold_start(n_betas=n_features, cost=cost) - mab = CmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor) + mab = CmabBernoulliCC( + actions=actions, subsidy_factor=subsidy_factor, epsilon=epsilon, default_action=default_action + ) mab.predict_actions_randomly = True return mab diff --git a/pybandits/smab.py b/pybandits/smab.py index b72a319..3dddd37 100644 --- a/pybandits/smab.py +++ b/pybandits/smab.py @@ -89,7 +89,7 @@ def predict( for _ in range(n_samples): p = {action: model.sample_proba() for action, model in self.actions.items() if action in valid_actions} - selected_actions.append(self.strategy.select_action(p=p, actions=self.actions)) + selected_actions.append(self._select_epsilon_greedy_action(p=p, actions=self.actions)) probs.append(p) return selected_actions, probs @@ -144,8 +144,13 @@ class SmabBernoulli(BaseSmabBernoulli): actions: Dict[ActionId, Beta] strategy: ClassicBandit - def __init__(self, actions: Dict[ActionId, Beta]): - super().__init__(actions=actions, strategy=ClassicBandit()) + def __init__( + self, + actions: Dict[ActionId, Beta], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + ): + super().__init__(actions=actions, strategy=ClassicBandit(), epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "SmabBernoulli": @@ -174,9 +179,15 @@ class SmabBernoulliBAI(BaseSmabBernoulli): actions: Dict[ActionId, Beta] strategy: BestActionIdentification - def __init__(self, actions: Dict[ActionId, Beta], exploit_p: Optional[Float01] = None): + def __init__( + self, + actions: Dict[ActionId, Beta], + epsilon: Optional[Float01] = None, + 
default_action: Optional[ActionId] = None, + exploit_p: Optional[Float01] = None, + ): strategy = BestActionIdentification() if exploit_p is None else BestActionIdentification(exploit_p=exploit_p) - super().__init__(actions=actions, strategy=strategy) + super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "SmabBernoulliBAI": @@ -213,9 +224,15 @@ class SmabBernoulliCC(BaseSmabBernoulli): actions: Dict[ActionId, BetaCC] strategy: CostControlBandit - def __init__(self, actions: Dict[ActionId, BetaCC], subsidy_factor: Optional[Float01] = None): + def __init__( + self, + actions: Dict[ActionId, BetaCC], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + subsidy_factor: Optional[Float01] = None, + ): strategy = CostControlBandit() if subsidy_factor is None else CostControlBandit(subsidy_factor=subsidy_factor) - super().__init__(actions=actions, strategy=strategy) + super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action) @classmethod def from_state(cls, state: dict) -> "SmabBernoulliCC": @@ -278,8 +295,15 @@ class SmabBernoulliMO(BaseSmabBernoulliMO): actions: Dict[ActionId, BetaMO] strategy: MultiObjectiveBandit - def __init__(self, actions: Dict[ActionId, Beta]): - super().__init__(actions=actions, strategy=MultiObjectiveBandit()) + def __init__( + self, + actions: Dict[ActionId, Beta], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + ): + super().__init__( + actions=actions, strategy=MultiObjectiveBandit(), epsilon=epsilon, default_action=default_action + ) @classmethod def from_state(cls, state: dict) -> "SmabBernoulliMO": @@ -305,8 +329,15 @@ class SmabBernoulliMOCC(BaseSmabBernoulliMO): actions: Dict[ActionId, BetaMOCC] strategy: MultiObjectiveCostControlBandit - def __init__(self, actions: Dict[ActionId, Beta]): - super().__init__(actions=actions, strategy=MultiObjectiveCostControlBandit()) + def __init__( + self, + actions: Dict[ActionId, Beta], + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, + ): + super().__init__( + actions=actions, strategy=MultiObjectiveCostControlBandit(), epsilon=epsilon, default_action=default_action + ) @classmethod def from_state(cls, state: dict) -> "SmabBernoulliMOCC": @@ -314,7 +345,9 @@ def from_state(cls, state: dict) -> "SmabBernoulliMOCC": @validate_arguments -def create_smab_bernoulli_cold_start(action_ids: Set[ActionId]) -> SmabBernoulli: +def create_smab_bernoulli_cold_start( + action_ids: Set[ActionId], epsilon: Optional[Float01] = None, default_action: Optional[ActionId] = None +) -> SmabBernoulli: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, with default parameters. @@ -323,6 +356,10 @@ def create_smab_bernoulli_cold_start(action_ids: Set[ActionId]) -> SmabBernoulli ---------- action_ids: Set[ActionId] The list of possible actions. + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. 
Returns ------- @@ -332,12 +369,15 @@ def create_smab_bernoulli_cold_start(action_ids: Set[ActionId]) -> SmabBernoulli actions = {} for a in set(action_ids): actions[a] = Beta() - return SmabBernoulli(actions=actions) + return SmabBernoulli(actions=actions, epsilon=epsilon, default_action=default_action) @validate_arguments def create_smab_bernoulli_bai_cold_start( - action_ids: Set[ActionId], exploit_p: Optional[Float01] = None + action_ids: Set[ActionId], + exploit_p: Optional[Float01] = None, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, ) -> SmabBernoulliBAI: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, and Best Action @@ -356,6 +396,10 @@ def create_smab_bernoulli_bai_cold_start( (it behaves as a Greedy strategy). If exploit_p is 0, the bandits always select the action with 2nd highest probability of getting a positive reward. + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- @@ -365,13 +409,15 @@ def create_smab_bernoulli_bai_cold_start( actions = {} for a in set(action_ids): actions[a] = Beta() - return SmabBernoulliBAI(actions=actions, exploit_p=exploit_p) + return SmabBernoulliBAI(actions=actions, epsilon=epsilon, default_action=default_action, exploit_p=exploit_p) @validate_arguments def create_smab_bernoulli_cc_cold_start( action_ids_cost: Dict[ActionId, NonNegativeFloat], subsidy_factor: Optional[Float01] = None, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, ) -> SmabBernoulliCC: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, and Cost Control @@ -397,6 +443,10 @@ def create_smab_bernoulli_cc_cold_start( If subsidy_factor is 1, the bandits always selects the action with the minimum cost. If subsidy_factor is 0, the bandits always selects the action with highest probability of getting a positive reward (it behaves as a classic Bernoulli bandit). + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- @@ -406,11 +456,18 @@ def create_smab_bernoulli_cc_cold_start( actions = {} for a, cost in action_ids_cost.items(): actions[a] = BetaCC(cost=cost) - return SmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor) + return SmabBernoulliCC( + actions=actions, epsilon=epsilon, default_action=default_action, subsidy_factor=subsidy_factor + ) @validate_arguments -def create_smab_bernoulli_mo_cold_start(action_ids: Set[ActionId], n_objectives: PositiveInt) -> SmabBernoulliMO: +def create_smab_bernoulli_mo_cold_start( + action_ids: Set[ActionId], + n_objectives: PositiveInt, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, +) -> SmabBernoulliMO: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, and Multi-Objectives strategy, with default parameters. @@ -429,6 +486,10 @@ def create_smab_bernoulli_mo_cold_start(action_ids: Set[ActionId], n_objectives: The list of possible actions. n_objectives: PositiveInt The number of objectives to optimize. The bandit assumes the same number of objectives for all actions. 
+ epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. Returns ------- @@ -438,12 +499,15 @@ def create_smab_bernoulli_mo_cold_start(action_ids: Set[ActionId], n_objectives: actions = {} for a in set(action_ids): actions[a] = BetaMO(counters=n_objectives * [Beta()]) - return SmabBernoulliMO(actions=actions) + return SmabBernoulliMO(actions=actions, epsilon=epsilon, default_action=default_action) @validate_arguments def create_smab_bernoulli_mo_cc_cold_start( - action_ids_cost: Dict[ActionId, NonNegativeFloat], n_objectives: PositiveInt + action_ids_cost: Dict[ActionId, NonNegativeFloat], + n_objectives: PositiveInt, + epsilon: Optional[Float01] = None, + default_action: Optional[ActionId] = None, ) -> SmabBernoulliMOCC: """ Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling implementation for @@ -458,6 +522,11 @@ def create_smab_bernoulli_mo_cc_cold_start( The list of possible actions, and their cost. n_objectives: PositiveInt The number of objectives to optimize. The bandit assumes the same number of objectives for all actions. + epsilon: Optional[Float01] + epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used. + default_action: Optional[ActionId] + Default action to select if the epsilon-greedy approach is used. None for random selection. + Returns ------- @@ -467,4 +536,4 @@ def create_smab_bernoulli_mo_cc_cold_start( actions = {} for a, cost in action_ids_cost.items(): actions[a] = BetaMOCC(counters=n_objectives * [Beta()], cost=cost) - return SmabBernoulliMOCC(actions=actions) + return SmabBernoulliMOCC(actions=actions, epsilon=epsilon, default_action=default_action) diff --git a/pybandits/strategy.py b/pybandits/strategy.py index b18567d..ac59f9b 100644 --- a/pybandits/strategy.py +++ b/pybandits/strategy.py @@ -292,6 +292,7 @@ class MultiObjectiveBandit(Strategy): Reference: Thompson Sampling for Multi-Objective Multi-Armed Bandits Problem (Yahyaa and Manderick, 2015) https://www.researchgate.net/publication/272823659_Thompson_Sampling_for_Multi-Objective_Multi-Armed_Bandits_Problem + Parameters ---------- n_objectives: int diff --git a/pyproject.toml b/pyproject.toml index 15bb053..7d47381 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ ipykernel = "^6.21.3" jupyterlab = "^3.6.1" flake8-pyproject = "^1.2.2" pytest-cov = "^4.0.0" +pytest_mock = "^3.14.0" [build-system] requires = ["poetry-core"] diff --git a/tests/test_base.py b/tests/test_base.py index ecb6372..30f6424 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -20,14 +20,16 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from typing import List
+from typing import Dict, List, Optional, Set

 import hypothesis.strategies as st
+import numpy as np
 import pytest
 from hypothesis import given
 from pydantic import NonNegativeInt, ValidationError
+from pytest_mock import MockerFixture

-from pybandits.base import ActionId, BaseMab
+from pybandits.base import ActionId, BaseMab, Probability
 from pybandits.model import Beta
 from pybandits.strategy import ClassicBandit

@@ -41,8 +43,12 @@ def update(
         super().update(actions=actions, rewards=rewards)
         pass

-    def predict():
-        pass
+    def predict(
+        self,
+        forbidden_actions: Optional[Set[ActionId]] = None,
+    ):
+        valid_actions = self._get_valid_actions(forbidden_actions)
+        return np.random.choice(list(valid_actions))

     def get_state(self) -> (str, dict):
         model_name = self.__class__.__name__
@@ -82,3 +88,54 @@ def test_base_mab_update_ok(r1, r2):
     dummy_mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit())
     dummy_mab.update(actions=["a1", "a2"], rewards=[r1, r2])
     dummy_mab.update(actions=["a1", "a1"], rewards=[r1, r2])
+
+
+########################################################################################################################
+
+
+# Epsilon-greedy functionality tests
+
+
+@pytest.fixture
+def p() -> Dict[ActionId, Probability]:
+    return {"a1": 0.5, "a2": 0.5}
+
+
+def test_valid_epsilon_value(mocker: MockerFixture, p: Dict[ActionId, Probability]):
+    mocker.patch.object(ClassicBandit, "select_action", return_value="a2")
+    mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=0.1, default_action="a1")
+    selected_action = mab._select_epsilon_greedy_action(p)
+    assert selected_action in p.keys()
+
+
+def test_epsilon_boundary_values(mocker: MockerFixture, p: Dict[ActionId, Probability]):
+    mocker.patch.object(ClassicBandit, "select_action", return_value="a2")
+
+    mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=0.0)
+    selected_action = mab._select_epsilon_greedy_action(p)
+    assert selected_action == "a2"
+
+    mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=1.0, default_action="a1")
+    selected_action = mab._select_epsilon_greedy_action(p)
+    assert selected_action == "a1"
+
+
+def test_default_action_not_in_actions(p: Dict[ActionId, Probability]):
+    with pytest.raises(AttributeError):
+        DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=1.0, default_action="a3")
+
+
+def test_select_action_raises_exception(mocker: MockerFixture, p: Dict[ActionId, Probability]):
+    mocker.patch.object(ClassicBandit, "select_action", side_effect=Exception("Test Exception"))
+    mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=0.1, default_action="a1")
+
+    with pytest.raises(Exception) as excinfo:
+        mab._select_epsilon_greedy_action(p)
+
+    assert str(excinfo.value) == "Test Exception"
+
+
+def test_default_action_in_forbidden_actions():
+    mab = DummyMab(actions={"a1": Beta(), "a2": Beta()}, strategy=ClassicBandit(), epsilon=0.1, default_action="a1")
+    with pytest.raises(ValueError):
+        mab.predict(forbidden_actions={"a1"})
diff --git a/tests/test_cmab.py b/tests/test_cmab.py
index d2d25f4..cc97ad7 100644
--- a/tests/test_cmab.py
+++ b/tests/test_cmab.py
@@ -297,7 +297,8 @@ def run_predict(mab):
     with pytest.raises(ValueError):  # all actions forbidden
         assert set(mab.predict(context=context, forbidden_actions=["a1", "a2", "a3", "a4", "a5"])[0])
     with pytest.raises(ValueError):  # all actions forbidden
(unordered) - assert set(mab.predict(n_samples=1000, forbidden_actions=["a5", "a4", "a2", "a3", "a1"])[0]) + n_samples = 1000 + assert set(mab.predict(n_samples=n_samples, forbidden_actions=["a5", "a4", "a2", "a3", "a1"])[0]) # cold start mab mab = create_cmab_bernoulli_cold_start(action_ids=["a1", "a2", "a3", "a4", "a5"], n_features=n_features) @@ -328,7 +329,14 @@ def test_cmab_get_state(mu, sigma, n_features): cmab = CmabBernoulli(actions=actions) expected_state = json.loads( json.dumps( - {"actions": actions, "strategy": {}, "predict_with_proba": False, "predict_actions_randomly": False}, + { + "actions": actions, + "strategy": {}, + "predict_with_proba": False, + "predict_actions_randomly": False, + "epsilon": None, + "default_action": None, + }, default=dict, ) ) @@ -547,6 +555,8 @@ def test_cmab_bai_get_state(mu, sigma, n_features, exploit_p: Float01): "strategy": {"exploit_p": exploit_p}, "predict_with_proba": False, "predict_actions_randomly": False, + "epsilon": None, + "default_action": None, }, default=dict, ) @@ -790,6 +800,8 @@ def test_cmab_cc_get_state( "strategy": {"subsidy_factor": subsidy_factor}, "predict_with_proba": True, "predict_actions_randomly": False, + "epsilon": None, + "default_action": None, }, default=dict, ) @@ -857,3 +869,58 @@ def test_cmab_cc_from_state(state): # Ensure get_state and from_state compatibility new_cmab = globals()[cmab.get_state()[0]].from_state(state=cmab.get_state()[1]) assert new_cmab == cmab + + +######################################################################################################################## + + +# Cmab with epsilon-greedy super strategy + + +@settings(deadline=500) +@given(st.integers(min_value=1, max_value=1000), st.integers(min_value=1, max_value=100)) +def test_epsilon_greedy_cmab_predict_cold_start(n_samples, n_features): + context = np.random.uniform(low=-1.0, high=1.0, size=(n_samples, n_features)) + + mab = create_cmab_bernoulli_cold_start( + action_ids=["a1", "a2"], n_features=n_features, epsilon=0.1, default_action="a1" + ) + selected_actions, probs, weighted_sums = mab.predict(context=context) + assert mab.predict_actions_randomly + assert all([a in ["a1", "a2"] for a in selected_actions]) + assert len(selected_actions) == n_samples + assert probs == n_samples * [{"a1": 0.5, "a2": 0.5}] + assert weighted_sums == n_samples * [{"a1": 0, "a2": 0}] + + +@settings(deadline=500) +@given(st.integers(min_value=1, max_value=100), st.integers(min_value=1, max_value=3)) +def test_epsilon_greedy_cmab_bai_predict(n_samples, n_features): + context = np.random.uniform(low=-1.0, high=1.0, size=(n_samples, n_features)) + + mab = create_cmab_bernoulli_bai_cold_start( + action_ids=["a1", "a2"], n_features=n_features, epsilon=0.1, default_action="a1" + ) + selected_actions, probs, weighted_sums = mab.predict(context=context) + assert mab.predict_actions_randomly + assert all([a in ["a1", "a2"] for a in selected_actions]) + assert len(selected_actions) == n_samples + assert probs == n_samples * [{"a1": 0.5, "a2": 0.5}] + assert weighted_sums == n_samples * [{"a1": 0, "a2": 0}] + + +@settings(deadline=500) +@given(st.integers(min_value=1, max_value=100), st.integers(min_value=1, max_value=3)) +def test_epsilon_greedy_cmab_cc_predict(n_samples, n_features): + context = np.random.uniform(low=-1.0, high=1.0, size=(n_samples, n_features)) + + # cold start + mab = create_cmab_bernoulli_cc_cold_start( + action_ids_cost={"a1": 10, "a2": 20.5}, n_features=n_features, epsilon=0.1, default_action="a1" + ) + selected_actions, probs, 
weighted_sums = mab.predict(context=context) + assert mab.predict_actions_randomly + assert all([a in ["a1", "a2"] for a in selected_actions]) + assert len(selected_actions) == n_samples + assert probs == n_samples * [{"a1": 0.5, "a2": 0.5}] + assert weighted_sums == n_samples * [{"a1": 0, "a2": 0}] diff --git a/tests/test_smab.py b/tests/test_smab.py index 52347de..39c03e0 100644 --- a/tests/test_smab.py +++ b/tests/test_smab.py @@ -131,6 +131,7 @@ def test_smab_predict_raise_when_all_actions_forbidden(): def test_smab_predict(): + n_samples = 1000 s = SmabBernoulli( actions={ "a0": Beta(), @@ -143,7 +144,7 @@ def test_smab_predict(): ) forbidden_actions = set(["forb_1", "forb_2"]) - best_actions, probs = s.predict(n_samples=1000, forbidden_actions=forbidden_actions) + best_actions, probs = s.predict(n_samples=n_samples, forbidden_actions=forbidden_actions) assert ["forb1" not in p.keys() for p in probs], "forbidden actions weren't removed from the output" valid_actions = set(s.actions.keys()) - forbidden_actions @@ -206,7 +207,12 @@ def test_smab_get_state(a, b, c, d): actions = {"action1": Beta(n_successes=a, n_failures=b), "action2": Beta(n_successes=c, n_failures=d)} smab = SmabBernoulli(actions=actions) - expected_state = {"actions": actions, "strategy": {}} + expected_state = { + "actions": actions, + "strategy": {}, + "epsilon": None, + "default_action": None, + } class_name, smab_state = smab.get_state() assert class_name == "SmabBernoulli" @@ -288,8 +294,9 @@ def test_can_init_smabbai(): def test_smabbai_predict(): + n_samples = 1000 s = SmabBernoulliBAI(actions={"a1": Beta(), "a2": Beta()}) - _, _ = s.predict(n_samples=1000) + _, _ = s.predict(n_samples=n_samples) def test_smabbai_update(): @@ -318,7 +325,12 @@ def test_smabbai_with_betacc(): def test_smab_bai_get_state(a, b, c, d, exploit_p: Float01): actions = {"action1": Beta(n_successes=a, n_failures=b), "action2": Beta(n_successes=c, n_failures=d)} smab = SmabBernoulliBAI(actions=actions, exploit_p=exploit_p) - expected_state = {"actions": actions, "strategy": {"exploit_p": exploit_p}} + expected_state = { + "actions": actions, + "strategy": {"exploit_p": exploit_p}, + "epsilon": None, + "default_action": None, + } class_name, smab_state = smab.get_state() assert class_name == "SmabBernoulliBAI" @@ -413,6 +425,7 @@ def test_can_init_smabcc(): def test_smabcc_predict(): + n_samples = 1000 s = SmabBernoulliCC( actions={ "a1": BetaCC(n_successes=1, n_failures=2, cost=10), @@ -420,7 +433,7 @@ def test_smabcc_predict(): }, subsidy_factor=0.7, ) - _, _ = s.predict(n_samples=1000) + _, _ = s.predict(n_samples=n_samples) def test_smabcc_update(): @@ -443,7 +456,14 @@ def test_smab_cc_get_state(a, b, c, d, cost1: NonNegativeFloat, cost2: NonNegati "action2": BetaCC(n_successes=c, n_failures=d, cost=cost2), } smab = SmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor) - expected_state = {"actions": actions, "strategy": {"subsidy_factor": subsidy_factor}} + expected_state = { + "actions": actions, + "strategy": { + "subsidy_factor": subsidy_factor, + }, + "epsilon": None, + "default_action": None, + } class_name, smab_state = smab.get_state() assert class_name == "SmabBernoulliCC" @@ -600,7 +620,12 @@ def test_smab_mo_get_state(a_list): ), } smab = SmabBernoulliMO(actions=actions) - expected_state = {"actions": actions, "strategy": {}} + expected_state = { + "actions": actions, + "strategy": {}, + "epsilon": None, + "default_action": None, + } class_name, smab_state = smab.get_state() assert class_name == "SmabBernoulliMO" 
@@ -756,7 +781,12 @@ def test_smab_mocc_get_state(a_list):
         ),
     }
     smab = SmabBernoulliMOCC(actions=actions)
-    expected_state = {"actions": actions, "strategy": {}}
+    expected_state = {
+        "actions": actions,
+        "strategy": {},
+        "epsilon": None,
+        "default_action": None,
+    }

     class_name, smab_state = smab.get_state()
     assert class_name == "SmabBernoulliMOCC"
@@ -802,3 +832,86 @@ def test_smab_mo_cc_from_state(state):
     # Ensure get_state and from_state compatibility
     new_smab = globals()[smab.get_state()[0]].from_state(state=smab.get_state()[1])
     assert new_smab == smab
+
+
+########################################################################################################################
+
+
+# Smab with epsilon-greedy super strategy
+
+
+@given(
+    st.integers(min_value=1),
+    st.integers(min_value=1),
+)
+def test_can_instantiate_epsilon_greedy_smab_with_params(a, b):
+    s = SmabBernoulli(
+        actions={
+            "action1": Beta(n_successes=a, n_failures=b),
+            "action2": Beta(n_successes=a, n_failures=b),
+        },
+        epsilon=0.1,
+        default_action="action1",
+    )
+    assert (s.actions["action1"].n_successes == a) and (s.actions["action1"].n_failures == b)
+    assert s.actions["action1"] == s.actions["action2"]
+
+
+def test_epsilon_greedy_smab_predict():
+    n_samples = 1000
+
+    s = SmabBernoulli(
+        actions={
+            "a0": Beta(),
+            "a1": Beta(n_successes=5, n_failures=5),
+            "forb_1": Beta(n_successes=10, n_failures=1),
+            "best": Beta(n_successes=10, n_failures=5),
+            "forb_2": Beta(n_successes=100, n_failures=4),
+            "a5": Beta(),
+        },
+        epsilon=0.1,
+        default_action="a1",
+    )
+    forbidden_actions = set(["forb_1", "forb_2"])
+
+    _, _ = s.predict(n_samples=n_samples, forbidden_actions=forbidden_actions)
+
+
+def test_epsilon_greedy_smabbai_predict():
+    n_samples = 1000
+    s = SmabBernoulliBAI(actions={"a1": Beta(), "a2": Beta()}, epsilon=0.1, default_action="a1")
+    _, _ = s.predict(n_samples=n_samples)
+
+
+def test_epsilon_greedy_smabcc_predict():
+    n_samples = 1000
+    s = SmabBernoulliCC(
+        actions={
+            "a1": BetaCC(n_successes=1, n_failures=2, cost=10),
+            "a2": BetaCC(n_successes=3, n_failures=4, cost=20),
+        },
+        subsidy_factor=0.7,
+        epsilon=0.1,
+        default_action="a1",
+    )
+    _, _ = s.predict(n_samples=n_samples)
+
+
+def test_epsilon_greedy_smab_mo_predict():
+    n_samples = 1000
+
+    s = create_smab_bernoulli_mo_cold_start(action_ids=["a1", "a2"], n_objectives=3, epsilon=0.1, default_action="a1")
+
+    forbidden = None
+    s.predict(n_samples=n_samples, forbidden_actions=forbidden)
+
+
+def test_epsilon_greedy_smab_mo_cc_predict():
+    n_samples = 1000
+
+    s = create_smab_bernoulli_mo_cc_cold_start(
+        action_ids_cost={"a1": 1, "a2": 2}, n_objectives=2, epsilon=0.1, default_action="a1"
+    )
+
+    forbidden = None
+    s.predict(n_samples=n_samples, forbidden_actions=forbidden)
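
Reviewer note (not part of the diff): the core of this change is the new BaseMab._select_epsilon_greedy_action in base.py. Below is a minimal, self-contained sketch of the rule it implements, assuming only numpy; epsilon_greedy_select and select_action are illustrative stand-ins, with select_action playing the role of self.strategy.select_action. With probability epsilon the wrapper returns default_action (or a uniformly random action when no default is set), otherwise it defers to the wrapped strategy.

    import numpy as np
    from typing import Callable, Dict, Optional

    def epsilon_greedy_select(
        p: Dict[str, float],
        select_action: Callable[[Dict[str, float]], str],
        epsilon: Optional[float] = None,
        default_action: Optional[str] = None,
    ) -> str:
        # No epsilon configured: plain delegation to the wrapped strategy.
        if not epsilon:
            return select_action(p)
        if default_action and default_action not in p:
            raise KeyError(f"Default action {default_action} not in actions.")
        # Exploration branch, taken with probability epsilon.
        if np.random.binomial(1, epsilon):
            return default_action if default_action else np.random.choice(list(p.keys()))
        # Exploitation branch: the native strategy picks the action.
        return select_action(p)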
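And a short usage sketch of the new public parameters (illustrative values only; per the new check_default_action validator, default_action requires epsilon and must be one of the action ids):

    from pybandits.model import Beta
    from pybandits.smab import SmabBernoulli, create_smab_bernoulli_cold_start

    # Cold-start sMAB: with probability 0.1 the default action "a1" is returned,
    # otherwise Thompson Sampling selects the action.
    mab = create_smab_bernoulli_cold_start(action_ids={"a1", "a2"}, epsilon=0.1, default_action="a1")
    actions, probs = mab.predict(n_samples=3)

    # Same parameters on an already-initialized bandit; omitting default_action
    # makes the epsilon branch pick a uniformly random action instead.
    mab = SmabBernoulli(actions={"a1": Beta(), "a2": Beta()}, epsilon=0.1)
    actions, probs = mab.predict(n_samples=3)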