Epsilon Greedy Strategy Wrapper
 Change log:
 1. Added pytest_mock to pyproject.toml for mocking in the pytest suite
 2. Added _make_epsilon_greedy functionality as a static method of Strategy in base.py. The method wraps the strategy's native select_action with an epsilon-greedy approach.
 3. Added epsilon and default_action to all classes and cold-start methods in smab.py and cmab.py
 4. Added a test suite for the epsilon-greedy functionality
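For context, a minimal usage sketch of the new parameters, based on the create_cmab_bernoulli_cold_start signature added in this commit (the action ids and feature count are illustrative):

from pybandits.cmab import create_cmab_bernoulli_cold_start

# With probability epsilon the wrapper overrides the strategy and returns
# default_action (or a uniformly random action when default_action is None);
# otherwise the strategy's own select_action picks the action.
mab = create_cmab_bernoulli_cold_start(
    action_ids={"a1", "a2"},
    n_features=3,
    epsilon=0.1,
    default_action="a1",
)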
Shahar-Bar committed Aug 7, 2024
1 parent b3d295b commit 5d6c5e1
Showing 8 changed files with 457 additions and 46 deletions.
61 changes: 61 additions & 0 deletions pybandits/base.py
@@ -24,13 +24,15 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, List, NewType, Optional, Set, Tuple, Union

import numpy as np
from pydantic import (
BaseModel,
Extra,
NonNegativeInt,
confloat,
conint,
constr,
root_validator,
validate_arguments,
validator,
)
@@ -91,10 +93,14 @@ class BaseMab(PyBanditsBaseModel, ABC):
The list of possible actions, and their associated Model.
strategy: Strategy
The strategy used to select actions.
epsilon: Optional[Float01], defaults to None
The probability of selecting a random action (epsilon-greedy); if a default_action is set, that action is selected instead.
default_action: Optional[ActionId], defaults to None
The action to select with probability epsilon; requires epsilon to be set.
"""

actions: Dict[ActionId, Model]
strategy: Strategy
epsilon: Optional[Float01]
default_action: Optional[ActionId]

@validator("actions", pre=True)
@classmethod
@@ -103,6 +109,14 @@ def at_least_2_actions_are_defined(cls, v):
raise AttributeError("At least 2 actions should be defined.")
return v

@root_validator
def check_default_action(cls, values):
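# default_action may only be set when epsilon is set, and must be one of the configured actions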
if not values["epsilon"] and values["default_action"]:
raise AttributeError("A default action should only be defined when epsilon is defined.")
if values["default_action"] and values["default_action"] not in values["actions"]:
raise AttributeError("The default action should be defined in the actions.")
return values

def _get_valid_actions(self, forbidden_actions: Optional[Set[ActionId]]) -> Set[ActionId]:
"""
Given a set of forbidden action IDs, return a set of valid action IDs.
@@ -125,6 +139,8 @@ def _get_valid_actions(self, forbidden_actions: Optional[Set[ActionId]]) -> Set[
valid_actions = set(self.actions.keys()) - forbidden_actions
if len(valid_actions) == 0:
raise ValueError("All actions are forbidden. You must allow at least 1 action.")
if self.default_action and self.default_action not in valid_actions:
raise ValueError("The default action is forbidden.")

return valid_actions

@@ -201,3 +217,48 @@ def get_state(self) -> (str, dict):
model_name = self.__class__.__name__
state: dict = self.dict()
return model_name, state

@validate_arguments
def _select_epsilon_greedy_action(
self,
p: Union[Dict[ActionId, float], Dict[ActionId, Probability], Dict[ActionId, List[Probability]]],
actions: Optional[Dict[ActionId, Model]] = None,
) -> ActionId:
"""
Wraps the self.strategy.select_action function with an epsilon-greedy strategy,
such that with probability epsilon the default_action is selected,
and with probability 1-epsilon the select_action function is triggered to choose an action.
If no default_action is provided, a random action is selected.
Reference: Reinforcement Learning: An Introduction, Ch. 2 (Sutton and Barto, 2018)
https://web.stanford.edu/class/psych209/Readings/SuttonBartoIPRLBook2ndEd.pdf
Parameters
----------
p: Union[Dict[ActionId, float], Dict[ActionId, Probability], Dict[ActionId, List[Probability]]]
The dictionary of actions and their sampled probability of getting a positive reward.
For MO strategy, the sampled probability is a list with elements corresponding to the objectives.
actions: Optional[Dict[ActionId, Model]]
The dictionary of actions and their associated Model.
Returns
-------
selected_action: ActionId
The selected action.
Raises
------
KeyError
If self.default_action is not present as a key in the probabilities dictionary.
"""

if self.epsilon:
if self.default_action and self.default_action not in p.keys():
raise KeyError(f"Default action {self.default_action} not in actions.")
if np.random.binomial(1, self.epsilon):
selected_action = self.default_action if self.default_action else np.random.choice(list(p.keys()))
else:
selected_action = self.strategy.select_action(p=p, actions=actions)
else:
selected_action = self.strategy.select_action(p=p, actions=actions)
return selected_action
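To make the decision rule above concrete, here is a standalone sketch of the same branch structure (pure NumPy; epsilon_greedy_pick is an illustrative stand-in, not part of the pybandits API, and the argmax approximates the wrapped strategy):

import numpy as np

def epsilon_greedy_pick(p, epsilon=None, default_action=None):
    # With probability epsilon: return default_action, or a random action
    # if no default is set. Otherwise: defer to the wrapped strategy,
    # approximated here by an argmax over the sampled probabilities.
    if epsilon and np.random.binomial(1, epsilon):
        return default_action if default_action else np.random.choice(list(p))
    return max(p, key=p.get)  # stand-in for self.strategy.select_action

# e.g. epsilon_greedy_pick({"a1": 0.9, "a2": 0.4}, epsilon=0.1, default_action="a2")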
66 changes: 54 additions & 12 deletions pybandits/cmab.py
Original file line number Diff line number Diff line change
@@ -138,7 +138,7 @@ def predict(
p_to_select_action = prob if self.predict_with_proba else ws

# predict actions, probs, weighted_sums
selected_actions.append(self.strategy.select_action(p=p_to_select_action, actions=self.actions))
selected_actions.append(self._select_epsilon_greedy_action(p=p_to_select_action, actions=self.actions))
probs.append(prob)
weighted_sums.append(ws)

@@ -212,8 +212,13 @@ class CmabBernoulli(BaseCmabBernoulli):
predict_with_proba: bool = False
predict_actions_randomly: bool = False

def __init__(self, actions: Dict[ActionId, BaseBayesianLogisticRegression]):
super().__init__(actions=actions, strategy=ClassicBandit())
def __init__(
self,
actions: Dict[ActionId, BaseBayesianLogisticRegression],
epsilon: Optional[Float01] = None,
default_action: Optional[ActionId] = None,
):
super().__init__(actions=actions, strategy=ClassicBandit(), epsilon=epsilon, default_action=default_action)

@classmethod
def from_state(cls, state: dict) -> "CmabBernoulli":
@@ -249,9 +254,15 @@ class CmabBernoulliBAI(BaseCmabBernoulli):
predict_with_proba: bool = False
predict_actions_randomly: bool = False

def __init__(self, actions: Dict[ActionId, BayesianLogisticRegression], exploit_p: Optional[Float01] = None):
def __init__(
self,
actions: Dict[ActionId, BayesianLogisticRegression],
epsilon: Optional[Float01] = None,
default_action: Optional[ActionId] = None,
exploit_p: Optional[Float01] = None,
):
strategy = BestActionIdentification() if exploit_p is None else BestActionIdentification(exploit_p=exploit_p)
super().__init__(actions=actions, strategy=strategy)
super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action)

@classmethod
def from_state(cls, state: dict) -> "CmabBernoulliBAI":
@@ -296,9 +307,15 @@ class CmabBernoulliCC(BaseCmabBernoulli):
predict_with_proba: bool = True
predict_actions_randomly: bool = False

def __init__(self, actions: Dict[ActionId, BayesianLogisticRegressionCC], subsidy_factor: Optional[Float01] = None):
def __init__(
self,
actions: Dict[ActionId, BayesianLogisticRegressionCC],
epsilon: Optional[Float01] = None,
default_action: Optional[ActionId] = None,
subsidy_factor: Optional[Float01] = None,
):
strategy = CostControlBandit() if subsidy_factor is None else CostControlBandit(subsidy_factor=subsidy_factor)
super().__init__(actions=actions, strategy=strategy)
super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action)

@classmethod
def from_state(cls, state: dict) -> "CmabBernoulliCC":
@@ -310,7 +327,12 @@ def update(self, context: ArrayLike, actions: List[ActionId], rewards: List[Bina


@validate_arguments
def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: PositiveInt) -> CmabBernoulli:
def create_cmab_bernoulli_cold_start(
action_ids: Set[ActionId],
n_features: PositiveInt,
epsilon: Optional[Float01] = None,
default_action: Optional[ActionId] = None,
) -> CmabBernoulli:
"""
Utility function to create a Contextual Bernoulli Multi-Armed Bandit with Thompson Sampling, with default
parameters. Until the very first update the model will predict actions randomly, where each action has equal
@@ -323,6 +345,10 @@ def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: Posi
n_features: PositiveInt
The number of features expected in the context matrix. This is also the number of betas of the
Bayesian Logistic Regression model.
epsilon: Optional[Float01]
epsilon for the epsilon-greedy approach. If None, epsilon-greedy is not used.
default_action: Optional[ActionId]
Default action to select if the epsilon-greedy approach is used. None for random selection.
Returns
-------
cmab: CmabBernoulli
@@ -331,14 +357,18 @@ def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: Posi
actions = {}
for a in set(action_ids):
actions[a] = create_bayesian_logistic_regression_cold_start(n_betas=n_features)
mab = CmabBernoulli(actions=actions)
mab = CmabBernoulli(actions=actions, epsilon=epsilon, default_action=default_action)
mab.predict_actions_randomly = True
return mab
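A hedged usage sketch for this factory (the context array is illustrative, and the three return values are assumed to mirror the lists accumulated in the predict loop shown earlier; until the first update, predictions are random):

import numpy as np
from pybandits.cmab import create_cmab_bernoulli_cold_start

mab = create_cmab_bernoulli_cold_start(
    action_ids={"a1", "a2"}, n_features=3, epsilon=0.1, default_action="a1"
)
context = np.random.randn(5, 3)  # 5 samples, 3 features
actions, probs, weighted_sums = mab.predict(context=context)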


@validate_arguments
def create_cmab_bernoulli_bai_cold_start(
action_ids: Set[ActionId], n_features: PositiveInt, exploit_p: Optional[Float01] = None
action_ids: Set[ActionId],
n_features: PositiveInt,
exploit_p: Optional[Float01] = None,
epsilon: Optional[Float01] = None,
default_action: Optional[ActionId] = None,
) -> CmabBernoulliBAI:
"""
Utility function to create a Contextual Bernoulli Multi-Armed Bandit with Thompson Sampling, and Best Action
@@ -361,6 +391,10 @@ def create_cmab_bernoulli_bai_cold_start(
(it behaves as a Greedy strategy).
If exploit_p is 0, the bandit always selects the action with the 2nd highest probability of getting a positive
reward.
epsilon: Optional[Float01]
epsilon for the epsilon-greedy approach. If None, epsilon-greedy is not used.
default_action: Optional[ActionId]
Default action to select if the epsilon-greedy approach is used. None for random selection.
Returns
-------
@@ -370,7 +404,7 @@ def create_cmab_bernoulli_bai_cold_start(
actions = {}
for a in set(action_ids):
actions[a] = create_bayesian_logistic_regression_cold_start(n_betas=n_features)
mab = CmabBernoulliBAI(actions=actions, exploit_p=exploit_p)
mab = CmabBernoulliBAI(actions=actions, exploit_p=exploit_p, epsilon=epsilon, default_action=default_action)
mab.predict_actions_randomly = True
return mab
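The BAI factory composes epsilon-greedy with the exploit_p knob; a short sketch (values are illustrative):

mab = create_cmab_bernoulli_bai_cold_start(
    action_ids={"a1", "a2", "a3"},
    n_features=3,
    exploit_p=0.8,        # exploitation level of BestActionIdentification
    epsilon=0.05,         # 5% of selections override the strategy
    default_action=None,  # None: the override is a uniformly random action
)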

@@ -380,6 +414,8 @@ def create_cmab_bernoulli_cc_cold_start(
action_ids_cost: Dict[ActionId, NonNegativeFloat],
n_features: PositiveInt,
subsidy_factor: Optional[Float01] = None,
epsilon: Optional[Float01] = None,
default_action: Optional[ActionId] = None,
) -> CmabBernoulliCC:
"""
Utility function to create a Contextual Bernoulli Multi-Armed Bandit with Thompson Sampling, and Cost Control
@@ -408,6 +444,10 @@ def create_cmab_bernoulli_cc_cold_start(
If subsidy_factor is 1, the bandit always selects the action with the minimum cost.
If subsidy_factor is 0, the bandit always selects the action with the highest probability of getting a positive
reward (it behaves as a classic Bernoulli bandit).
epsilon: Optional[Float01]
epsilon for the epsilon-greedy approach. If None, epsilon-greedy is not used.
default_action: Optional[ActionId]
Default action to select if the epsilon-greedy approach is used. None for random selection.
Returns
-------
@@ -417,6 +457,8 @@ def create_cmab_bernoulli_cc_cold_start(
actions = {}
for a, cost in action_ids_cost.items():
actions[a] = create_bayesian_logistic_regression_cc_cold_start(n_betas=n_features, cost=cost)
mab = CmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor)
mab = CmabBernoulliCC(
actions=actions, subsidy_factor=subsidy_factor, epsilon=epsilon, default_action=default_action
)
mab.predict_actions_randomly = True
return mab
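And the corresponding sketch for the cost-control variant, where default_action can double as a cheap fallback arm (action ids and costs are illustrative):

mab = create_cmab_bernoulli_cc_cold_start(
    action_ids_cost={"cheap": 0.0, "pricey": 2.5},  # per-action costs
    n_features=3,
    subsidy_factor=0.3,
    epsilon=0.1,
    default_action="cheap",
)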
