# Implementation of a set of baselines for the combinatorial bandit budget allocation problem:
# 1. The first baseline is the actions taken by the human operator from the dataset.
# 2. The second baseline is the GP-UCB policy with a sliding window.
# 3. The third baseline is the GP-TS policy with a sliding window.
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, RationalQuadratic, Matern, ConstantKernel as C, WhiteKernel
import numpy as np
import matplotlib.pyplot as plt
import warnings
import pulp
from pulp import PULP_CBC_CMD
from scipy.stats import norm
from utility import get_log_likelihood, simple_prediction_test

# Suppress all warnings
warnings.filterwarnings("ignore")

''' --------------- Class for Bayesian Change Point Detection --------------- '''
class BOCD:
    def __init__(self, hazard, prior_mean, prior_var):
        self.hazard = hazard  # probability of a change occurring at each timestep
        self.prior_mean = prior_mean  # initial mean of the reward distribution (before seeing any data)
        self.prior_var = prior_var  # initial variance of the reward distribution
        self.run_length_probs = np.array([1.0])  # start with run length = 0 with probability 1

    def update(self, observation):
        T = len(self.run_length_probs)  # number of run lengths tracked so far
        pred_means = np.zeros(T)
        pred_vars = np.zeros(T)
        # Predictive step: calculate the predictive mean and variance for all run lengths
        for t in range(T):
            pred_means[t] = self.prior_mean
            pred_vars[t] = self.prior_var + 1  # Gaussian likelihood with unit variance
        # Observation likelihood for the new data point
        observation_likelihoods = norm.pdf(observation, loc=pred_means, scale=np.sqrt(pred_vars))
        # Growth step: propagate the probabilities for the current run lengths
        growth_probs = self.run_length_probs * (1 - self.hazard) * observation_likelihoods
        # Change step: calculate the probability of a new change point
        new_run_prob = np.sum(self.run_length_probs * self.hazard * observation_likelihoods)
        # Update and normalize the run-length probabilities
        self.run_length_probs = np.append(growth_probs, new_run_prob)
        self.run_length_probs /= np.sum(self.run_length_probs)
        # Update the mean and variance (assuming a Gaussian update for simplicity)
        self.prior_mean = (self.prior_mean * T + observation) / (T + 1)
        self.prior_var = 1 / (1 / self.prior_var + 1)

    def detect_change_point(self):
        # If a change point is likely, the caller should reset its data
        return self.run_length_probs[-1] > 0.5
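
# A minimal usage sketch (illustration only): the synthetic stream, its mean shift, and
# the hazard value are assumptions, not part of the baselines in this file. It shows the
# update/query loop the detector expects, not a guarantee that a change fires.
# detector = BOCD(hazard=0.01, prior_mean=0, prior_var=1)
# stream = np.concatenate([np.random.normal(0, 1, 50), np.random.normal(5, 1, 50)])
# for t, x in enumerate(stream):
#     detector.update(x)
#     if detector.detect_change_point():
#         print(f'Change point flagged at step {t}')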

''' ------- Code for simulating the actions taken by a human operator from the dataset ------- '''
class HumanPolicy():
    def __init__(self, data):
        self.data = data
        self.current_action = {}

    def reset(self, env):
        pass

    def get_action(self, current_date, campaigns):
        human_actions = {}
        for campaign in campaigns:
            human_data = self.data[(self.data['date'] == current_date) & (self.data['campaign_name'] == campaign)]
            if human_data.empty:
                # No logged data for this day: repeat the previous action,
                # falling back to a dummy value of 1 if none exists yet
                if campaign not in self.current_action:
                    self.current_action[campaign] = 1
                human_actions[campaign] = self.current_action[campaign]
            else:
                human_actions[campaign] = human_data['campaign_budget_total_amount'].values[0]
        self.current_action = human_actions
        return human_actions
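
# Usage sketch (illustration only): replay the logged operator budgets for one day.
# `df` and the date below are hypothetical placeholders.
# human = HumanPolicy(df)
# actions = human.get_action(pd.Timestamp('2023-01-15'), df['campaign_name'].unique())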

''' -------------- Code for the GP policy with a sliding window, different exploration strategies, and different adaptation strategies --------------
The GP kernel is a vanilla kernel, without a saturating kernel or guided UCB exploration.
Exploration strategies:
1. No exploration : ne
2. Thompson sampling : ts
3. Upper Confidence Bound : ucb
Adaptation strategies:
1. Sliding window : sliding_window
2. Discounted reward : discount the reward based on its age
3. Log-likelihood test : log_likelihood_test
4. MAE test : mean absolute error test
'''
class GPUCBPolicyWithSlidingWindow():
    def __init__(self, data, config, exploration_type='ucb', adaptation_type='sliding_window'):
        self.offline_data = data  # e.g. pd.concat([data_brand, data_nonbrand, data_display])
        self.offline_data['date'] = pd.to_datetime(self.offline_data['date'], errors='coerce')
        self.current_date = self.offline_data['date'].min()
        self.current_month = self.current_date.month
        self.current_year = self.current_date.year
        self.adaptation_type = adaptation_type  # type of reward-function update policy
        self.exploration_type = exploration_type
        self.beta = config.getint('POLICY', 'beta')  # high beta for exploration
        self.use_prior = False
        self.seed = config.getint('POLICY', 'seed')
        self.channel = config.get('SIMULATOR', 'data_channel')
        # Dictionaries for the campaign data
        self.reward_models = {}
        self.cost_data = {}
        self.click_data = {}
        self.budget_data = {}
        self.avg_cpc = {}
        self.gamma = 0.9
        self.budget_granualarity = 500
        # To be set every month based on the campaign data
        self.monthly_budget = 0
        self.daily_budget_max = 0
        # Store the previous action
        self.prev_action = {}
        # Initialize click and cost data for the different channels
        self.campaigns = self.offline_data['campaign_name'].unique()
        self.bocd_models = {campaign: BOCD(hazard=0.01, prior_mean=0, prior_var=1) for campaign in self.campaigns}
        for campaign in self.campaigns:
            if self.use_prior:
                self.cost_data[campaign] = self.offline_data[self.offline_data['campaign_name'] == campaign]['cost'].values
                self.click_data[campaign] = self.offline_data[self.offline_data['campaign_name'] == campaign]['click'].values
                self.budget_data[campaign] = self.offline_data[self.offline_data['campaign_name'] == campaign]['campaign_budget_total_amount'].values
                self.reward_models[campaign] = self.learn_reward_model(self.cost_data[campaign], self.click_data[campaign], campaign, True)
            else:
                self.cost_data[campaign] = [0]
                self.click_data[campaign] = [0]
                self.budget_data[campaign] = []
            self.avg_cpc[campaign] = 0

    def reset(self, env):
        self.env = env  # store the env the policy is acting on, for regret calculation
        self.current_month = env.current_month
        self.current_year = env.current_year
        self.daily_budget_max = env.daily_budget_max
        np.random.seed(self.seed)
        # Maintain a short rolling window of recent cost and click observations
        self.cost_data_current = {}
        self.click_data_current = {}
        for campaign in self.campaigns:
            self.cost_data_current[campaign] = np.array([])
            self.click_data_current[campaign] = np.array([])

    def learn_reward_model(self, cost_data, reward_data, campaign_name, plot_function):
        # Learn the reward function using Gaussian process regression
        # (campaign_name and plot_function are currently unused)
        X = np.array(cost_data).reshape(-1, 1)
        y = np.array(reward_data)
        kernel = (Matern(length_scale=1.0, nu=1.5) + RBF(length_scale=1.0))
        gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)
        gp.fit(X, y)
        return gp

    def predict_reward_budgetlevel(self, budget_levels):
        reward_dict = {}
        budget_levels = np.array(budget_levels).reshape(-1, 1)
        for campaign in self.campaigns:
            gp = self.reward_models[campaign]
            y_pred, sigma = gp.predict(budget_levels, return_std=True)
            explore_reward = y_pred  # 'ne': no exploration, score by the posterior mean
            if self.exploration_type == 'ucb':
                explore_reward = y_pred + self.beta * sigma
            if self.exploration_type == 'ts':
                # Thompson sampling: score by a single draw from the GP posterior
                explore_reward = gp.sample_y(budget_levels, n_samples=1).flatten()
            reward_dict[campaign] = explore_reward
        return reward_dict
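
    # Acquisition rules used above, in our notation: 'ne' scores a budget b by the GP
    # posterior mean mu(b); 'ucb' by mu(b) + beta * sigma(b); 'ts' by a single function
    # drawn from the GP posterior, evaluated at b. A larger beta widens exploration.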

    '''
    Compute the optimal budget allocation from the predicted rewards while keeping
    total spend within the daily budget limit.
    Args   : the per-campaign Gaussian process reward models (via self.reward_models)
    Output : dictionary with the optimal budget allocation, and the optimal objective value
    '''
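    # The solve below is an assignment-style integer program (this restates the code
    # that follows; the notation is ours). With binary x[i][j] = 1 iff campaign i gets
    # budget level b_j, predicted reward r[i][j], and daily limit B:
    #     maximize    sum_{i,j} r[i][j] * x[i][j]
    #     subject to  sum_j x[i][j] == 1           for every campaign i
    #                 sum_j b_j * x[i][j] <= B     for every campaign i
    #                 sum_{i,j} b_j * x[i][j] <= B (total spend across all campaigns)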
    def optimization(self):
        budget_levels = np.arange(0, self.daily_budget_max + 1, self.budget_granualarity)
        reward_dict = self.predict_reward_budgetlevel(budget_levels)
        num_budgets = len(budget_levels)
        # Campaign IDs
        campaign_ids = list(reward_dict.keys())
        # Create the problem
        prob = pulp.LpProblem("BudgetAllocation", pulp.LpMaximize)
        # Decision variables: x_{i}_{j} = 1 iff campaign i is assigned budget level j
        allocation_vars = [[pulp.LpVariable(f"x_{i}_{j}", lowBound=0, cat='Binary') for j in range(num_budgets)]
                           for i in range(len(campaign_ids))]
        # Objective function
        prob += pulp.lpSum(allocation_vars[i][j] * reward_dict[campaign_ids[i]][j]
                           for i in range(len(campaign_ids)) for j in range(num_budgets))
        # Constraint: each campaign's budget must not exceed the daily budget limit
        for i in range(len(campaign_ids)):
            prob += pulp.lpSum(allocation_vars[i][j] * budget_levels[j] for j in range(num_budgets)) <= self.daily_budget_max
        # Each campaign must be assigned exactly one budget level
        for i in range(len(campaign_ids)):
            prob += pulp.lpSum(allocation_vars[i][j] for j in range(num_budgets)) == 1
        # The total allocation must not exceed the daily budget limit
        prob += pulp.lpSum(allocation_vars[i][j] * budget_levels[j]
                           for i in range(len(campaign_ids)) for j in range(num_budgets)) <= self.daily_budget_max
        # Solve the problem and suppress the solver output
        prob.solve(PULP_CBC_CMD(msg=False))
        # Extract the optimal budget allocation
        budget_allocation = {}
        for i in range(len(campaign_ids)):
            for j in range(num_budgets):
                if pulp.value(allocation_vars[i][j]) > 0:
                    budget_allocation[campaign_ids[i]] = budget_levels[j]
                    break
        # Extract the optimal objective value
        optimal_reward = pulp.value(prob.objective)
        return budget_allocation, optimal_reward

    def get_action(self, observation, day):
        if day > 0:
            for campaign in self.campaigns:
                self.cost_data[campaign] = np.append(self.cost_data[campaign], observation[campaign][0][-1])
                self.click_data[campaign] = np.append(self.click_data[campaign], observation[campaign][1][-1])
                self.budget_data[campaign] = np.append(self.budget_data[campaign], self.prev_action[campaign])
                # Maintain a rolling window of the most recent cost and click observations
                self.cost_data_current[campaign] = np.append(self.cost_data_current[campaign], observation[campaign][0][-1])
                self.click_data_current[campaign] = np.append(self.click_data_current[campaign], observation[campaign][1][-1])
                if len(self.cost_data_current[campaign]) > 7:
                    # Keep at most the last 10 days of data
                    self.cost_data_current[campaign] = self.cost_data_current[campaign][-10:]
                    self.click_data_current[campaign] = self.click_data_current[campaign][-10:]
        # Baseline adaptation: refit on the full history, with no change detection
        if self.adaptation_type == 'no_change_detection':
            for campaign in self.campaigns:
                self.reward_models[campaign] = self.learn_reward_model(self.cost_data[campaign], self.click_data[campaign], campaign, True)
        # Passive approach: sliding window over the observation history
        if self.adaptation_type == 'sliding_window':
            for campaign in self.campaigns:
                self.reward_models[campaign] = self.learn_reward_model(self.cost_data[campaign], self.click_data[campaign], campaign, True)
                # Once more than 30 points have accumulated, drop the oldest 10 days
                if len(self.cost_data[campaign]) > 30:
                    self.cost_data[campaign] = self.cost_data[campaign][10:]
                    self.click_data[campaign] = self.click_data[campaign][10:]
        # Discount past rewards according to their age
        if self.adaptation_type == 'discounted_reward':
            for campaign in self.campaigns:
                # Adjust the importance of cost and click observations based on their age
                time_steps = len(self.cost_data[campaign])
                # Generate discount weights for the past observations
                discount_weights = np.array([self.gamma**(time_steps - i - 1) for i in range(time_steps)])
                # Reweight cost and click data with the discount factor
                discounted_costs = self.cost_data[campaign] * discount_weights
                discounted_clicks = self.click_data[campaign] * discount_weights
                self.reward_models[campaign] = self.learn_reward_model(discounted_costs, discounted_clicks, campaign, True)
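        # For example (our illustration), with gamma = 0.9 and four observations the
        # weights are [0.9**3, 0.9**2, 0.9, 1.0] = [0.729, 0.81, 0.9, 1.0], so the
        # newest point keeps full weight and older points fade geometrically.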
        # Active approach: Bayesian online change point detection
        if self.adaptation_type == 'bayesian_change_point':
            for campaign in self.campaigns:
                if len(observation[campaign][0]) > 1:
                    new_observation = observation[campaign][0][-1]
                    # Update the BOCD model with the new observation
                    self.bocd_models[campaign].update(new_observation)
                    # Detect whether there is a change point
                    if self.bocd_models[campaign].detect_change_point():
                        print(f'Change point detected for campaign {campaign} at day {day}')
                        # Reset to a single dummy observation, mirroring the initialization,
                        # so the GP refit below does not receive an empty dataset
                        self.cost_data[campaign] = np.array([0.0])
                        self.click_data[campaign] = np.array([0.0])
                self.reward_models[campaign] = self.learn_reward_model(self.cost_data[campaign], self.click_data[campaign], campaign, False)
        # Active approach: mean-absolute-error test for change detection
        if self.adaptation_type == 'mae_test':
            # Use the current window of data to learn a new model
            new_gp_models = {}
            for campaign in self.campaigns:
                self.reward_models[campaign] = self.learn_reward_model(self.cost_data[campaign], self.click_data[campaign], campaign, False)
                # Test only when at least a few recent data points are available
                if len(self.cost_data_current[campaign]) > 6:
                    new_gp_models[campaign] = self.learn_reward_model(self.cost_data_current[campaign], self.click_data_current[campaign], campaign, False)
                    # Compare the predictions of the new model against the old model
                    avg_diff, p_value = simple_prediction_test(self.reward_models[campaign], new_gp_models[campaign], self.cost_data_current[campaign], self.daily_budget_max, self.current_month)
                    # If the models disagree strongly, treat it as a change and keep only the recent data
                    if avg_diff > 10:
                        print(f'Change detected in campaign {campaign} MAE_DIFF ========= {avg_diff}')
                        self.cost_data[campaign] = self.cost_data_current[campaign]
                        self.click_data[campaign] = self.click_data_current[campaign]
        # Active approach: log-likelihood ratio test for change detection
        if self.adaptation_type == 'log_likelihood_test':
            new_gp_models = {}
            for campaign in self.campaigns:
                self.reward_models[campaign] = self.learn_reward_model(self.cost_data[campaign], self.click_data[campaign], campaign, False)
                # Test only when at least a few recent data points are available
                if len(self.cost_data_current[campaign]) > 9:
                    new_gp_models[campaign] = self.learn_reward_model(self.cost_data_current[campaign], self.click_data_current[campaign], campaign, False)
                    # Calculate the log likelihood of the new model relative to the old model
                    likelihood_ratio = get_log_likelihood(self.reward_models[campaign], new_gp_models[campaign], self.cost_data_current[campaign], self.click_data_current[campaign])
                    print(f'likelihood_ratio ===={campaign}===== {likelihood_ratio}')
                    # If the recent data is much better explained by the new model, keep only the recent data
                    if likelihood_ratio > 10:
                        print(f'Change detected in campaign {campaign} likelihood_ratio ========= {likelihood_ratio}')
                        self.cost_data[campaign] = self.cost_data_current[campaign]
                        self.click_data[campaign] = self.click_data_current[campaign]
        budget_allocation, optimal_reward = self.optimization()
        # Normalize the budget allocation so it sums to the daily budget limit
        total_budget = sum(budget_allocation.values())
        for campaign in self.campaigns:
            budget_allocation[campaign] = budget_allocation[campaign] * self.daily_budget_max / total_budget
        # Make sure each campaign gets a minimum budget (500 for the google channel,
        # 100 for the smn channel), taken from the campaign with the highest budget
        if self.channel == 'smn':
            for campaign in self.campaigns:
                if budget_allocation[campaign] < 100:
                    budget_allocation[campaign] = 100
                    budget_allocation[max(budget_allocation, key=budget_allocation.get)] -= 100
        else:
            for campaign in self.campaigns:
                if budget_allocation[campaign] < 500:
                    budget_allocation[campaign] = 500
                    budget_allocation[max(budget_allocation, key=budget_allocation.get)] -= 500
        self.prev_action = budget_allocation
        print(f'budget_allocation ========= {budget_allocation}')
        return budget_allocation, optimal_reward

''' -------------- Simple heuristic baseline that samples the highest point from the budget allocation -------------- '''
class SimpleBaselinePolicy():
    def get_action(self, day):
        # Placeholder: the heuristic described above is not implemented yet
        pass
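
# A minimal wiring sketch (illustration only): `sim.cfg`, the CSV path, the `env`
# object, and the observation accessor are assumptions about the surrounding
# simulator, not part of this file. It shows the call order the policies expect:
# construct with data + config, reset(env), then get_action(...) once per day.
# import configparser
# config = configparser.ConfigParser()
# config.read('sim.cfg')                   # must define POLICY/beta, POLICY/seed, SIMULATOR/data_channel
# data = pd.read_csv('campaign_data.csv')  # columns: date, campaign_name, cost, click, campaign_budget_total_amount
# policy = GPUCBPolicyWithSlidingWindow(data, config, exploration_type='ucb',
#                                       adaptation_type='sliding_window')
# policy.reset(env)                        # env exposes current_month, current_year, daily_budget_max
# for day in range(30):
#     observation = env.get_observation()  # hypothetical accessor for per-campaign (cost, click) histories
#     action, predicted_reward = policy.get_action(observation, day)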