import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from ex_utils import build_mlp


class Density_Model(object):
    def __init__(self):
        super(Density_Model, self).__init__()

    def receive_tf_sess(self, sess):
        self.sess = sess

    def get_prob(self, state):
        raise NotImplementedError


class Histogram(Density_Model):
    def __init__(self, nbins, preprocessor):
        super(Histogram, self).__init__()
        self.nbins = nbins
        self.total = 0.
        self.hist = {}
        for i in range(int(self.nbins)):
            self.hist[i] = 0
        self.preprocessor = preprocessor

    def update_count(self, state, increment):
        """
        ### PROBLEM 1
        ### YOUR CODE HERE

        args:
            state: numpy array
            increment: int

        TODO:
            1. increment the entry "bin_name" in self.hist by "increment"
            2. increment self.total by "increment"
        """
        bin_name = self.preprocessor(state)
        self.hist[bin_name] += increment
        self.total += increment

    def get_count(self, states):
        """
        ### PROBLEM 1
        ### YOUR CODE HERE

        args:
            states: numpy array (bsize, ob_dim)

        returns:
            counts: numpy array (bsize)

        TODO:
            For each state in states:
                1. get the bin_name using self.preprocessor
                2. get the value of self.hist with key bin_name
        """
        counts = []
        for state in states:
            bin_name = self.preprocessor(state)
            counts.append(self.hist[bin_name])
        counts = np.array(counts)
        return counts

    def get_prob(self, states):
        """
        ### PROBLEM 1
        ### YOUR CODE HERE

        args:
            states: numpy array (bsize, ob_dim)

        returns:
            the probabilities of the states (bsize)

        NOTE:
            remember to normalize by float(self.total)
        """
        counts = self.get_count(states)
        probs = counts / self.total
        return probs
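

def _histogram_usage_sketch():
    # Illustrative usage sketch (not part of the assignment). It assumes a
    # hypothetical preprocessor that buckets a scalar state in [0, 1) into
    # one of `nbins` integer bins.
    nbins = 10

    def preprocessor(s):
        return int(np.clip(np.squeeze(s) * nbins, 0, nbins - 1))

    hist = Histogram(nbins=nbins, preprocessor=preprocessor)
    hist.update_count(np.array([0.35]), increment=1)
    # get_prob normalizes counts by the running total, so the visited bin
    # gets probability 1 and the unvisited bin gets probability 0 here.
    return hist.get_prob(np.array([[0.35], [0.90]]))  # -> array([1., 0.])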


class RBF(Density_Model):
    """
    https://en.wikipedia.org/wiki/Radial_basis_function_kernel
    https://en.wikipedia.org/wiki/Kernel_density_estimation
    """
    def __init__(self, sigma):
        super(RBF, self).__init__()
        self.sigma = sigma
        self.means = None

    def fit_data(self, data):
        """
        ### PROBLEM 2
        ### YOUR CODE HERE

        args:
            data: list of states of shape (ob_dim)

        TODO:
            We simply assign self.means to be equal to the data points.
            Let the length of the data be B
            self.means: np array (B, ob_dim)
        """
        B, ob_dim = len(data), len(data[0])
        self.means = np.array(data)
        assert self.means.shape == (B, ob_dim)

    def get_prob(self, states):
        """
        ### PROBLEM 2
        ### YOUR CODE HERE

        given:
            states: (b, ob_dim)
                where b is the number of states we wish to get the
                probability of
            self.means: (B, ob_dim)
                where B is the number of states in the replay buffer
                we will plop a Gaussian distribution on top of each
                of self.means with a std of self.sigma

        TODO:
            1. Compute deltas: for each state in states, compute the
                difference between that state and every mean in self.means.
            2. Euclidean distance: sum the squared deltas
            3. Gaussian: evaluate the probability of the state under the
                gaussian centered around each mean. The hyperparameters
                for the reference solution assume that you do not normalize
                the gaussian. This is fine since the rewards will be
                normalized later when we compute advantages anyways.
            4. Average: average the probabilities from each gaussian
        """
        b, ob_dim = states.shape
        if self.means is None:
            # Return a uniform distribution if we don't have samples in the
            # replay buffer yet.
            return (1.0 / len(states)) * np.ones(len(states))
        else:
            B, replay_dim = self.means.shape
            assert states.ndim == self.means.ndim and ob_dim == replay_dim

            # 1. Compute deltas
            deltas = np.expand_dims(states, axis=1) - np.expand_dims(self.means, axis=0)
            assert deltas.shape == (b, B, ob_dim)

            # 2. Euclidean distance
            euc_dists = np.sum(np.square(deltas), axis=-1)
            assert euc_dists.shape == (b, B)

            # 3. Gaussian
            gaussians = np.exp(-euc_dists / (2 * self.sigma ** 2))
            assert gaussians.shape == (b, B)

            # 4. Average
            densities = np.mean(gaussians, axis=-1)
            assert densities.shape == (b,)
            return densities
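

def _rbf_usage_sketch():
    # Illustrative usage sketch (not part of the assignment). RBF.get_prob
    # returns an unnormalized kernel density estimate over the states stored
    # by fit_data:
    #     density(s) = mean_i exp(-||s - mean_i||^2 / (2 * sigma^2))
    rbf = RBF(sigma=0.5)
    rbf.fit_data([np.array([0.0, 0.0]), np.array([1.0, 1.0])])
    # A state sitting on one of the two stored means scores about 0.5 (its own
    # kernel contributes ~1, the other ~0); a far-away state scores near 0.
    return rbf.get_prob(np.array([[0.0, 0.0], [5.0, 5.0]]))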


class Exemplar(Density_Model):
    def __init__(self, ob_dim, hid_dim, learning_rate, kl_weight):
        super(Exemplar, self).__init__()
        self.ob_dim = ob_dim
        self.hid_dim = hid_dim
        self.learning_rate = learning_rate
        self.kl_weight = kl_weight

    def build_computation_graph(self):
        """
        ### PROBLEM 3
        ### YOUR CODE HERE

        TODO:
            1. self.log_likelihood. shape: (batch_size)
                - use tf.squeeze
                - use the discriminator to get the log prob of the discrim_target
            2. self.likelihood. shape: (batch_size)
                - use tf.squeeze
                - use the discriminator to get the prob of the discrim_target
            3. self.kl. shape: (batch_size)
                - simply add the kl divergence between self.encoder1 and
                    the prior and the kl divergence between self.encoder2
                    and the prior. Do not average.
            4. self.elbo:
                - subtract the kl (weighted by self.kl_weight) from the
                    log_likelihood, and average over the batch
            5. self.update_op: use the AdamOptimizer with self.learning_rate
                to minimize -self.elbo (note the negative sign!)

        Hint:
            https://www.tensorflow.org/probability/api_docs/python/tfp/distributions
        """
        self.state1, self.state2 = self.define_placeholders()
        self.encoder1, self.encoder2, self.prior, self.discriminator = self.forward_pass(self.state1, self.state2)
        self.discrim_target = tf.placeholder(shape=[None, 1], name="discrim_target", dtype=tf.float32)

        self.log_likelihood = tf.squeeze(self.discriminator.log_prob(self.discrim_target), axis=1)
        self.likelihood = tf.squeeze(self.discriminator.prob(self.discrim_target), axis=1)
        self.kl = tfp.distributions.kl_divergence(self.encoder1, self.prior) \
            + tfp.distributions.kl_divergence(self.encoder2, self.prior)
        assert len(self.log_likelihood.shape) == len(self.likelihood.shape) == len(self.kl.shape) == 1

        self.elbo = tf.reduce_mean(self.log_likelihood - self.kl_weight * self.kl)
        self.update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(-self.elbo)
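        # For reference, the objective maximized above is, per batch element,
        #     log p(y | z1, z2) - kl_weight * [KL(q(z1|s1) || p(z)) + KL(q(z2|s2) || p(z))]
        # i.e. the discriminator log-likelihood of the target y, regularized by
        # the KL of each encoder against the prior, then averaged over the batch.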

    def define_placeholders(self):
        state1 = tf.placeholder(shape=[None, self.ob_dim], name="s1", dtype=tf.float32)
        state2 = tf.placeholder(shape=[None, self.ob_dim], name="s2", dtype=tf.float32)
        return state1, state2

    def make_encoder(self, state, z_size, scope, n_layers, hid_size):
        """
        ### PROBLEM 3
        ### YOUR CODE HERE

        args:
            state: tf variable
            z_size: output dimension of the encoder network
            scope: scope name
            n_layers: number of layers of the encoder network
            hid_size: hidden dimension of the encoder network

        TODO:
            1. z_mean: the output of a neural network that takes the state as input,
                has output dimension z_size, n_layers layers, and hidden
                dimension hid_size
            2. z_logstd: a trainable variable, initialized to 0
                shape (z_size,)

        Hint: use build_mlp
        """
        z_mean = build_mlp(state, z_size, scope, n_layers, hid_size)
        z_logstd = tf.get_variable('z_logstd', shape=z_size, trainable=True,
                                   initializer=tf.constant_initializer(value=0.))
        return tfp.distributions.MultivariateNormalDiag(loc=z_mean, scale_diag=tf.exp(z_logstd))
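
    # Note: the encoder above is a diagonal Gaussian q(z | s) whose mean comes
    # from the MLP and whose log-std is a single learned vector shared across
    # states, initialized to 0 (i.e. std = 1).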

    def make_prior(self, z_size):
        """
        ### PROBLEM 3
        ### YOUR CODE HERE

        args:
            z_size: output dimension of the encoder network

        TODO:
            prior_mean and prior_logstd are for a standard normal distribution
                both have dimension z_size
        """
        prior_mean = tf.zeros(z_size)
        prior_logstd = tf.zeros(z_size)
        return tfp.distributions.MultivariateNormalDiag(loc=prior_mean, scale_diag=tf.exp(prior_logstd))

    def make_discriminator(self, z, output_size, scope, n_layers, hid_size):
        """
        ### PROBLEM 3
        ### YOUR CODE HERE

        args:
            z: input to the discriminator network
            output_size: output dimension of the discriminator network
            scope: scope name
            n_layers: number of layers of the discriminator network
            hid_size: hidden dimension of the discriminator network

        TODO:
            1. logit: the output of a neural network that takes z as input,
                has output size output_size, n_layers layers, and hidden
                dimension hid_size

        Hint: use build_mlp
        """
        logit = build_mlp(z, output_size, scope, n_layers, hid_size)
        return tfp.distributions.Bernoulli(logits=logit)

    def forward_pass(self, state1, state2):
        """
        ### PROBLEM 3
        ### YOUR CODE HERE

        args:
            state1: tf variable
            state2: tf variable

        returns:
            encoder1: tfp.distributions.MultivariateNormalDiag distribution
            encoder2: tfp.distributions.MultivariateNormalDiag distribution
            prior: tfp.distributions.MultivariateNormalDiag distribution
            discriminator: tfp.distributions.Bernoulli distribution

        TODO:
            1. z1: sample from encoder1
            2. z2: sample from encoder2
            3. z: concatenate z1 and z2

        Hint:
            https://www.tensorflow.org/probability/api_docs/python/tfp/distributions
        """
        # Wrap the builders in templates so their variables are created once
        # and reused across calls
        make_encoder1 = tf.make_template('encoder1', self.make_encoder)
        make_encoder2 = tf.make_template('encoder2', self.make_encoder)
        make_discriminator = tf.make_template('decoder', self.make_discriminator)

        # Encoders
        encoder1 = make_encoder1(state1, self.hid_dim // 2, 'z1', n_layers=2, hid_size=self.hid_dim)
        encoder2 = make_encoder2(state2, self.hid_dim // 2, 'z2', n_layers=2, hid_size=self.hid_dim)

        # Prior
        prior = self.make_prior(self.hid_dim // 2)

        # Sampled latents
        z1 = encoder1.sample()
        z2 = encoder2.sample()
        z = tf.concat([z1, z2], axis=1)

        # Discriminator
        discriminator = make_discriminator(z, 1, 'discriminator', n_layers=2, hid_size=self.hid_dim)
        return encoder1, encoder2, prior, discriminator

    def update(self, state1, state2, target):
        """
        ### PROBLEM 3
        ### YOUR CODE HERE

        args:
            state1: np array (batch_size, ob_dim)
            state2: np array (batch_size, ob_dim)
            target: np array (batch_size, 1)

        TODO:
            train the density model and return
                ll: log_likelihood
                kl: kl divergence
                elbo: elbo
        """
        assert state1.ndim == state2.ndim == target.ndim
        assert state1.shape[1] == state2.shape[1] == self.ob_dim
        assert state1.shape[0] == state2.shape[0] == target.shape[0]
        _, ll, kl, elbo = self.sess.run(
            [self.update_op, self.log_likelihood, self.kl, self.elbo],
            feed_dict={self.state1: state1, self.state2: state2, self.discrim_target: target}
        )
        return ll, kl, elbo

    def get_likelihood(self, state1, state2):
        """
        ### PROBLEM 3
        ### YOUR CODE HERE

        args:
            state1: np array (batch_size, ob_dim)
            state2: np array (batch_size, ob_dim)

        TODO:
            likelihood of state1 == state2

        Hint:
            what should be the value of self.discrim_target?
        """
        assert state1.ndim == state2.ndim
        assert state1.shape[1] == state2.shape[1] == self.ob_dim
        assert state1.shape[0] == state2.shape[0]
        likelihood = self.sess.run(
            self.likelihood,
            feed_dict={self.state1: state1, self.state2: state2,
                       self.discrim_target: np.ones([state1.shape[0], 1])}
        )
        return likelihood

    def get_prob(self, state):
        """
        ### PROBLEM 3
        ### YOUR CODE HERE

        args:
            state: np array (batch_size, ob_dim)

        TODO:
            likelihood:
                evaluate the discriminator D(x,x) on the same input
            prob:
                compute the probability density of x from the discriminator
                likelihood (see homework doc)
        """
        likelihood = self.get_likelihood(state, state)
        # avoid divide by 0 and log(0)
        likelihood = np.clip(np.squeeze(likelihood), 1e-5, 1 - 1e-5)
        prob = (1 - likelihood) / likelihood
        return prob
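

def _exemplar_usage_sketch(ob_dim=2, batch_size=16):
    # Illustrative usage sketch (not part of the assignment): build the graph,
    # train the exemplar discriminator on positive pairs (a state paired with
    # itself, target 1) and negative pairs (two different states, target 0),
    # then query the implied density p(x) = (1 - D(x, x)) / D(x, x).
    # The hyperparameters below are arbitrary choices for the sketch.
    model = Exemplar(ob_dim=ob_dim, hid_dim=32, learning_rate=1e-3, kl_weight=0.1)
    model.build_computation_graph()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    model.receive_tf_sess(sess)

    state1 = np.random.randn(batch_size, ob_dim).astype(np.float32)
    state2 = np.random.randn(batch_size, ob_dim).astype(np.float32)
    model.update(state1, state1, np.ones([batch_size, 1]))   # positives
    model.update(state1, state2, np.zeros([batch_size, 1]))  # negatives
    return model.get_prob(state1)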