# Kudos to Yarin Gal
import keras.backend as K
from keras import initializers
from keras.engine import InputSpec
from keras.layers import Wrapper
import numpy as np


class ConcreteDropout(Wrapper):
    """This wrapper allows learning the dropout probability for any given input Dense layer.

    ```python
        # as the first layer in a model
        model = Sequential()
        model.add(ConcreteDropout(Dense(8), input_shape=(16,)))
        # now model.output_shape == (None, 8)
        # subsequent layers: no need for input_shape
        model.add(ConcreteDropout(Dense(32)))
        # now model.output_shape == (None, 32)
    ```

    `ConcreteDropout` can be used with arbitrary layers which have 2D
    kernels, not just `Dense`. However, Conv2D layers require a different
    weighting of the regulariser (use SpatialConcreteDropout instead).

    # Arguments
        layer: a layer instance.
        weight_regularizer:
            A positive number which satisfies
                $weight_regularizer = l**2 / (\tau * N)$
            with prior lengthscale l, model precision $\tau$ (inverse observation noise),
            and N the number of instances in the dataset.
            Note that kernel_regularizer is not needed.
        dropout_regularizer:
            A positive number which satisfies
                $dropout_regularizer = 2 / (\tau * N)$
            with model precision $\tau$ (inverse observation noise) and N the number of
            instances in the dataset.
            Note the relation between dropout_regularizer and weight_regularizer:
                $weight_regularizer / dropout_regularizer = l**2 / 2$
            with prior lengthscale l. Note also that the factor of two should be
            ignored for cross-entropy loss, and used only for the Euclidean loss.
    """

    def __init__(self, layer, weight_regularizer=1e-6, dropout_regularizer=1e-5,
                 init_min=0.1, init_max=0.1, is_mc_dropout=True, **kwargs):
        assert 'kernel_regularizer' not in kwargs
        super(ConcreteDropout, self).__init__(layer, **kwargs)
        self.weight_regularizer = weight_regularizer
        self.dropout_regularizer = dropout_regularizer
        self.is_mc_dropout = is_mc_dropout
        self.supports_masking = True
        self.p_logit = None
        self.p = None
        # the initial dropout probability range is stored in logit space
        self.init_min = np.log(init_min) - np.log(1. - init_min)
        self.init_max = np.log(init_max) - np.log(1. - init_max)

    def build(self, input_shape=None):
        self.input_spec = InputSpec(shape=input_shape)
        if not self.layer.built:
            self.layer.build(input_shape)
            self.layer.built = True
        # super().build() must be called before the new losses are added below
        super(ConcreteDropout, self).build()

        # initialise p in logit space
        self.p_logit = self.layer.add_weight(name='p_logit',
                                             shape=(1,),
                                             initializer=initializers.RandomUniform(self.init_min, self.init_max),
                                             trainable=True)
        self.p = K.sigmoid(self.p_logit[0])

        # initialise regulariser / prior KL term
        assert len(input_shape) == 2, 'this wrapper only supports Dense layers'
        input_dim = np.prod(input_shape[-1])  # we drop only the last dim
        weight = self.layer.kernel
        # weight term: scaled L2 norm of the kernel, inflated by 1 / (1 - p)
        kernel_regularizer = self.weight_regularizer * K.sum(K.square(weight)) / (1. - self.p)
        # dropout term: negative entropy of the Bernoulli(p) dropout distribution
        dropout_regularizer = self.p * K.log(self.p)
        dropout_regularizer += (1. - self.p) * K.log(1. - self.p)
        dropout_regularizer *= self.dropout_regularizer * input_dim
        regularizer = K.sum(kernel_regularizer + dropout_regularizer)
        self.layer.add_loss(regularizer)

    def compute_output_shape(self, input_shape):
        return self.layer.compute_output_shape(input_shape)

    def concrete_dropout(self, x):
        """Applies the Concrete (relaxed) dropout mask to the input.

        Used at training time so that gradients can propagate through the
        dropout probability.

        :param x: input
        :return: approximately dropped-out input
        """
        eps = K.cast_to_floatx(K.epsilon())
        temp = 0.1

        # relaxed (Concrete) Bernoulli mask:
        # sigmoid((log p - log(1 - p) + log u - log(1 - u)) / temperature)
        unif_noise = K.random_uniform(shape=K.shape(x))
        drop_prob = (
            K.log(self.p + eps)
            - K.log(1. - self.p + eps)
            + K.log(unif_noise + eps)
            - K.log(1. - unif_noise + eps)
        )
        drop_prob = K.sigmoid(drop_prob / temp)
        random_tensor = 1. - drop_prob
        retain_prob = 1. - self.p

        # rescale the retained units, as in standard (inverted) dropout
        x *= random_tensor
        x /= retain_prob
        return x

    def call(self, inputs, training=None):
        if self.is_mc_dropout:
            # MC dropout: sample a fresh dropout mask on every forward pass,
            # at test time as well as at training time
            return self.layer.call(self.concrete_dropout(inputs))
        else:
            def relaxed_dropped_inputs():
                return self.layer.call(self.concrete_dropout(inputs))
            # standard behaviour: apply dropout only in the training phase
            return K.in_train_phase(relaxed_dropped_inputs,
                                    self.layer.call(inputs),
                                    training=training)
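

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file). It illustrates the
# docstring's recipe for deriving `weight_regularizer` and
# `dropout_regularizer` from a prior lengthscale, model precision tau, and
# dataset size N, and how MC-dropout predictions can be drawn. The numbers,
# layer sizes, and random data below are assumptions chosen purely for
# illustration.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from keras.layers import Dense, Input
    from keras.models import Model

    N = 1000             # number of training instances (assumed)
    length_scale = 1e-4  # prior lengthscale l (assumed)
    tau = 1.0            # model precision, inverse observation noise (assumed)
    wd = length_scale ** 2 / (tau * N)  # weight_regularizer = l**2 / (tau * N)
    dd = 2. / (tau * N)                 # dropout_regularizer = 2 / (tau * N)

    inp = Input(shape=(16,))
    h = ConcreteDropout(Dense(32, activation='relu'),
                        weight_regularizer=wd,
                        dropout_regularizer=dd)(inp)
    out = ConcreteDropout(Dense(1),
                          weight_regularizer=wd,
                          dropout_regularizer=dd)(h)
    model = Model(inp, out)
    model.compile(optimizer='adam', loss='mse')

    # toy regression data, for illustration only
    X = np.random.randn(N, 16)
    y = np.random.randn(N, 1)
    model.fit(X, y, epochs=1, batch_size=32, verbose=0)

    # With is_mc_dropout=True (the default), every forward pass samples a new
    # dropout mask, so repeated predictions form a predictive distribution.
    mc_samples = np.stack([model.predict(X[:5]) for _ in range(10)])
    print('predictive mean:', mc_samples.mean(axis=0).ravel())
    print('predictive std: ', mc_samples.std(axis=0).ravel())

    # The learned dropout probabilities can be read back from each wrapper.
    for layer in model.layers:
        if isinstance(layer, ConcreteDropout):
            print('learned dropout probability: %.3f' % K.eval(layer.p))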