# kohonen.py
"""Python script for Exercise set 6 of the Unsupervised and
Reinforcement Learning.
"""
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as plb
import os.path as path
import pickle as pkl

def load_data(name='Bertrand Champenois'):
    """ Loads the required digits for the given name. Also serialises them to
    disk for faster future loading.
    Returns: (data, labels, target_digits) -- data is only the subset of the 4 selected digits
    """
    target_digits = name2digits(name)  # assign the four digits that should be used
    print('Your digits: ', target_digits)
    # labels is small and fast to load, no need to serialise
    labels = np.loadtxt('labels.txt')
    target_idxs = np.logical_or.reduce([labels == x for x in target_digits])
    labels = labels[target_idxs]  # keep our digits only
    # data is big and takes time; check for a serialised version of the subset
    target_digits_path = name + '.pkl'
    if path.isfile(target_digits_path):
        print('Loading from binary...')
        with open(target_digits_path, 'rb') as digits_file:
            data = pkl.load(digits_file)
    else:
        print('Loading from text...')
        data = np.loadtxt('data.txt')
        data = data[target_idxs, :]  # keep our digits only
        # serialise to disk:
        with open(target_digits_path, 'wb') as digits_file:
            pkl.dump(data, digits_file)
        print('Target digits serialised to %s' % target_digits_path)
    return data, labels, target_digits
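
# A minimal usage sketch (assumes labels.txt and data.txt from the exercise
# set are in the working directory):
#
#   data, labels, digits = load_data('Bertrand Champenois')
#   print(data.shape)   # (n_examples, 784) -- one flattened 28x28 digit per row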

def standardize(data, mean=None, std=None):
    """ Standardizes the data to have zero mean and unit variance per feature.
    If `mean` and `std` are not given, they are computed from the data;
    otherwise they are applied directly (as for testing data).
    Returns: (data_transformed, mean, std)
    """
    if mean is None:
        mean = np.mean(data, axis=0)
    data = data - mean
    if std is None:
        std = np.std(data, axis=0)
    data[:, std > 0] = data[:, std > 0] / std[std > 0]  # avoid division by zero
    return data, mean, std

def de_standardize(data, mean, std):
    """ Reverses standardize(): scales by `std` and shifts by `mean`. """
    return data * std + mean
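
# Round-trip sketch (`raw` is a hypothetical (n_examples, n_features) array):
#
#   transformed, mu, sd = standardize(raw)
#   restored = de_standardize(transformed, mu, sd)   # restored == raw (up to float error)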

def score(centers, data):
    """ Computes how well the centers fit the data (lower is better):
    the squared distance from each example to its closest center,
    averaged over features and examples.
    """
    s = 0
    for example in data:
        s += np.min(np.sum((centers - example)**2, axis=1)) / data.shape[1]  # normalise by num_features
    return s / data.shape[0]  # normalise by num_examples
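
# Worked example: with two centers and a single 2-feature example, score()
# returns the squared distance to the nearer center, averaged over features:
#
#   centers = np.array([[0., 0.], [10., 10.]])
#   score(centers, np.array([[1., 1.]]))   # -> min(2, 162) / 2 / 1 = 1.0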

def build_and_train(data, size=6, eta=0.1, sigma=3, tmax=4000, epoch=50, seed=None):
    """ Initializes and trains a SOM with the given parameters.
    size (int) size of the map side -> map is (size x size) neurons
    eta (float) learning rate -- in (0, 1)
    sigma (int) width of the gaussian neighbourhood
    tmax (int) number of training steps (random datapoint presentations)
    epoch (int) interval (in steps) at which the score is computed
    seed passed to numpy.random for reproducibility
    Returns: (centers, scores)
    """
    # initialise the centers randomly
    np.random.seed(seed)
    data_range = 255.0
    centers = np.random.rand(size**2, data.shape[1]) * data_range
    neighbor = np.arange(size**2).reshape((size, size))
    # set the random order in which the datapoints should be presented
    idxs_random = np.arange(tmax) % data.shape[0]
    np.random.shuffle(idxs_random)
    # train
    scores = []
    for step, idx in enumerate(idxs_random):
        som_step(centers, data[idx, :], neighbor, eta, sigma)
        if step % epoch == 0:
            scores.append(score(centers, data))
        # print progress once every 10% of training
        if step % max(tmax // 10, 1) == 0:
            print(step * 100 // tmax, end='%, ')
    print("100%")
    return centers, scores
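
# A minimal usage sketch (assumes the exercise data files are available):
#
#   data, labels, digits = load_data()
#   centers, scores = build_and_train(data, size=6, eta=0.1, sigma=3,
#                                     tmax=4000, epoch=50, seed=42)
#   plb.plot(scores)   # quantization error, sampled every `epoch` steps
#   plb.show()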

def kohonen():
    """Example for using load_data and som_step.
    """
    plb.close('all')
    data, labels, target_digits = load_data()  # default name
    # Kohonen algorithm hyper-parameters
    size_k = 6  # size of the Kohonen map (size_k, size_k)
    sigma = 1  # width of the gaussian neighborhood
    eta = 0.1  # learning rate
    tmax = 5 * 2000  # max iteration count; substitutes a convergence criterion
    # initialise the centers randomly
    dim = data.shape[1]  # 28*28 = 784
    data_range = 255.0
    centers = np.random.rand(size_k**2, dim) * data_range
    # build a neighborhood matrix
    neighbor = np.arange(size_k**2).reshape((size_k, size_k))
    # set the random order in which the datapoints should be presented
    idxs_random = np.arange(tmax) % data.shape[0]
    np.random.shuffle(idxs_random)
    movs = []  # movement created at each step
    for example_idx in idxs_random:
        mov, win = som_step(centers, data[example_idx, :], neighbor, eta, sigma)
        movs.append(mov)
    plb.plot(movs)
    plb.show()
    # visualise each prototype as a 28x28 image
    for j in range(size_k ** 2):
        plb.subplot(size_k, size_k, j + 1)
        plb.imshow(np.reshape(centers[j, :], [28, 28]), interpolation='nearest', cmap='Greys')
        plb.axis('off')
    plb.show()

def som_step(centers, data, neighbor, eta, sigma):
    """ Performs one step of the sequential learning for a self-organized map (SOM).
        centers = som_step(centers, data, neighbor, eta, sigma)
    Input and output arguments:
        centers  (matrix) cluster centres; has to be in the format center X dimension
        data     (vector) the actual datapoint to be presented in this timestep
        neighbor (matrix) the layout of the centers in the desired neighborhood
        eta      (scalar) the learning rate
        sigma    (scalar) the width of the gaussian neighborhood function,
                 effectively describing the width of the neighborhood
    Returns: (tuple)
        movement total movement created by this `data`, normalised by map size
        winner   the center closest to this datapoint
    """
    size_k = int(np.sqrt(len(centers)))
    # find the best-matching unit via the minimal distance to the datapoint
    winner = np.argmin(np.sum((centers - data)**2, axis=1))
    win_x, win_y = np.nonzero(neighbor == winner)
    # total movement produced by this example
    movement = 0
    # update all units
    for c in range(size_k**2):
        # find the coordinates of this unit on the map grid
        c_x, c_y = np.nonzero(neighbor == c)
        # calculate the grid distance to the winner and the discounting factor
        dist = np.sqrt((win_x - c_x)**2 + (win_y - c_y)**2)
        disc = gauss(dist, 0, sigma)
        # update the weights and accumulate the movement
        c_movement = disc * (data - centers[c, :])
        centers[c, :] += eta * c_movement
        movement += np.linalg.norm(c_movement)
    return movement / (size_k**2), winner
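
# The update applied in som_step is the standard sequential SOM rule:
#
#   w_c <- w_c + eta * exp(-d(c, winner)^2 / (2 * sigma^2)) * (x - w_c)
#
# where d(c, winner) is the Euclidean distance between the two units on the
# (size_k x size_k) map grid (via `neighbor`), not in the input space.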

def gauss(x, m, s):
    """Return the gauss function N(x), with mean m and std s.
    Normalized such that N(x=m) = 1.
    """
    return np.exp((-(x - m)**2) / (2 * s * s))

def name2digits(name):
    """ Takes a string NAME and converts it into a pseudo-random selection of 4
    digits from 0-9.
    Example:
        name2digits('Felipe Gerhard')
        returns: [0 4 5 7]
    """
    name = name.lower()
    if len(name) > 25:
        name = name[0:25]
    primenumbers = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
                    53, 59, 61, 67, 71, 73, 79, 83, 89, 97]
    n = len(name)
    s = 0.0
    for i in range(n):
        s += primenumbers[i] * ord(name[i]) * 2.0**(i + 1)
    import scipy.io.matlab  # lazy import: only needed for the hash lookup
    Data = scipy.io.matlab.loadmat('hash.mat', struct_as_record=True)
    x = Data['x']
    t = int(np.mod(s, x.shape[0]))
    return np.sort(x[t, :])

if __name__ == "__main__":
    kohonen()