-
Notifications
You must be signed in to change notification settings - Fork 0
/
kde2d_trimmed.m
executable file
·296 lines (251 loc) · 9.4 KB
/
kde2d_trimmed.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
function [bandwidth,density,X,Y]=kde2d_trimmed(alpha,data,n,MIN_XY,MAX_XY,bw)
% function [bandwidth,density,X,Y]=kde2d_trimmed(alpha,data,n,MIN_XY,MAX_XY,bw)
% The level-set trimmed version of Botev et al.'s kde2d code. It will
% (1) Estimate the initial density with kde2d
% (2) Trim off the data points with low densities such that the total
% removed is <= alpha
% (3) Rerun kde2d with the trimmed dataset
%
% Documetation from kde2d:
%
% fast and accurate state-of-the-art
% bivariate kernel density estimator
% with diagonal bandwidth matrix.
% The kernel is assumed to be Gaussian.
% The two bandwidth parameters are
% chosen optimally without ever
% using/assuming a parametric model for the data or any "rules of thumb".
% Unlike many other procedures, this one
% is immune to accuracy failures in the estimation of
% multimodal densities with widely separated modes (see examples).
% INPUTS: data - an N by 2 array with continuous data
% n - size of the n by n grid over which the density is computed
% n has to be a power of 2, otherwise n=2^ceil(log2(n));
% the default value is 2^8;
% MIN_XY,MAX_XY- limits of the bounding box over which the density is computed;
% the format is:
% MIN_XY=[lower_Xlim,lower_Ylim]
% MAX_XY=[upper_Xlim,upper_Ylim].
% The dafault limits are computed as:
% MAX=max(data,[],1); MIN=min(data,[],1); Range=MAX-MIN;
% MAX_XY=MAX+Range/4; MIN_XY=MIN-Range/4;
% OUTPUT: bandwidth - a row vector with the two optimal
% bandwidths for a bivaroate Gaussian kernel;
% the format is:
% bandwidth=[bandwidth_X, bandwidth_Y];
% density - an n by n matrix containing the density values over the n by n grid;
% density is not computed unless the function is asked for such an output;
% X,Y - the meshgrid over which the variable "density" has been computed;
% the intended usage is as follows:
% surf(X,Y,density)
% Example (simple Gaussian mixture)
% clear all
% % generate a Gaussian mixture with distant modes
% data=[randn(500,2);
% randn(500,1)+3.5, randn(500,1);];
% % call the routine
% [bandwidth,density,X,Y]=kde2d(data);
% % plot the data and the density estimate
% contour3(X,Y,density,50), hold on
% plot(data(:,1),data(:,2),'r.','MarkerSize',5)
%
% Example (Gaussian mixture with distant modes):
%
% clear all
% % generate a Gaussian mixture with distant modes
% data=[randn(100,1), randn(100,1)/4;
% randn(100,1)+18, randn(100,1);
% randn(100,1)+15, randn(100,1)/2-18;];
% % call the routine
% [bandwidth,density,X,Y]=kde2d(data);
% % plot the data and the density estimate
% surf(X,Y,density,'LineStyle','none'), view([0,60])
% colormap hot, hold on, alpha(.8)
% set(gca, 'color', 'blue');
% plot(data(:,1),data(:,2),'w.','MarkerSize',5)
%
% Example (Sinusoidal density):
%
% clear all
% X=rand(1000,1); Y=sin(X*10*pi)+randn(size(X))/3; data=[X,Y];
% % apply routine
% [bandwidth,density,X,Y]=kde2d(data);
% % plot the data and the density estimate
% surf(X,Y,density,'LineStyle','none'), view([0,70])
% colormap hot, hold on, alpha(.8)
% set(gca, 'color', 'blue');
% plot(data(:,1),data(:,2),'w.','MarkerSize',5)
%
% Notes: If you have a more accurate density estimator
% (as measured by which routine attains the smallest
% L_2 distance between the estimate and the true density) or you have
% problems running this code, please email me at [email protected]
% Reference: Z. I. Botev, J. F. Grotowski and D. P. Kroese
% "KERNEL DENSITY ESTIMATION VIA DIFFUSION" ,Submitted to the
% Annals of Statistics, 2009
% 4/2011 bst modified it to add a bandwidth parameter
% 9/2011 bst modified it to do the trimming
global N A2 I
if nargin<2 || isempty(n) %%%needed for the HACK by bst
n=2^8;
end
n=2^ceil(log2(n)); % round up n to the next power of 2;
N=size(data,1);
if nargin<3 || isempty(MIN_XY) || isempty(MAX_XY) %%%needed for the HACK by bst
MAX=max(data,[],1); MIN=min(data,[],1); Range=MAX-MIN;
MAX_XY=MAX+Range/4; MIN_XY=MIN-Range/4;
end
scaling=MAX_XY-MIN_XY;
if N<=size(data,2)
error('data has to be an N by 2 array where each row represents a two dimensional observation')
end
transformed_data=(data-repmat(MIN_XY,N,1))./repmat(scaling,N,1);
%bin the data uniformly using regular grid;
initial_data=ndhist(transformed_data,n);
%%%%%
% discrete cosine transform of initial data
a= dct2d(initial_data);
% now compute the optimal bandwidth^2
I=(0:n-1).^2; A2=a.^2;
t_star=fzero( @(t)(t-evolve(t)),[0,0.1]);
p_02=func([0,2],t_star);p_20=func([2,0],t_star); p_11=func([1,1],t_star);
t_y=(p_02^(3/4)/(4*pi*N*p_20^(3/4)*(p_11+sqrt(p_20*p_02))))^(1/3);
t_x=(p_20^(3/4)/(4*pi*N*p_02^(3/4)*(p_11+sqrt(p_20*p_02))))^(1/3);
%%%HACK by bst (4/2011) for convenience
if exist('bw','var') && ~isempty(bw)
t_x = (bw(1)/scaling(1))^2;
t_y = (bw(2)/scaling(2))^2;
end
%%%END HACK
% smooth the discrete cosine transform of initial data using t_star
a_t=exp(-(0:n-1)'.^2*pi^2*t_x/2)*exp(-(0:n-1).^2*pi^2*t_y/2).*a;
% now apply the inverse discrete cosine transform
if nargout>1
numel(a_t);
prod(scaling);
density=idct2d(a_t)*(numel(a_t)/prod(scaling));
[X,Y]=meshgrid(MIN_XY(1):scaling(1)/(n-1):MAX_XY(1),MIN_XY(2):scaling(2)/(n-1):MAX_XY(2));
end
bandwidth=sqrt([t_x,t_y]).*scaling;
%%%%%
%%%find places to trim (BST)
dent = density';
ii = find(initial_data(:)~=0);
[dd jj] = sort(dent(ii));
cdd = cumsum(dd);
cdd = cdd/(cdd(end));
kk = find(cdd>alpha,1);
if isempty(kk) || kk<=1
return
end
initial_data(ii(jj(1:kk-1))) = 0; % trimmed
%%%%%
% discrete cosine transform of initial data
a= dct2d(initial_data);
% now compute the optimal bandwidth^2
I=(0:n-1).^2; A2=a.^2;
t_star=fzero( @(t)(t-evolve(t)),[0,0.1]);
p_02=func([0,2],t_star);p_20=func([2,0],t_star); p_11=func([1,1],t_star);
t_y=(p_02^(3/4)/(4*pi*N*p_20^(3/4)*(p_11+sqrt(p_20*p_02))))^(1/3);
t_x=(p_20^(3/4)/(4*pi*N*p_02^(3/4)*(p_11+sqrt(p_20*p_02))))^(1/3);
%%%HACK by bst (4/2011) for convenience
if exist('bw','var') && ~isempty(bw)
t_x = (bw(1)/scaling(1))^2;
t_y = (bw(2)/scaling(2))^2;
end
%%%END HACK
% smooth the discrete cosine transform of initial data using t_star
a_t=exp(-(0:n-1)'.^2*pi^2*t_x/2)*exp(-(0:n-1).^2*pi^2*t_y/2).*a;
% now apply the inverse discrete cosine transform
if nargout>1
numel(a_t);
prod(scaling);
density=idct2d(a_t)*(numel(a_t)/prod(scaling));
[X,Y]=meshgrid(MIN_XY(1):scaling(1)/(n-1):MAX_XY(1),MIN_XY(2):scaling(2)/(n-1):MAX_XY(2));
end
bandwidth=sqrt([t_x,t_y]).*scaling;
%%%%%
end
%#######################################
function [out,time]=evolve(t)
global N
Sum_func = func([0,2],t) + func([2,0],t) + 2*func([1,1],t);
time=(2*pi*N*Sum_func)^(-1/3);
out=(t-time)/time;
end
%#######################################
function out=func(s,t)
global N
if sum(s)<=4
Sum_func=func([s(1)+1,s(2)],t)+func([s(1),s(2)+1],t); const=(1+1/2^(sum(s)+1))/3;
time=(-2*const*K(s(1))*K(s(2))/N/Sum_func)^(1/(2+sum(s)));
out=psi(s,time);
else
out=psi(s,t);
end
end
%#######################################
function out=psi(s,Time)
global I A2
% s is a vector
w=exp(-I*pi^2*Time).*[1,.5*ones(1,length(I)-1)];
wx=w.*(I.^s(1));
wy=w.*(I.^s(2));
out=(-1)^sum(s)*(wy*A2*wx')*pi^(2*sum(s));
end
%#######################################
function out=K(s)
out=(-1)^s*prod((1:2:2*s-1))/sqrt(2*pi);
end
%#######################################
function data=dct2d(data)
% computes the 2 dimensional discrete cosine transform of data
% data is an nd cube
[nrows,ncols]= size(data);
if nrows~=ncols
error('data is not a square array!')
end
% Compute weights to multiply DFT coefficients
w = [1;2*(exp(-i*(1:nrows-1)*pi/(2*nrows))).'];
weight=w(:,ones(1,ncols));
data=dct1d(dct1d(data)')';
function transform1d=dct1d(x)
% Re-order the elements of the columns of x
x = [ x(1:2:end,:); x(end:-2:2,:) ];
% Multiply FFT by weights:
transform1d = real(weight.* fft(x));
end
end
%#######################################
function data = idct2d(data)
% computes the 2 dimensional inverse discrete cosine transform
[nrows,ncols]=size(data);
% Compute wieghts
w = exp(i*(0:nrows-1)*pi/(2*nrows)).';
weights=w(:,ones(1,ncols));
data=idct1d(idct1d(data)');
function out=idct1d(x)
y = real(ifft(weights.*x));
out = zeros(nrows,ncols);
out(1:2:nrows,:) = y(1:nrows/2,:);
out(2:2:nrows,:) = y(nrows:-1:nrows/2+1,:);
end
end
%#######################################
function binned_data=ndhist(data,M)
% this function computes the histogram
% of an n-dimensional data set;
% 'data' is nrows by n columns
% M is the number of bins used in each dimension
% so that 'binned_data' is a hypercube with
% size length equal to M;
[nrows,ncols]=size(data);
bins=zeros(nrows,ncols);
for i=1:ncols
[dum,bins(:,i)] = histc(data(:,i),[0:1/M:1],1);
bins(:,i) = min(bins(:,i),M);
end
% Combine the vectors of 1D bin counts into a grid of nD bin
% counts.
binned_data = accumarray(bins(all(bins>0,2),:),1/nrows,M(ones(1,ncols)));
end