-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbinning.py
637 lines (467 loc) · 26.3 KB
/
binning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
class Binning:
def __init__(self, c, r_ip1, N, N_bins, lags, store_frame_rate=1, uniform_bins = True, min_count = 0, verbose = True):
#number of conditional variables
N_c = c.shape[1]
self.N_c = N_c
self.N = N
self.r_ip1 = r_ip1
self.c = c
self.N_bins = N_bins
self.N_s = r_ip1.size/N**2
self.lags = lags
self.max_lag = np.max(lags)*store_frame_rate
self.covar = {}
for i in range(N_c):
#self.covar[i] = np.zeros([N**2, max_lag])
self.covar[i] = []
#compute bins of c_i
if uniform_bins == True:
bins = self.get_bins(N_bins)
else:
bins = self.get_bins_same_nsamples(N_bins)
#count = numver of r_ip1 samples in each bin
#binedges = same as bins
#binnumber = bin indices of the r_ip1 samples. A 1D array, no matter N_c
count, binedges, binnumber = stats.binned_statistic_dd(c, r_ip1, statistic='count', bins=bins)
#x_idx = np.unravel_index(binnumber, [len(b) + 1 for b in self.bins])
#x_idx = [x_idx[i] - 1 for i in range(N_c)]
#the unique set of binnumers which have at least one r_ip1 sample
unique_binnumbers = np.unique(binnumber)
#some scalars
binnumber_max = np.max(unique_binnumbers)
#array containing r_ip1 indices sorted PER BIN
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#SHOULD BE A 1D ARRAY, AN ARRAY OF SIZE [MAX(BINNUMBER), MAX(COUNT)]
#WILL STORE MOSTLY ZEROS IN HIGHER DIMENSIONS, LEADING TO MEMORY FAILURES
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
idx_of_bin = []
#number of samples in each bin
size_of_bin = []
for b in unique_binnumbers:
#r_ip1 indices of bin b
tmp = np.where(binnumber == b)[0]
idx_of_bin.append(tmp)
size_of_bin.append(tmp.size)
#make a 1d array of the nested arrays
idx_of_bin = np.array(list(chain(*idx_of_bin)))
size_of_bin = np.array(size_of_bin)
#the starting offset of each bin in the 1D array idx_of_bin
#offset[unique_binnumbers] will give the starting index in idex_of_bin for
#each bin. Some entries are zero, which correspond to empty bins (except for
#the lowest non-empty binnumber)
offset = np.zeros(unique_binnumbers.size).astype('int')
offset[1:] = np.cumsum(size_of_bin[0:-1])
tmp = np.zeros(binnumber_max+1).astype('int')
tmp[unique_binnumbers] = offset
self.offset = tmp
####################
#empty bin handling#
####################
#find indices of empty bins
#Note: if 'full' is defined as 1 or more sample, binnumbers_nonempty is
#the same as binnumbers_unique
x_idx_nonempty = np.where(count > min_count)
x_idx_nonempty_p1 = [x_idx_nonempty[i] + 1 for i in range(N_c)]
binnumbers_nonempty = np.ravel_multi_index(x_idx_nonempty_p1, [len(b) + 1 for b in bins])
N_nonempty = binnumbers_nonempty.size
#mid points of the bins
x_mid = [0.5*(bins[i][1:] + bins[i][0:-1]) for i in range(N_c)]
#mid points of the non-empty bins
midpoints = np.zeros([N_nonempty, N_c])
for i in range(N_c):
midpoints[:, i] = x_mid[i][x_idx_nonempty[i]]
self.verbose = verbose
self.bins = bins
self.count = count
self.binnumber = binnumber
self.binnumbers_nonempty = binnumbers_nonempty
self.midpoints = midpoints
self.idx_of_bin = idx_of_bin
self.unique_binnumbers = unique_binnumbers
self.compute_binnumber_per_bin()
self.fill_in_blanks()
#mean r / var per cell
self.rmean, _, _ = stats.binned_statistic_dd(c, r_ip1, statistic='mean', bins=bins)
#self.rstd, _, _ = stats.binned_statistic_dd(c, r_ip1, statistic=std_per_bin, bins=bins)
#self.rmax, _, _ = stats.binned_statistic_dd(c, r_ip1, statistic='max', bins=bins)
#self.rmin, _, _ = stats.binned_statistic_dd(c, r_ip1, statistic='min', bins=bins)
#self.compute_c_samples_per_bin()
#check which c_i fall within empty bins and correct binnumbers_i by
#projecting to the nearest non-empty bin
def check_outliers(self, binnumbers_i, c_i):
#find out how many BINS with outliers there are
unique_binnumbers_i = np.unique(binnumbers_i)
idx = np.where(np.in1d(unique_binnumbers_i, self.binnumbers_nonempty) == False)[0]
N_outlier_bins = idx.size
if N_outlier_bins > 0:
#index of outlier SAMPLES in binnumbers_i
outliers_idx = np.in1d(binnumbers_i, unique_binnumbers_i[idx]).nonzero()[0]
N_outliers = outliers_idx.size
if self.verbose == True:
print N_outlier_bins, ' bins with', N_outliers ,'outlier samples found'
#x location of outliers
x_outliers = np.copy(c_i[outliers_idx])
#find non-empty bin closest to the outliers
closest_idx = np.zeros(N_outliers).astype('int')
for i in range(N_outliers):
if self.N_c == 1:
dist = np.linalg.norm(self.midpoints - x_outliers[i], 2, 1)
else:
dist = np.linalg.norm(self.midpoints - x_outliers[i,:], 2, 1)
closest_idx[i] = np.argmin(dist)
binnumbers_closest = self.binnumbers_nonempty[closest_idx]
check = self.binnumbers_nonempty[closest_idx]
x_idx_check = np.unravel_index(check, [len(b) + 1 for b in self.bins])
x_idx_check = [x_idx_check[i] - 1 for i in range(self.N_c)]
#overwrite outliers in binnumbers_i with nearest non-empty binnumber
binnumbers_closest = self.binnumbers_nonempty[closest_idx]
if self.verbose == True:
print 'Moving', binnumbers_i[outliers_idx], '-->', binnumbers_closest
binnumbers_i[outliers_idx] = binnumbers_closest
def get_binnumbers_i(self, c_i):
#find in which bins the c_i samples fall, shape: (N**2,)
_, _, binnumbers_i = stats.binned_statistic_dd(c_i, np.zeros(self.N**2), bins=self.bins)
return binnumbers_i
#create an apriori mapping between every possible bin and the nearest
#non-empty bin. Non-empty bins will link to themselves.
def fill_in_blanks(self):
bins_padded = []
#mid points of all possible bins, including ghost bins
for i in range(self.N_c):
dx1 = self.bins[i][1] - self.bins[i][0]
dx2 = self.bins[i][-1] - self.bins[i][-2]
#pad the beginning and end of current 1D bin with extrapolated values
bin_pad = np.pad(self.bins[i], (1,1), 'constant', constant_values=(self.bins[i][0]-dx1, self.bins[i][-1]+dx2))
bins_padded.append(bin_pad)
#compute the midpoints of the padded bins
x_mid_pad = [0.5*(bins_padded[i][1:] + bins_padded[i][0:-1]) for i in range(self.N_c)]
self.x_mid_pad_tensor = np.array(list(product(*x_mid_pad)))
#total number bins
self.max_binnumber = self.x_mid_pad_tensor.shape[0]
mapping = np.zeros(self.max_binnumber).astype('int')
for i in range(self.max_binnumber):
#bin is nonempty, just use current idx
if np.in1d(i, self.unique_binnumbers) == True:
mapping[i] = i
#bin is empty, find nearest non-empty bin
else:
binnumbers_i = np.array([i])
self.check_outliers(binnumbers_i, self.x_mid_pad_tensor[i].reshape([1, self.N_c]))
mapping[i] = binnumbers_i[0]
self.mapping = mapping
#visual representation of a 2D binning object. Also shows the mapping
#between empty to nearest non-empty bins.
def plot_2D_binning_object(self):
if self.N_c != 2:
print 'Only works for N_c = 2'
return
fig = plt.figure()
ax = fig.add_subplot(111, xlabel=r'conditioning variable 1', ylabel=r'conditioning variable 2')
#plot bins and (c1, c2) which corresponding to a r sample point
ax.plot(self.c[:,0], self.c[:,1], '+', color='lightgray', alpha=0.3)
ax.vlines(self.bins[0], np.min(self.c[:,1]), np.max(self.c[:,1]))
ax.hlines(self.bins[1], np.min(self.c[:,0]), np.max(self.c[:,0]))
ax.plot(self.x_mid_pad_tensor[:,0], self.x_mid_pad_tensor[:,1], 'g+')
#plot the mapping
for i in range(self.max_binnumber):
ax.plot([self.x_mid_pad_tensor[i][0], self.x_mid_pad_tensor[self.mapping[i]][0]], \
[self.x_mid_pad_tensor[i][1], self.x_mid_pad_tensor[self.mapping[i]][1]], 'b', alpha=0.4)
ax.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
plt.tight_layout()
plt.show()
#the data-driven model for the unresolved scales
#Given c_i return r at time i+1 (r_ip1)
def get_r_ip1(self, c_i, n_mc=1):
#find in which bins the c_i samples fall
_, _, binnumbers_i = stats.binned_statistic_dd(c_i, np.zeros(self.N**2), bins=self.bins)
#dynamically corrects binnumbers_i if outliers are found
#self.check_outliers(binnumbers_i, c_i)
#static correction for outliers, using precomputed mapping array
binnumbers_i = self.mapping[binnumbers_i]
#self.check_outliers(binnumbers_i, c_i)
#convert 1D binnumbers_i to equivalent ND indices
x_idx = np.unravel_index(binnumbers_i, [len(b) + 1 for b in self.bins])
x_idx = [x_idx[i] - 1 for i in range(self.N_c)]
#random integers between 0 and max bin count for each index in binnumbers_i
I = np.floor(self.count[x_idx].reshape([self.N**2, 1])*np.random.rand(self.N**2, n_mc)).astype('int')
#the correct offset for the 1D array idx_of_bin
start = self.offset[binnumbers_i]
#get random r sample from each bin indexed by binnumbers_i
r = np.zeros([n_mc, self.N, self.N])
for i in range(n_mc):
r[i, :, :] = self.r_ip1[self.idx_of_bin[start + I[:, i]]].reshape([self.N, self.N])
return np.mean(r, 0) #, binnumbers_i
#the data-driven model for the unresolved scales
#Given c_i return bin averaged r at time i+1
def get_mean_r_ip1(self, c_i):
#find in which bins the c_i samples fall
_, _, binnumbers_i = stats.binned_statistic_dd(c_i, np.zeros(self.N**2), bins=self.bins)
#dynamically corrects binnumbers_i if outliers are found
#self.check_outliers(binnumbers_i, c_i)
#static correction for outliers, using precomputed mapping array
binnumbers_i = self.mapping[binnumbers_i]
#convert 1D binnumbers_i to equivalent ND indices
x_idx = np.unravel_index(binnumbers_i, [len(b) + 1 for b in self.bins])
x_idx = [x_idx[i] - 1 for i in range(self.N_c)]
return self.rmean[x_idx].reshape([self.N, self.N]) #, self.rstd[x_idx].reshape([self.N, self.N])
#Given c_i, randomly sample from sub bin averages, according to the jump pmfs
def get_sub_mean_r_ip1(self, c_i):
#find in which bins the c_i samples fall
_, _, binnumbers_i = stats.binned_statistic_dd(c_i, np.zeros(self.N**2), bins=self.bins)
#dynamically corrects binnumbers_i if outliers are found
#self.check_outliers(binnumbers_i, c_i)
#static correction for outliers, using precomputed mapping array
binnumbers_i = self.mapping[binnumbers_i]
#the selected jump pmfs for each spatial point
pmfs = self.jump_pmfs[binnumbers_i]
#N**2 random U(0, 1) samples to select the sub bin
xi = np.random.rand(self.N**2)
#draw a random sample from each pmf
sub_idx = [pmf_i.ppf(xi_i) for xi_i, pmf_i in zip(xi, pmfs)] #QUESTION: CAN THIS BE VECTORIZED?
sub_idx = np.array(sub_idx).astype('int')
return self.r_sub_mean[binnumbers_i, sub_idx].reshape([self.N, self.N])
#append the covariates supplied to the binning object during simulation
#to self.covars
#Note: use list to dynamically append, array is very slow
def append_covar(self, c_i):
for i in range(self.N_c):
self.covar[i].append(c_i[:,i])
#if max number of covariates is reached, remove first item
if len(self.covar[i]) > self.max_lag:
self.covar[i].pop(0)
#return lagged covariates, assumes constant lag
#Note: typecast to array if spatially varying lag is required
def get_covar(self, lags):
c_i = np.zeros([self.N**2, self.N_c])
for i in range(self.N_c):
if lags[i] <= self.max_lag:
c_i[:, i] = self.covar[i][-lags[i]]
else:
print 'Warning, max lag exceeded'
import sys; sys.exit()
return c_i
#compute the probabilities of staying or jumping to the left/right of the current bin,
#when sampling randomly from each bin
#ASSUMES AUTOCORRELATED SURROGATE WITH ONE COVARIATE
def compute_surrogate_jump_probabilities(self, plot=False):
if self.N_c != 1:
print 'Only works for N_c = 1'
return
#store the jump probabilities of the surrogate per bin
self.surrogate_jump_probs = np.zeros([self.N_bins+1, 3])
#store the r sample means of each jump probability per bin
self.r_sub_mean = np.zeros([self.N_bins+1, 3])
if plot == True:
fig = plt.figure()
ax = fig.add_subplot(111, xlabel=r'$\mathcal{C}_i$', ylabel=r'$\overline{r}_{i+1}$')
for j in range(self.unique_binnumbers.size):
J = self.unique_binnumbers[j]
#indices of all r samples in the current bin
idx = np.where(self.binnumber == J)[0]
#r samples of the current bin
r_idx = self.r_ip1[idx]
if r_idx.size > 0:
#indices of r_idx, IF USED AS COVARIATE
_, _, bins_idx = stats.binned_statistic_dd(r_idx, np.zeros(r_idx.size), bins=self.bins)
#number of samples in the current bin
nsamples_bin = np.double(idx.size)
print '++++++++++++++++++++'
print 'Bin ', J
#indices that are unchanged, larger or smaller than the current bin index
same_idx = np.where(bins_idx== J)[0]
left_idx = np.where(bins_idx < J)[0]
right_idx = np.where(bins_idx > J)[0]
#compute jump probabilities
self.surrogate_jump_probs[J, 0] = left_idx.size/nsamples_bin
self.surrogate_jump_probs[J, 1] = same_idx.size/nsamples_bin
self.surrogate_jump_probs[J, 2] = right_idx.size/nsamples_bin
print 'Number of samples in bin =', nsamples_bin
print 'Jump left probability = ', left_idx.size/nsamples_bin
print 'Stay probability =', same_idx.size/nsamples_bin
print 'Jump right probability =', right_idx.size/nsamples_bin
#bin mean associated with each jump event
self.r_sub_mean[J, 0] = np.mean(self.r_ip1[idx[left_idx]])
self.r_sub_mean[J, 1] = np.mean(self.r_ip1[idx[same_idx]])
self.r_sub_mean[J, 2] = np.mean(self.r_ip1[idx[right_idx]])
#colorcode samples per bin based on jump behavior
if plot == True:
#color code samples in current bin based on the bin that will be occupied in next iteration
ax.plot(self.c[idx[same_idx]], self.r_ip1[idx[same_idx]], '+', color='darkgray', alpha = 0.1)
ax.plot(self.c[idx[left_idx]], self.r_ip1[idx[left_idx]], '+', color='lightgray', alpha = 0.1)
ax.plot(self.c[idx[right_idx]], self.r_ip1[idx[right_idx]], '+', color='lightgray', alpha = 0.1)
#plot mean of the stay, jump left/right samples per bin
ax.plot(np.mean(self.c[idx[left_idx]]), self.r_sub_mean[J, 0], 'ko')
ax.plot(np.mean(self.c[idx[same_idx]]), self.r_sub_mean[J, 1], 'ko')
ax.plot(np.mean(self.c[idx[right_idx]]), self.r_sub_mean[J, 2], 'ko')
ax.vlines(self.bins, np.min(self.r_ip1), np.max(self.r_ip1))
plt.tight_layout()
#plt.savefig('jump.png', dpi=400)
#Compute the probabilities on staying, jumping to the left or the right of the current bin
#Based on the r data
def compute_jump_probabilities(self):
print 'Computing jump probabilities using', self.N_s, 'data fields'
#reshape the binnumbers of the data
binnumbers_k = self.binnumber.reshape([self.N**2, self.N_s])
#store the number/prob of jumps to the left, in place or to the right
#size = N_bins+1 since bin indices start at 1
self.jumps = np.zeros([self.N_bins+1, 3])
self.jump_probs = np.zeros([self.N_bins+1, 3])
self.jump_pmfs = []
self.jump_pmfs.append([])
for s in range(np.max(self.lags), self.N_s):
#the binnumbers of the s-th data entry
binnumbers_i = binnumbers_k[:, s-np.max(self.lags)]
#the binnumbers of the (s+1)-th data entry
binnumbers_ip1 = binnumbers_k[:, s]
#no bin change indices
no_jump_idx = np.where(binnumbers_i == binnumbers_ip1)[0]
#jumped to the left indices
jump_left_idx = np.where(binnumbers_i > binnumbers_ip1)[0]
#jumped to the right indices
jump_right_idx = np.where(binnumbers_i < binnumbers_ip1)[0]
#count number of jumps (IN CASE OF A FIELD SAMPLE)
#count_bins = np.arange(0.5, self.N_bins+1, 1) #bins surrounding the binnumbers 1, 2, 3, ... (bins=[0.5, 1.5, 2.5, 3.5, ...])
#jump_left_count, _, _ = stats.binned_statistic(binnumbers_i[jump_left_idx], np.zeros(jump_left_idx.size), statistic='count', bins=count_bins)
#stay_count, _, _ = stats.binned_statistic(binnumbers_i[no_jump_idx], np.zeros(no_jump_idx.size), statistic='count', bins=count_bins)
#jump_right_count, _, _ = stats.binned_statistic(binnumbers_i[jump_right_idx], np.zeros(jump_right_idx.size), statistic='count', bins=count_bins)
#
#self.jumps[1:, 0] += jump_left_count
#self.jumps[1:, 1] += stay_count
#self.jumps[1:, 2] += jump_right_count
jump_left_count = jump_left_idx.size
stay_count = no_jump_idx.size
jump_right_count = jump_right_idx.size
self.jumps[binnumbers_i, 0] += jump_left_count
self.jumps[binnumbers_i, 1] += stay_count
self.jumps[binnumbers_i, 2] += jump_right_count
#compute jump probabilities per bin
for j in range(1, self.N_bins+1):
self.jump_probs[j,:] = self.jumps[j,:]/np.sum(self.jumps[j,:])
print 'Jump probabities bin ', j, '=', self.jump_probs[j, :]
#create probability mass functions
self.jump_pmfs.append(stats.rv_discrete(values=(range(3), self.jump_probs[j,:])))
self.jump_pmfs = np.array(self.jump_pmfs)
#plot the jump probability of the data and the surrogate per bin
def plot_jump_pmfs(self):
try:
fig = plt.figure(figsize = [6.4, 5.8])
x = np.arange(1, self.N_bins+1)
#left probs
ax = fig.add_subplot(311, ylabel=r'jump left prob.', xlabel=r'bin', ylim=[0, 1])
ax.plot(x + 0.2, self.jump_probs[1:, 0], 'bo', ms=12, label=r'$\mathbb{P}\left(\bar{r}_{i+1}\in b_{j-k}\mid\bar{r}_i\in b_j\right)$')
ax.vlines(x + 0.2, 0, self.jump_probs[1:, 0], colors='b', lw=4)
ax.plot(x - 0.2, self.surrogate_jump_probs[1:, 0], 'rs', ms=12, label=r'$\mathbb{P}\left(\widetilde{r}_{i+1}\in b_{j-k}\mid\widetilde{r}_i\in b_j\right)$')
ax.vlines(x - 0.2, 0, self.surrogate_jump_probs[1:, 0], colors='r', lw=4)
leg = plt.legend(loc=0)
leg.draggable(True)
ax.set_xticks(range(self.N_bins+1))
#right probs
ax = fig.add_subplot(312, ylabel=r'stay prob.', xlabel=r'bin', ylim=[0, 1])
ax.plot(x + 0.2, self.jump_probs[1:, 1], 'bo', ms=12, label=r'$\mathbb{P}\left(\bar{r}_{i+1}\in b_{j}\mid\bar{r}_i\in b_j\right)$')
ax.vlines(x + 0.2, 0, self.jump_probs[1:, 1], colors='b', lw=4)
ax.plot(x - 0.2, self.surrogate_jump_probs[1:, 1], 'rs', ms = 12, label=r'$\mathbb{P}\left(\widetilde{r}_{i+1}\in b_{j}\mid\widetilde{r}_i\in b_j\right)$')
ax.vlines(x - 0.2, 0, self.surrogate_jump_probs[1:, 1], colors='r', lw=4)
leg = plt.legend(loc=0)
leg.draggable(True)
ax.set_xticks(range(self.N_bins+1))
#right probs
ax = fig.add_subplot(313, ylabel=r'jump right prob.', xlabel=r'bin', ylim=[0, 1])
ax.plot(x + 0.2, self.jump_probs[1:, 2], 'bo', ms = 12, label=r'$\mathbb{P}\left(\bar{r}_{i+1}\in b_{j+k}\mid\bar{r}_i\in b_j\right)$')
ax.vlines(x + 0.2, 0, self.jump_probs[1:, 2], colors='b', lw=4)
ax.plot(x - 0.2, self.surrogate_jump_probs[1:, 2], 'rs', ms = 12, label=r'$\mathbb{P}\left(\widetilde{r}_{i+1}\in b_{j+k}\mid\widetilde{r}_i\in b_j\right)$')
ax.vlines(x - 0.2, 0, self.surrogate_jump_probs[1:, 2], colors='r', lw=4)
leg = plt.legend(loc=0)
leg.draggable(True)
ax.set_xticks(range(self.N_bins+1))
plt.tight_layout()
#if jump probabilities do not exist
except AttributeError:
plt.close()
print 'Compute probabilities first'
#c samples per bin stored as an array of arrays instead of 1d array
def compute_c_samples_per_bin(self):
c_per_bin = []
for b in range(np.max(self.unique_binnumbers)+1):
idx = np.where(self.binnumber == b)[0]
c_per_bin.append(self.c[idx])
self.c_per_bin = np.array(c_per_bin)
#same as idx_per_bin only stored as a dict instead of 1d array
def compute_binnumber_per_bin(self):
idx_per_bin_dict = {}
for b in self.unique_binnumbers:
#r_ip1 indices of bin b
idx_per_bin_dict[b] = np.where(self.binnumber == b)[0]
self.idx_per_bin_dict = idx_per_bin_dict
def plot_samples_per_bin(self, subsample=1):
if self.N_c == 1:
fig = plt.figure()
ax = fig.add_subplot(111, xlabel=r'$\mathrm{conditioning\;variable}$', ylabel=r'$\mathrm{reference\;data}$')
for b in self.unique_binnumbers:
c_b = self.c[self.idx_per_bin_dict[b]]
r_b = self.r_ip1[self.idx_per_bin_dict[b]]
ax.plot(c_b[0:-1:subsample], r_b[0:-1:subsample], '+', color='lightgray', alpha=0.3)
ax.plot(np.mean(c_b), np.mean(r_b), 'ko', markersize=6)
ax.vlines(self.bins, np.min(self.r_ip1), np.max(self.r_ip1))
ax.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
plt.tight_layout()
#plt.savefig('bins.png', dpi=300)
plt.show()
#plots the mean and std dev per bin, ASSUMES 2 COVARIATES
def plot_bin_stats(self):
if self.N_c != 2:
print 'Only works for 2D c'
else:
fig = plt.figure(figsize=[8.0, 3.71])
ax = fig.add_subplot(121)
ax.set_title(r'$\mathbb{E}[r]$', fontsize=20)
ax.set_xlabel(r'$c_1\;\mathrm{bins}$', fontsize=20)
ax.set_ylabel(r'$c_2\;\mathrm{bins}$', fontsize=20)
im = ax.imshow(self.rmean)
plt.colorbar(im)
#
ax = fig.add_subplot(122)
ax.set_title(r'$\sqrt{\mathbb{V}ar[r]}$', fontsize=20)
ax.set_xlabel(r'$c_1\;\mathrm{bins}$', fontsize=20)
ax.set_ylabel(r'$c_2\;\mathrm{bins}$', fontsize=20)
im = ax.imshow(self.rstd)
plt.colorbar(im)
plt.tight_layout()
plt.show()
def print_bin_info(self):
print '-------------------------------'
print 'Total number of samples= ', self.r_ip1.size
print 'Total number of bins = ', self.N_bins**self.N_c
print 'Total number of non-empty bins = ', self.binnumbers_nonempty.size
print 'Percentage filled = ', np.double(self.binnumbers_nonempty.size)/self.N_bins**self.N_c*100., ' %'
print '-------------------------------'
#compute the uniform bins of the conditional variables in c
def get_bins(self, N_bins):
bins = []
for i in range(self.N_c):
bins.append(np.linspace(np.min(self.c[:,i]), np.max(self.c[:,i]), N_bins+1))
return bins
def get_bins_same_nsamples(self, N_bins):
if self.N_c != 1:
print 'Only works for N_c = 1'
return
Ns = self.c.size
delta = Ns/N_bins
c_sorted = np.sort(self.c.reshape(Ns))
bins = []
bins.append(c_sorted[0])
for i in range(N_bins-1):
bins.append(c_sorted[(i+1)*delta])
bins.append(c_sorted[-1])
#plt.figure()
#plt.plot(c_sorted, 'b')
#plt.plot(np.zeros(N_bins+1), bins, 'ro')
return [np.array(bins)]
def std_per_bin(x):
return np.var(x)**0.5
import numpy as np
from scipy import stats
from itertools import chain, product
import matplotlib.pyplot as plt
import itertools