-
Notifications
You must be signed in to change notification settings - Fork 1
/
generalKernel.h
377 lines (294 loc) · 14 KB
/
generalKernel.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
#ifndef __GENERAL_KERNEL__
#define __GENERAL_KERNEL__
//#define STANDALONE
#ifndef STANDALONE
#include "lsst/afw/image.h"
namespace afwImage = lsst::afw::image;
namespace afwMath = lsst::afw::math;
#endif
#include <stdio.h>
#include "Halide.h"
#include <bitset>
#include "clock.h"
#include <math.h>
using namespace std;
using namespace Halide;
using Halide::Image;
#define PI_FLOAT 3.14159265359f //
#define NUMBER_OF_RUNS 5 //number of runs when performance testing
//Each output mask pixel is the bitwise OR of each input mask pixel that
//overlaps a nonzero kernel element for the given output pixel.
//If OR_ALL_MASK_PIXELS is true, all mask pixels overlapping a kernel
//element will be ORed without checking whether the kernel element is zero.
//Gaussians only reach zero at infinity, so this should technically be ok.
//The LSST implementation appears to calculate the kernel with doubles
//and OR when the double is nonzero, but this cutoff seems arbitrary.
//Look into an appropriate cutoff value close to zero.
#define OR_ALL_MASK_PIXELS true
// from lesson_12_using_the_gpu.cpp
// We're going to want to schedule a pipeline in several ways, so we
// define the pipeline in a class so that we can recreate it several
// times with different schedules.
// Define some Vars to use.
Var x, y, i, j, i_v, y_0, yi;
//represents polynomial:
//polynomial(x, y) = a + u*x + v*y + uu*x*x + uv*x*y + vv*y*y + uuu*x*x*x + uuv*x*x*y
// uvv*x*y*y + vvv*y*y*y
class polynomial{
public:
polynomial(float a_, float u_, float v_, float uu_, float uv_, float vv_, float uuu_,
float uuv_, float uvv_, float vvv_): a(a_), u(u_), v(v_), uu(uu_),
uv(uv_), vv(vv_), uuu(uuu_), uuv(uuv_), uvv(uvv_), vvv(vvv_) {}
Halide::Expr operator()(){
return (a + u*x + v*y + uu*x*x + uv*x*y + vv*y*y + uuu*x*x*x + uuv*x*x*y +
uvv*x*y*y + vvv*y*y*y);
}
private:
float a, u, v, uu, uv, vv, uuu, uuv, uvv, vvv;
};
//Abstract base class for 2 dimensional kernels
class kernel2D{
public:
//return a Halide Expr for the kernel at location (i, j) in the kernel
virtual Halide::Expr operator()(int i, int j) = 0;
//return a Halide Expr for the kernel in terms of Vars i and j
//used for interpolation
virtual Halide::Expr operator()() = 0;
};
//Represents a 2 dimensional gaussian with standard deviations sigmaI and sigmaJ in its
//i and j dimensions respectively. The guassian's i, j coordinate system is rotated by theta
//radians with respect to the image's x, y coordinate system.
//The guassian is normalized from -infinity to infinity, but will not be normalized for
//a finite kernel size. Normalization could be removed to save dividing by the normalization
//factor, but is here so that output matches the LSST reference more closely (this came up
//as an issue when perfoming linear interpolation with a spatially varying gaussian)
class gaussian2D: public kernel2D{
public:
gaussian2D(float sigmaI_, float sigmaJ_, float theta_)
: sigmaI(sigmaI_), sigmaJ(sigmaJ_), theta(theta_){}
// Halide::Expr operator()(int i, int j){
// return(exp(-((i*cos(theta) + j*sin(theta))*(i*cos(theta) + j*sin(theta)))
// /(2*sigmaI*sigmaI))
// *exp(-((j*cos(theta) - i*sin(theta))*(j*cos(theta) - i*sin(theta)))
// /(2*sigmaJ*sigmaJ)) / (2.0f*PI_FLOAT*sigmaI*sigmaJ));
// }
//
// Halide::Expr operator()(){
// return(exp(-((i*cos(theta) + j*sin(theta))*(i*cos(theta) + j*sin(theta)))
// /(2*sigmaI*sigmaI))
// *exp(-((j*cos(theta) - i*sin(theta))*(j*cos(theta) - i*sin(theta)))
// /(2*sigmaJ*sigmaJ)) / (2.0f*PI_FLOAT*sigmaI*sigmaJ));
// }
//Simplify the expression:
Halide::Expr operator()(int i, int j){
return(exp(-((i*cos(theta) + j*sin(theta))*(i*cos(theta) + j*sin(theta)))
/(2*sigmaI*sigmaI)
-((j*cos(theta) - i*sin(theta))*(j*cos(theta) - i*sin(theta)))
/(2*sigmaJ*sigmaJ)) / (2.0f*PI_FLOAT*sigmaI*sigmaJ));
}
Halide::Expr operator()(){
return(exp(-((i*cos(theta) + j*sin(theta))*(i*cos(theta) + j*sin(theta)))
/(2*sigmaI*sigmaI)
-((j*cos(theta) - i*sin(theta))*(j*cos(theta) - i*sin(theta)))
/(2*sigmaJ*sigmaJ)) / (2.0f*PI_FLOAT*sigmaI*sigmaJ));
}
private:
float sigmaI, sigmaJ, theta;
};
//A 2 dimensional gaussian with spatially varying standard deviations represented
//by polynomials sigmaI(x, y) and sigmaJ(x, y) in its i and j dimensions respectively. The
//guassian's i, j coordinate system is rotated by spatialTheta(x, y) (a polynomial) radians
//with respect to the image's x, y coordinate system.
//The guassian is normalized from -infinity to infinity, but will not be normalized for
//a finite kernel size. Normalization could be removed to save dividing by the normalization
//factor, but is here so that output matches the LSST reference more closely (this came up
//as an issue when perfoming linear interpolation with a spatially varying gaussian)
class spatiallyVaryingGaussian2D: public kernel2D{
public:
spatiallyVaryingGaussian2D(polynomial spatialSigmaI_, polynomial spatialSigmaJ_,
polynomial spatialTheta_)
: spatialSigmaI(spatialSigmaI_), spatialSigmaJ(spatialSigmaJ_)
, spatialTheta(spatialTheta_){}
/* Halide::Expr operator()(int i, int j){
return (exp(-((i*cos(spatialTheta()) + j*sin(spatialTheta())) *
(i*cos(spatialTheta()) + j*sin(spatialTheta()))) /
(2*spatialSigmaI()*spatialSigmaI()))
* exp(-((j*cos(spatialTheta()) - i*sin(spatialTheta())) *
(j*cos(spatialTheta()) - i*sin(spatialTheta()))) /
(2*spatialSigmaJ()*spatialSigmaJ())))
/(2.0f*PI_FLOAT*spatialSigmaI()*spatialSigmaJ());
}
Halide::Expr operator()(){
return (exp(-((i*cos(spatialTheta()) + j*sin(spatialTheta())) *
(i*cos(spatialTheta()) + j*sin(spatialTheta()))) /
(2*spatialSigmaI()*spatialSigmaI()))
* exp(-((j*cos(spatialTheta()) - i*sin(spatialTheta())) *
(j*cos(spatialTheta()) - i*sin(spatialTheta()))) /
(2*spatialSigmaJ()*spatialSigmaJ())))
/(2.0f*PI_FLOAT*spatialSigmaI()*spatialSigmaJ());
}
*/
Halide::Expr operator()(int i, int j){
return exp(-((i*cos(spatialTheta()) + j*sin(spatialTheta())) *
(i*cos(spatialTheta()) + j*sin(spatialTheta()))) /
(2*spatialSigmaI()*spatialSigmaI())
- ((j*cos(spatialTheta()) - i*sin(spatialTheta())) *
(j*cos(spatialTheta()) - i*sin(spatialTheta()))) /
(2*spatialSigmaJ()*spatialSigmaJ()));
}
Halide::Expr operator()(){
return exp(-((i*cos(spatialTheta()) + j*sin(spatialTheta())) *
(i*cos(spatialTheta()) + j*sin(spatialTheta()))) /
(2*spatialSigmaI()*spatialSigmaI())
- ((j*cos(spatialTheta()) - i*sin(spatialTheta())) *
(j*cos(spatialTheta()) - i*sin(spatialTheta()))) /
(2*spatialSigmaJ()*spatialSigmaJ()));
}
private:
polynomial spatialSigmaI, spatialSigmaJ, spatialTheta;
};
//Abstract base class for 1 dimensional kernels
class kernel1D{
public:
virtual Halide::Expr operator()(int i) = 0;
};
//Represents a 1 dimensional gaussian with standard deviation sigma.
//The guassian is not normalized.
class gaussian1D:public kernel1D{
public:
gaussian1D(float sigma_): sigma(sigma_) {}
Halide::Expr operator()(int i){
return exp(-(i*i)/(2*sigma*sigma));
}
private:
float sigma;
};
//A 1 dimensional gaussian whose standard deviation is represented by the
//spatially varying polynomial spatialSigma(x, y)
class spatiallyVaryingGaussian1D : public kernel1D{
public:
spatiallyVaryingGaussian1D(polynomial spatialSigma_): spatialSigma(spatialSigma_) {}
Halide::Expr operator()(int i){
return exp(-(i*i)/(2*spatialSigma()*spatialSigma()));
}
private:
polynomial spatialSigma;
};
//the total kernel is computed as a (possibly spatially varying) linear combination of each basis
//kernel (if there are multiple basis kernels) and convolved once with each input plane.
//Using a tuple to combine the computation of the three input planes is optional. Tuples are
//generally faster on the CPU but slower on the GPU.
class generalKernel {
public:
Image<float> image;
Image<float> variance;
Image<uint16_t> mask;
//output planes calculated using test_performance_cpu()
Image<float> image_output_cpu;
Image<float> variance_output_cpu;
Image<uint16_t> mask_output_cpu;
//output planes calculated using test_performance_gpu()
Image<float> image_output_gpu;
Image<float> variance_output_gpu;
Image<uint16_t> mask_output_gpu;
double avg_cpu_time;
double avg_gpu_time;
generalKernel(string imageLocation, int kernelSize);
//Functions to create specific types of kernels
virtual void createLinearCombinationProgram(
vector<polynomial> weights, vector<kernel2D *> kernels);
virtual void createLinearCombinationProgramWithInterpolation(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
virtual void createLinearCombinationProgramWithInterpolationNormalizeAfterInterpolation(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
//older versions that had different performance, stuck them in here for side by side
//comparison
virtual void createLinearCombinationProgramWithInterpolation1(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
virtual void createLinearCombinationProgramWithInterpolationNormalizeAfterInterpolation1(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
//Save .fits images using the LSST stack (does nothing if STANDALONE defined)
//Before running, load the LSST stack using
//$ source ./loadLSST.bash
//$ setup afw -t v10_1 (or appropriate version, also may work with simply $setup afw)
//save_fits_image("folder1/folder2/imageName") will save image to:
//folder1/folder2/imageNameKERNELSIZExKERNELSIZE.fits
//e.g. for the case of a 5x5 kernel:
//folder1/folder2/imageName5x5.fits
void save_fits_image(string imageDestination);
//Kernel schedules
virtual void schedule_for_cpu() = 0;
virtual void schedule_for_gpu() = 0;
//Benchmark kernels and get output image
virtual void test_performance_cpu(int printInfo) = 0;
virtual void test_performance_gpu(int printInfo) = 0;
//check whether the CPU output matches the .fits image
//referred to by referenceLocation
//enter 0 for little information, 1 for more details
void test_correctness(string referenceLocation, int details);
//just print tab separated numbers in a table format
void test_correctness_clean(string referenceLocation);
virtual void debug();
protected:
//Kernel is size (bounding_box*2 + 1) x (bounding_box*2 + 1)
int bounding_box;
//Functions used to bound input planes
Func image_bounded, variance_bounded, mask_bounded;
//For interpolation
Func compressedKernel;
//Halide expressions used to compute output functions
Expr total_image_output;
Expr total_variance_output;
Expr total_mask_output;
};
class generalKernelWithTuples: public generalKernel{
public:
generalKernelWithTuples(string imageLocation, int kernelSize)
: generalKernel(imageLocation, kernelSize){}
void createLinearCombinationProgram(vector<polynomial> weights, vector<kernel2D *> kernels);
void createLinearCombinationProgramWithInterpolation(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
void createLinearCombinationProgramWithInterpolationNormalizeAfterInterpolation(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
void createLinearCombinationProgramWithInterpolation1(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
void createLinearCombinationProgramWithInterpolationNormalizeAfterInterpolation1(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
void schedule_for_cpu();
void schedule_for_gpu();
void schedule_interpolation_for_cpu();
//print performance times for printInfo == 1
//do no print for printInfo == 0
void test_performance_cpu(int printInfo);
void test_performance_gpu(int printInfo);
void debug();
protected:
//Tuple containing final output of all 3 planes
Func combined_output;
};
class generalKernelWithoutTuples: public generalKernel{
public:
generalKernelWithoutTuples(string imageLocation, int kernelSize)
: generalKernel(imageLocation, kernelSize){}
void createLinearCombinationProgram(vector<polynomial> weights, vector<kernel2D *> kernels);
void createLinearCombinationProgramWithInterpolation(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
void createLinearCombinationProgramWithInterpolationNormalizeAfterInterpolation(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
void createLinearCombinationProgramWithInterpolation1(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
void createLinearCombinationProgramWithInterpolationNormalizeAfterInterpolation1(
vector<polynomial> weights, vector<kernel2D *> kernels, int interpDist);
void schedule_for_cpu();
void schedule_for_gpu();
void schedule_interpolation_for_cpu();
//print performance times for printInfo == 1
//do no print for printInfo == 0
void test_performance_cpu(int printInfo);
void test_performance_gpu(int printInfo);
void debug();
protected:
//Final outputs of the planes without using a tuple
Func image_output, variance_output, mask_output;
};
#endif //__GENERAL_KERNEL_H__