PyCUDABackProjectionKernel.cu
/* vim: set tabstop=4 shiftwidth=4 expandtab: */
#include "cuda.h" /* CUDA */
/***
 * Preprocessor constants and macros
 * **/
#define CLIGHT 299792458.0f /* c: speed of light, m/s */
#define PI 3.141592653589793116f /* pi (double-precision digits; stored as a single-precision float) */
#define PI2 6.283185307179586232f /* 2*pi */
#define PI_4__CLIGHT (4.0f * PI / CLIGHT)
#define REAL(vec) (vec.x)
#define IMAG(vec) (vec.y)
/* The following macros are defined for common arithmetic operations. We use
 * the __f*_rn() CUDA intrinsics to keep the compiler from contracting these
 * operations into FMADs, whose implementation truncates the intermediate
 * result between the multiply and the add, giving us less accurate results.
 */
#define CAREFUL_AMINUSB_SQ(x,y) __fmul_rn(__fadd_rn((x), -1.0f*(y)), __fadd_rn((x), -1.0f*(y)))
#define MAKERADIUS(xpixel,ypixel, xa,ya,za) sqrtf(CAREFUL_AMINUSB_SQ(xpixel, xa) + CAREFUL_AMINUSB_SQ(ypixel, ya) + __fmul_rn(za, za))
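/* CAREFUL_AMINUSB_SQ(x, y) evaluates (x - y)^2 with an explicit round-to-nearest
 * add and multiply, so it cannot be contracted into an FMAD; MAKERADIUS() then
 * forms sqrt((xpixel-xa)^2 + (ypixel-ya)^2 + za^2), the slant range used in the
 * kernel below (the image plane is taken at z = 0, so only za contributes in z). */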
#define BLOCKWIDTH 16
#define BLOCKHEIGHT 16
/***
* Type defs
* ***/
typedef float FloatType; /* FIXME: this should be used everywhere */
/***
* Prototypes
* ***/
__device__ float2 expjf(float in);
__device__ float2 expjf_div_2(float in);
/* Complex textures containing range profiles */
texture<float2, 2, cudaReadModeElementType> tex_projections;
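/* Layout (as addressed by the tex2D() lookup in the kernel): the x coordinate
 * is the projection/pulse index and the y coordinate is the range bin; the
 * +0.5f offsets sample texel centers. Range interpolation is assumed to come
 * from the texture's filter mode, which is configured on the host side. */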
/* Main kernel.
* */
__global__ void backprojection_loop(float2 * full_image,
int Nphi, int IMG_HEIGHT, float delta_pixel_x, float delta_pixel_y,
int PROJ_LENGTH,
float * PI_4_F0__CLIGHT,
float LEFT, float BOTTOM,
float4 * PLATFORM_INFO,
float rmin, float rmax) {
float2 subimage = make_float2(0.0f, 0.0f); /* Accumulated pixel value */
float2 csum = make_float2(0.0f, 0.0f);     /* Kahan compensation (real, imag) */
float y, t;                                /* Kahan scratch variables */
float xpixel = LEFT + (float)(blockIdx.x * BLOCKWIDTH + threadIdx.x) *
delta_pixel_x;
float ypixel = BOTTOM + (float)(blockIdx.y * BLOCKHEIGHT + threadIdx.y) *
delta_pixel_y;
float2 texel;
/* proj_num and platform are rewritten every loop iteration, so they must live
 * in per-thread registers; declaring them __shared__ would make every thread
 * race on the same block-wide copy. */
int proj_num;
float4 platform;
/* The remaining values are uniform across the block; every thread writes the
 * same result, so the redundant stores are harmless. */
__shared__ int copyblock;
__shared__ float delta_r;
delta_r = rmax - rmin;
__shared__ float Nl1_dr;
Nl1_dr = __fdiv_rn((float)PROJ_LENGTH - 1.0f, delta_r);
copyblock = (blockIdx.y * BLOCKHEIGHT) * IMG_HEIGHT + blockIdx.x * BLOCKWIDTH;
/* Now, let's loop through these projections!
* */
#pragma unroll 3
for (proj_num=0; proj_num < Nphi; ++proj_num) {
platform = PLATFORM_INFO[proj_num];
/* R = sqrt(sum_{# in xyz} [#pixel - #platform]^2):
 * the slant range from the platform to this pixel.
 */
/* // This is what's *really* happening.
float R = sqrtf(
(xpixel - platform.x) *
(xpixel - platform.x) +
(ypixel - platform.y) *
(ypixel - platform.y) +
platform.z * platform.z);*/
float R = MAKERADIUS(xpixel, ypixel, platform.x, platform.y, platform.z);
/* Per-pixel-projection phasor = exp(1j 4 pi/c * f_min * R). */
// Faster, less accurate:
// float2 pixel_scale = expjf_div_2(PI_4_F0__CLIGHT[proj_num] * R * 0.5f);
// Slower, more accurate:
float2 pixel_scale = expjf(PI_4_F0__CLIGHT[proj_num] * R);
/* The fractional range bin for this pixel, this projection. Implements
* effective_idx = Nl1_dr*( R-platform.w-rmin )
*/
float effective_idx = __fmul_rn(Nl1_dr , __fadd_rn(__fadd_rn(R, -1.0f*platform.w), -1.0f*rmin));
/* This is the interpolated range profile element for this pulse */
texel = tex2D(tex_projections, 0.5f+(float)proj_num, 0.5f+effective_idx);
/* Scale "texel" by "pixel_scale".
The RHS of these 2 lines just implement complex multiplication.
*/
/*
subimage.x += REAL(texel)*REAL(pixel_scale) -
IMAG(texel)*IMAG(pixel_scale);
subimage.y += REAL(texel)*IMAG(pixel_scale) +
IMAG(texel)*REAL(pixel_scale);
*/
// I'm experimenting with compensated (Kahan) summation to try and reduce
// rounding errors. See http://en.wikipedia.org/wiki/Kahan_summation_algorithm
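// For reference, the canonical Kahan update for one new term looks like this
// (a sketch of the textbook pattern, not project code):
//
//     y = term - c;        // fold in the error carried from the last step
//     t = sum + y;         // low-order bits of y may be lost here
//     c = (t - sum) - y;   // recover exactly what was lost
//     sum = t;             // new running total
//
// The four updates below apply this pattern term-by-term to the real and
// imaginary parts of the complex multiply-accumulate, with csum.x and csum.y
// as the compensation terms.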
y = REAL(texel)*REAL(pixel_scale) - REAL(csum);
t = subimage.x + y;
csum.x = (t-subimage.x) - y;
subimage.x = t;
y = -1.0f*IMAG(texel)*IMAG(pixel_scale) - REAL(csum);
t = subimage.x + y;
csum.x = (t-subimage.x) - y;
subimage.x = t;
y = REAL(texel)*IMAG(pixel_scale) - IMAG(csum);
t = subimage.y + y;
csum.y = (t-subimage.y) - y;
subimage.y = t;
y = IMAG(texel)*REAL(pixel_scale) - IMAG(csum);
t = subimage.y + y;
csum.y = (t-subimage.y) - y;
subimage.y = t;
}
/* Copy this thread's pixel back to global memory */
//full_image[(blockIdx.y * BLOCKHEIGHT + threadIdx.y) * IMG_HEIGHT +
// blockIdx.x * BLOCKWIDTH + threadIdx.x] = subimage;
full_image[copyblock + (threadIdx.y) * IMG_HEIGHT + threadIdx.x] = subimage;
}
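/* Usage note: in this project the kernel is launched from PyCUDA, but for
 * illustration a CUDA C++ launch would look roughly like the sketch below.
 * It assumes one thread per pixel, BLOCKWIDTH x BLOCKHEIGHT thread blocks
 * tiling an image whose dimensions are multiples of 16, the range profiles
 * already bound to tex_projections, and hypothetical device pointers
 * (d_image, d_pi4f0_c, d_platform) and image-size name (IMG_WIDTH).
 *
 *     dim3 block(BLOCKWIDTH, BLOCKHEIGHT);
 *     dim3 grid(IMG_WIDTH / BLOCKWIDTH, IMG_HEIGHT / BLOCKHEIGHT);
 *     backprojection_loop<<<grid, block>>>(d_image, Nphi, IMG_HEIGHT,
 *             delta_pixel_x, delta_pixel_y, PROJ_LENGTH, d_pi4f0_c,
 *             LEFT, BOTTOM, d_platform, rmin, rmax);
 */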
/* Credits: from BackProjectionKernal.c: "originally by reinke".
* Given a float X, returns float2 Y = exp(j * X).
*
* __device__ code is always inlined. */
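/* Math note: with t = tan(x/2), the tangent half-angle identities give
 *   cos(x) = (1 - t^2) / (1 + t^2)  and  sin(x) = 2t / (1 + t^2),
 * so with tb = t^2 + 1,
 *   exp(j*x) = (2 - tb)/tb + j * (2t)/tb,
 * which is exactly what expjf() and expjf_div_2() compute. */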
__device__
float2 expjf(float in) {
float2 out;
float t, tb;
t = tan(in / 2.0f); /* accurate math-library tangent (cf. the fast __tanf() below) */
tb = t*t + 1.0f;
out.x = (2.0f - tb) / tb; /* Real */
out.y = (2.0f * t) / tb; /* Imag */
return out;
}
/* This is derived from the above function, but is way faster because it uses
* hardware __tanf(). This is allowable only if the argument to it is
* "small"---within +/- 2\pi.
*
* Given float X, returns exp(j * 2*X).
*/
__device__
float2 expjf_div_2(float in) {
float2 out;
float t, tb;
//t = __tanf(in - (float)((int)(in/(PI2)))*PI2 );
t = __tanf(in - PI * rintf(in/PI) );
tb = t*t + 1.0f;
out.x = (2.0f - tb) / tb; /* Real */
out.y = (2.0f * t) / tb; /* Imag */
return out;
}