-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtranspose.c
107 lines (96 loc) · 3.77 KB
/
transpose.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/**
* Transpose functions.
*
* @author Connor Imes <[email protected]>
* @author Kaushik Datta <[email protected]>
* @date 2019-07-15
*/
#include <complex.h>
#include <stdlib.h>
#include "transpose.h"
#define TRANSPOSE_BLK(A, B, A_rows, A_cols, r_min, c_min, r_max, c_max) { \
size_t r, c; \
for (r = (r_min); r < (r_max); r++) { \
for (c = (c_min); c < (c_max); c++) { \
(B)[(c) * (A_rows) + (r)] = (A)[(r) * (A_cols) + (c)]; \
} \
} \
}
#define TRANSPOSE_BLOCKED(A, B, A_rows, A_cols, blk_rows, blk_cols) { \
/* take the ceiling of (A_rows / blk_rows) */ \
const size_t n_rblks = (A_rows + blk_rows - 1) / blk_rows; \
/* take the ceiling of (A_cols / blk_cols) */ \
const size_t n_cblks = (A_cols + blk_cols - 1) / blk_cols; \
const size_t n_full_rblks = A_rows / blk_rows; \
const size_t n_full_cblks = A_cols / blk_cols; \
const size_t rblk_remainder = A_rows % blk_rows; \
const size_t cblk_remainder = A_cols % blk_cols; \
size_t rblk_num, cblk_num, r_min, c_min, r_max, c_max; \
/* perform transpose over all blocks (both full and partial) */ \
for (rblk_num = 0; rblk_num < n_rblks; rblk_num++) { \
r_min = rblk_num * blk_rows; \
if (rblk_num < n_full_rblks) { \
r_max = r_min + blk_rows; \
} else { \
r_max = r_min + rblk_remainder; \
} \
for (cblk_num = 0; cblk_num < n_cblks; cblk_num++) { \
c_min = cblk_num * blk_cols; \
if (cblk_num < n_full_cblks) { \
c_max = c_min + blk_cols; \
} else { \
c_max = c_min + cblk_remainder; \
} \
/* perform actual transpose over current block */ \
TRANSPOSE_BLK(A, B, A_rows, A_cols, r_min, c_min, r_max, c_max); \
} \
} \
}
void transpose_flt_naive(const float* restrict A, float* restrict B,
size_t A_rows, size_t A_cols)
{
TRANSPOSE_BLK(A, B, A_rows, A_cols, 0, 0, A_rows, A_cols);
}
void transpose_dbl_naive(const double* restrict A, double* restrict B,
size_t A_rows, size_t A_cols)
{
TRANSPOSE_BLK(A, B, A_rows, A_cols, 0, 0, A_rows, A_cols);
}
void transpose_fcmplx_naive(const float complex* restrict A,
float complex* restrict B,
size_t A_rows, size_t A_cols)
{
TRANSPOSE_BLK(A, B, A_rows, A_cols, 0, 0, A_rows, A_cols);
}
void transpose_dcmplx_naive(const double complex* restrict A,
double complex* restrict B,
size_t A_rows, size_t A_cols)
{
TRANSPOSE_BLK(A, B, A_rows, A_cols, 0, 0, A_rows, A_cols);
}
void transpose_flt_blocked(const float* restrict A, float* restrict B,
size_t A_rows, size_t A_cols,
size_t blk_rows, size_t blk_cols)
{
TRANSPOSE_BLOCKED(A, B, A_rows, A_cols, blk_rows, blk_cols);
}
void transpose_dbl_blocked(const double* restrict A, double* restrict B,
size_t A_rows, size_t A_cols,
size_t blk_rows, size_t blk_cols)
{
TRANSPOSE_BLOCKED(A, B, A_rows, A_cols, blk_rows, blk_cols);
}
void transpose_fcmplx_blocked(const float complex* restrict A,
float complex* restrict B,
size_t A_rows, size_t A_cols,
size_t blk_rows, size_t blk_cols)
{
TRANSPOSE_BLOCKED(A, B, A_rows, A_cols, blk_rows, blk_cols);
}
void transpose_dcmplx_blocked(const double complex* restrict A,
double complex* restrict B,
size_t A_rows, size_t A_cols,
size_t blk_rows, size_t blk_cols)
{
TRANSPOSE_BLOCKED(A, B, A_rows, A_cols, blk_rows, blk_cols);
}