-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspectralnorm.gpp-6.c++
136 lines (110 loc) · 3.54 KB
/
spectralnorm.gpp-6.c++
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// Original C contributed by Sebastien Loisel
// Conversion to C++ by Jon Harrop
// OpenMP parallelize by The Anh Tran
// Add SSE by The Anh Tran
// Additional SSE optimization by Krzysztof Jakubowski
// g++ -pipe -O3 -march=native -fopenmp -mfpmath=sse -msse2 \
// ./spec.c++ -o ./spec.run
#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <sched.h>
#include <omp.h>
#include <emmintrin.h>
template <bool modei> int Index(int i, int j) {
return (((i + j) * (i + j + 1)) >> 1) + (modei? i : j) + 1;
}
template <bool modei>
void EvalPart(double *__restrict__ src, double *__restrict__ dst,
int begin, int end, int length) {
int i = begin;
for(; i + 1 < end; i += 2) {
__m128d sum = _mm_set_pd(
src[0] / double(Index<modei>(i + 1, 0)),
src[0] / double(Index<modei>(i + 0, 0)));
__m128d ti = modei?
_mm_set_pd(i + 1, i + 0) :
_mm_set_pd(i + 2, i + 1);
__m128d last = _mm_set_pd(
Index<modei>(i + 1, 0),
Index<modei>(i + 0, 0));
for(int j = 1; j < length; j++) {
__m128d idx = last + ti + _mm_set1_pd(j);
last = idx;
sum = sum + _mm_set1_pd(src[j]) / idx;
}
_mm_storeu_pd(dst + i + 0, sum);
}
for(; i < end; i++) {
double sum = 0;
for (int j = 0; j < length; j++)
sum += src[j] / double(Index<modei>(i, j));
dst[i] = sum;
}
}
void EvalATimesU(double *src, double *dst, int begin, int end, int N) {
EvalPart<1>(src, dst, begin, end, N);
}
void EvalAtTimesU(double *src, double *dst, int begin, int end, int N) {
EvalPart<0>(src, dst, begin, end, N);
}
void EvalAtATimesU(double *src, double *dst, double *tmp,
int begin, int end, int N) {
EvalATimesU (src, tmp, begin, end, N);
#pragma omp barrier
EvalAtTimesU(tmp, dst, begin, end, N);
#pragma omp barrier
}
int GetThreadCount() {
cpu_set_t cs;
CPU_ZERO(&cs);
sched_getaffinity(0, sizeof(cs), &cs);
int count = 0;
for (int i = 0; i < CPU_SETSIZE; ++i)
if (CPU_ISSET(i, &cs))
++count;
return count;
}
double spectral_game(int N) {
__attribute__((aligned(16))) double u[N];
__attribute__((aligned(16))) double v[N], tmp[N];
double vBv = 0.0;
double vv = 0.0;
#pragma omp parallel default(shared) num_threads(GetThreadCount())
{
// this block will be executed by NUM_THREADS
// variable declared in this block is private for each thread
int threadid = omp_get_thread_num();
int threadcount = omp_get_num_threads();
int chunk = N / threadcount;
// calculate each thread's working range [r1 .. r2) => static schedule
int begin = threadid * chunk;
int end = (threadid < (threadcount -1)) ? (begin + chunk) : N;
for(int i = begin; i < end; i++)
u[i] = 1.0;
#pragma omp barrier
for (int ite = 0; ite < 10; ++ite) {
EvalAtATimesU(u, v, tmp, begin, end, N);
EvalAtATimesU(v, u, tmp, begin, end, N);
}
double sumvb = 0.0, sumvv = 0.0;
for (int i = begin; i < end; i++) {
sumvv += v[i] * v[i];
sumvb += u[i] * v[i];
}
#pragma omp critical
{
vBv += sumvb;
vv += sumvv;
}
}
return sqrt(vBv / vv);
}
int main(int argc, char *argv[]) {
int N = ((argc >= 2) ? atoi(argv[1]) : 2000);
printf("%.9f\n", spectral_game(N));
return 0;
}