-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_ve_fastdiv.cpp
145 lines (134 loc) · 4.97 KB
/
test_ve_fastdiv.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/* Copyright (c) 2019 by NEC Corporation
* This file is part of ve-jit */
/** \file
* Test of ve_fastdiv.h.  ncc adds a fair amount of "overhead" ops as it
* vectorizes the 3 division algorithms, so much so that the 'bounded' method
* ends up using about the same number of ops as the general fastdiv.  Even so,
* both fastdiv loops are still 4x faster than the compiler's VDIV division :)
*/
#include "ve_fastdiv.h"
#include "timer.h"
#include <iostream>
#include <iomanip>
#include <cassert>
// FTRACE region markers.  FTRACE is honoured only when __ve is defined;
// otherwise a warning is emitted and FTRACE is undefined again, so the
// no-op macro variants below are selected.
#if defined(FTRACE) && !defined(__ve) // FTRACE **only** for VE
#warning "ignoring attempt to use FTRACE with non-VE compiler (ftrace header may be missing)"
#undef FTRACE
#endif
#ifdef FTRACE
#include <ftrace.h>
// With FTRACE: BEG/END forward their arguments to ftrace_region_begin /
// ftrace_region_end, and FTRACE_IF emits its arguments verbatim.
#define FTRACE_BEG(...) ftrace_region_begin(__VA_ARGS__)
#define FTRACE_END(...) ftrace_region_end(__VA_ARGS__)
#define FTRACE_IF(...) __VA_ARGS__
#else
// Without FTRACE: all three macros discard their arguments and expand to an
// empty do{}while(0) statement.
#define FTRACE_BEG(...) do{}while(0)
#define FTRACE_END(...) do{}while(0)
#define FTRACE_IF(...) do{}while(0)
#endif
#ifndef SPEED
/** SPEED 0 means assert correctness, SPEED 1 means timing run.
 * 0 is only the default: it applies when SPEED has not already been
 * defined before this point (e.g. via a -DSPEED=1 compiler option). */
#define SPEED 0
#endif
using namespace std;
/**
 * Test driver for the ve_fastdiv.h divide-by-multiply helpers.
 *
 * For every divisor d in [1, divMax) two (mul, add, shift) triples are built:
 * one with vednn_fastdiv ("general") and one with vednn_fastdiv_bounded
 * ("bounded").  Then, depending on the SPEED macro:
 *  - SPEED == 0: for every num in [0, numMax) assert that both
 *    (num*mul + add) >> shift  and  (num*mul2) >> shift2  equal num/d.
 *  - SPEED != 0: time the plain division loop and both fastdiv loops with
 *    __cycle(), print running cycle totals, and finally convert the totals to
 *    seconds.  No correctness check is done in this mode.
 *
 * NOTE: all checks use assert(), so they are compiled out when NDEBUG is
 * defined; a SPEED==0 build with NDEBUG verifies nothing.
 *
 * Both parameters are unused.  Falls off the end of main (exit status 0).
 */
int main(int,char**){
    // divMax < 2^21-1 for FASTDIV21 alg (bounded divisor case)
    // numMax can be any uint32_t
#if SPEED
    // this code vectorizes, so can cover a bigger range.
    // (but we do not check correctness)
    uint64_t const numMax = 1<<24;
    uint64_t const divMax = 1<<16;
#else
    uint64_t const numMax = 1<<18;
    uint64_t const divMax = 1<<16;
#endif
    // Sink for the timed loops' results; printed at the very end
    // (presumably so the compiler cannot discard the loops -- intent not stated).
    uint64_t bogus = 1234567U;
#if SPEED
    // Accumulated cycle counts per method; only needed for timing runs.
    uint64_t normal_cyc=0ULL;
    uint64_t general_cyc=0ULL;
    uint64_t bounded_cyc=0ULL;
    uint64_t c0; // start-of-region cycle stamp
#endif
    printf("%s %s\n",__FILE__, (SPEED?"(speed)":""));
    fflush(stdout);
    //auto rd1 = [](double const x) {return (int64_t)(x*10.) / 10.;};
    // Truncate to one decimal place (toward zero).
#define rd1(x) (((int64_t)((x)*10.))/10.)
    for(uint64_t d=1U; d<divMax; ++d){
#if SPEED
        FTRACE_BEG("standard div");
        c0 = __cycle();
#if 1 // 64-bit VDIV
        for(uint64_t num=0; num<numMax; ++num){ // num/d~200 s
            bogus += num/d; // uses VDIV (vdivu)
        }
#else // 32-bit VDIV
        uint32_t const d32 = d;
        for(uint32_t num=0; num<numMax; ++num){ // num/d32 still 4x slower
            bogus += num/d32; // uses VDIV (vdivu.w)
        }
#endif
        normal_cyc += __cycle()-c0;
        FTRACE_END("standard div");
#endif
        struct ve_fastdiv fd; // general method
        vednn_fastdiv( &fd, d ); // divide-by-d
        uint64_t const mul = fd.mul;
        uint64_t const add = fd.add;
        uint64_t const shift = fd.shift;
        struct ve_fastdiv fd2; // bounded method optimization
        vednn_fastdiv_bounded( &fd2, d, divMax ); // divide-by-d
        uint64_t const mul2 = fd2.mul;
        uint64_t const add2 = fd2.add;
        uint64_t const shift2 = fd2.shift;
        assert(mul2==1U || add2==0U || shift2==0U); // <= 2 ops jit
        // The bounded loops below omit the add term, so it must be zero.
        assert(add2==0U);
#if SPEED
        FTRACE_BEG("vednn_fastdiv");
        c0 = __cycle();
        for(uint64_t num=0; num<numMax; ++num){
            bogus += (num*mul + add) >> shift;
        }
        general_cyc += __cycle()-c0;
        FTRACE_END("vednn_fastdiv");
        FTRACE_BEG("vednn_fastdiv_bounded");
        c0 = __cycle();
        for(uint64_t num=0; num<numMax; ++num){
            // Use the bounded triple's own shift2 (not the general 'shift'),
            // matching the expression verified in the correctness branch.
            bogus += (num*mul2) >> shift2;
        }
        bounded_cyc += __cycle()-c0;
        FTRACE_END("vednn_fastdiv_bounded");
        // Progress line roughly every 1% of the divisor range, and on the last divisor.
        if(d % (divMax/100U) == 0U || d+1U == divMax ){
            cout<<(d * 100 / divMax)<<"% done (d="<<setw(8)<<d<<") ";
            cout<<" cycles: normal "<<setw(17)<<(double)normal_cyc
                <<" general "<<setw(17)<<(double)general_cyc
                <<" bounded "<<setw(17)<<(double)bounded_cyc
                <<endl;
            // x86 cycles: normal > general > bounded
            // ve: normal > general < ~= bounded
            // 99% done (d= 65535) cycles: normal 1.74265e+11 general 3.21873e+10 bounded 3.22145e+10
            // Finished unsigned n/d tests for n < 16777216 and d < 65536
            // time(s): normal 198 general 36.5 bounded 36.6
            //
        }
#else
        // Correctness run: both fastdiv forms must reproduce num/d exactly.
        for(uint64_t num=0; num<numMax; ++num){
            uint64_t const normal = num/d;
            uint64_t const general = (num * mul + add) >> shift;
            uint64_t const bounded = (num * mul2 ) >> shift2;
            assert( general == normal );
            assert( bounded == normal );
        }
#endif
    }
    cout<<" Finished unsigned n/d tests for n < "<<numMax<<" and d < "<<divMax<<endl;
#if SPEED // final conversion to seconds
    double const cyc2s = cycle2ns() * 1.e-9;
    cout<<" time(s): "
        <<" normal "<<setw(17)<<rd1(normal_cyc *cyc2s)
        <<" general "<<setw(17)<<rd1(general_cyc*cyc2s)
        <<" bounded "<<setw(17)<<rd1(bounded_cyc*cyc2s)
        <<endl;
#else
    cout<<" All divisions were correct"<<endl;
#endif
    cout<<"\nGoodbye: bogus="<<bogus<<endl;
}
// vim: ts=4 sw=4 et cindent cino=^=l0,\:.5s,=-.5s,N-s,g.5s,b1 cinkeys=0{,0},0),\:,0#,!^F,o,O,e,0=break