-
Notifications
You must be signed in to change notification settings - Fork 549
Expand file tree
/
Copy pathtiming.cpp
More file actions
109 lines (99 loc) · 3.31 KB
/
timing.cpp
File metadata and controls
109 lines (99 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/*******************************************************
* Copyright (c) 2014, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
* The complete license agreement can be obtained at:
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
#include <af/device.h>
#include <af/timing.h>
#include <algorithm>
#include <array>
#include <cmath>
#include <limits>
#include <vector>
using namespace af;
// Reads the platform clock and returns the current timestamp.
//
// The result is declared once and value-initialized, so the function returns
// a well-defined (zeroed) timestamp even if the underlying clock call fails
// and leaves its output untouched.
static inline timer time_now() {
    timer now{};
#if defined(OS_WIN)
    QueryPerformanceCounter(&now.val);
#elif defined(OS_MAC)
    now.val = mach_absolute_time();
#elif defined(OS_LNX)
    // Second argument is the (unused) timezone.
    gettimeofday(&now.val, nullptr);
#endif
    return now;
}
// Elapsed time between two timestamps, in seconds.
//
// The result is the absolute difference, so passing the timestamps in either
// order gives the same non-negative value.
static inline double time_seconds(timer start, timer end) {
#if defined(OS_WIN)
    // Order the two readings so the subtraction below cannot go negative.
    const bool reversed  = start.val.QuadPart > end.val.QuadPart;
    const timer &earlier = reversed ? end : start;
    const timer &later   = reversed ? start : end;

    // Counter ticks per second.
    timer frequency;
    QueryPerformanceFrequency(&frequency.val);

    return static_cast<double>(later.val.QuadPart - earlier.val.QuadPart) /
           frequency.val.QuadPart;
#elif defined(OS_MAC)
    // Order the two readings so the subtraction below cannot wrap.
    const bool reversed = start.val > end.val;
    const auto earlier  = reversed ? end.val : start.val;
    const auto later    = reversed ? start.val : end.val;

    // numer/denom scales clock ticks to nanoseconds.
    thread_local mach_timebase_info_data_t timebase;
    mach_timebase_info(&timebase);
    const double nanosPerTick = static_cast<double>(timebase.numer) /
                                static_cast<double>(timebase.denom);

    return (later - earlier) * nanosPerTick * 1e-9;
#elif defined(OS_LNX)
    // timersub yields start - end; the sign is discarded on return.
    struct timeval delta {};
    timersub(&start.val, &end.val, &delta);

    const long wholeSeconds    = delta.tv_sec;
    const long microSeconds    = delta.tv_usec;
    const double signedSeconds = wholeSeconds + microSeconds * 1e-6;
    return signedSeconds < 0 ? -signedSeconds : signedSeconds;
#endif
}
namespace af {
thread_local timer _timer_;
timer timer::start() { return _timer_ = time_now(); }
double timer::stop(timer start) { return time_seconds(start, time_now()); }
double timer::stop() { return time_seconds(_timer_, time_now()); }
double timeit(void (*fn)()) {
// Minimum target duration to limit impact of clock precision
constexpr double targetDurationPerTest = 0.050;
// samples during which the nr of cycles are determined to obtain target
// duration
constexpr int testSamples = 2;
// cycles needed to include CPU-GPU overlapping (if present)
constexpr int minCycles = 3;
// initial cycles used for the test samples
int cycles = minCycles;
// total number of real samples taken, of which the median is returned
constexpr int nrSamples = 10;
std::array<double, nrSamples> X;
for (int s = -testSamples; s < nrSamples; ++s) {
af::sync();
af::timer start = af::timer::start();
for (int i = cycles; i > 0; --i) { fn(); }
af::sync();
const double time = af::timer::stop(start);
if (s >= 0) {
// real sample, so store it for later processing
X[s] = time;
} else {
// test sample, so improve nr cycles
cycles = std::max(
minCycles,
static_cast<int>(trunc(targetDurationPerTest / time * cycles)));
};
}
std::sort(X.begin(), X.end());
// returns the median (iso of mean), to limit impact of outliers
return X[nrSamples / 2] / cycles;
}
} // namespace af