Skip to content

Commit

Permalink
Move performance test to a separate app
Browse files Browse the repository at this point in the history
  • Loading branch information
dsharlet committed Dec 29, 2023
1 parent a789645 commit 05198b9
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 112 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ jobs:

- name: Build and test
run: make test -j8

- name: Build and run performance app
run: make bin/performance && bin/performance
47 changes: 47 additions & 0 deletions apps/benchmark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#ifndef SLINKY_APPS_BENCHMARK_H
#define SLINKY_APPS_BENCHMARK_H

#include <chrono>
#include <cmath>

namespace slinky {

// Benchmark a call.
//
// Calls `op` once as a warm-up, then times batches of back-to-back calls. The
// batch size starts at 1 and grows by at most 10x per trial until one batch
// runs for longer than `min_time_s`, or `max_trials` batches have been timed.
//
// Parameters:
//   op          callable taking no arguments.
//   min_time_s  a batch must take longer than this (in seconds) to end the search early.
//   max_trials  maximum number of timed batches.
//
// Returns the average time of one call to `op`, in seconds, measured over the
// last batch that was run (0 if `max_trials` is not positive).
template <class F>
double benchmark(F op, double min_time_s = 0.5, int max_trials = 10) {
  // steady_clock is monotonic. high_resolution_clock may be an alias of
  // system_clock, which can be adjusted while a measurement is in flight.
  using clock = std::chrono::steady_clock;

  // Warm-up call, not timed.
  op();

  double time_per_iteration_s = 0;
  long iterations = 1;
  for (int trial = 0; trial < max_trials; ++trial) {
    const auto t1 = clock::now();
    for (long j = 0; j < iterations; ++j) {
      op();
    }
    const auto t2 = clock::now();

    const double elapsed_s = std::chrono::duration<double>(t2 - t1).count();
    time_per_iteration_s = elapsed_s / iterations;
    if (elapsed_s > min_time_s) {
      break;
    }

    // Aim for a batch lasting twice the minimum time, but never shrink the
    // batch and never grow it by more than 10x in one step.
    long next_iterations = iterations * 10;
    if (time_per_iteration_s > 0) {
      // Compare in floating point first: when the clock reports a tiny (or
      // zero) time the target is huge, and converting it to long would be
      // undefined behavior.
      const double target = std::ceil((min_time_s * 2) / time_per_iteration_s);
      if (target < static_cast<double>(next_iterations)) {
        const long target_iterations = static_cast<long>(target);
        next_iterations = target_iterations > iterations ? target_iterations : iterations;
      }
    }
    iterations = next_iterations;
  }
  return time_per_iteration_s;
}

// Tricks the compiler into not stripping away dead objects.
//
// noinline on its own is not sufficient: a call to a function without side
// effects may still be optimized away. The empty asm statement receives the
// object's address and declares a memory clobber, so the compiler has to keep
// both the call and the object it refers to.
template <class T>
__attribute__((noinline)) void assert_used(const T& x) {
  const volatile void* address = &x;
  __asm__ __volatile__("" : : "r"(address) : "memory");
}

// Tricks the compiler into not constant folding the result of x.
//
// Returns `x` unchanged. noinline on its own does not stop interprocedural
// optimizations from seeing through the call, so the address of `x` is handed
// to an empty asm statement with a memory clobber: the compiler must then
// assume `x` may have been modified before it is returned.
template <class T>
__attribute__((noinline)) T not_constant(T x) {
  const volatile void* address = &x;
  __asm__ __volatile__("" : : "r"(address) : "memory");
  return x;
}

} // namespace slinky

#endif // SLINKY_APPS_BENCHMARK_H
92 changes: 92 additions & 0 deletions apps/performance.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#include "pipeline.h"
#include "benchmark.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <iostream>

using namespace slinky;

// Copy from input to output.
//
// Copies the region described by `out`: for every index of dimension 1 of
// `out`, one row of `out.dim(0).extent()` elements is copied from `in`,
// starting at the minimum of both dimensions of `out`.
// NOTE(review): each row is copied as one contiguous run of bytes, which
// assumes dimension 0 is dense in both buffers - confirm with callers.
// Always returns 0.
// TODO: We should be able to just do this with raw_buffer and not make it a template.
template <typename T>
index_t copy(const buffer<const T>& in, const buffer<T>& out) {
  const T* src_elem = &in(out.dim(0).min(), out.dim(1).min());
  T* dst_elem = &out(out.dim(0).min(), out.dim(1).min());

  // The row size and the strides are expressed in bytes, so the pointers must
  // advance in bytes too. Doing this arithmetic on T* is only correct when
  // sizeof(T) == 1.
  const char* src = reinterpret_cast<const char*>(src_elem);
  char* dst = reinterpret_cast<char*>(dst_elem);
  const std::size_t row_bytes = out.dim(0).extent() * out.elem_size;
  for (index_t y = out.dim(1).begin(); y < out.dim(1).end(); ++y) {
    std::copy(src, src + row_bytes, dst);
    dst += out.dim(1).stride_bytes();
    src += in.dim(1).stride_bytes();
  }
  return 0;
}

// Builds a pipeline with one 2D input buffer "in" and one 2D output buffer
// "out" (both with char-sized elements), connected by a single func that
// calls copy<char>. When `explicit_y` is true, the func is given an explicit
// loop over y; otherwise no loops are requested.
// The pipeline is built with checks disabled.
pipeline make_pipeline(bool explicit_y) {
  node_context ctx;

  auto in = buffer_expr::make(ctx, "in", sizeof(char), 2);
  auto out = buffer_expr::make(ctx, "out", sizeof(char), 2);

  expr x = make_variable(ctx, "x");
  expr y = make_variable(ctx, "y");

  // Each output point (x, y) reads the same point of the input.
  func copy_rows = func::make<const char, char>(::copy<char>, {in, {point(x), point(y)}}, {out, {x, y}});
  if (explicit_y) {
    copy_rows.loops({y});
  }

  return pipeline(ctx, {in}, {out}, build_options{.no_checks = true});
}

// Measures the throughput of copying a 2D char buffer through two pipelines,
// one built with an explicit loop over y and one without, for several total
// buffer sizes and row ("copy") sizes. Results are printed to stdout as
// comma separated rows.
//
// Returns EXIT_FAILURE if a pipeline does not reproduce the input in the
// output buffer, and 0 otherwise.
int main() {
  pipeline loop = make_pipeline(true);
  pipeline no_loop = make_pipeline(false);

  const int total_sizes[] = {32, 128, 1024, 4096};
  const int copy_sizes[] = {2, 4, 16, 64};

  std::cout << std::endl;
  for (int total_size : total_sizes) {
    std::cout << "total size (KB): " << total_size << std::endl;
    total_size *= 1024;

    std::cout << "copy size (KB), loop (GB/s), no loop (GB/s), ratio" << std::endl;
    for (int copy_size : copy_sizes) {
      const int copy_size_kb = copy_size;
      copy_size *= 1024;

      // Skip rows larger than the whole buffer before printing anything, so
      // the output never contains a partial row.
      if (total_size < copy_size) continue;
      std::cout << copy_size_kb << ", ";

      buffer<char, 2> in_buf({copy_size, total_size / copy_size});
      buffer<char, 2> out_buf({copy_size, total_size / copy_size});
      in_buf.allocate();
      out_buf.allocate();
      for (index_t i = 0; i < total_size; ++i) {
        in_buf.base()[i] = std::rand() % 64;
      }

      const raw_buffer* inputs[] = {&in_buf};
      const raw_buffer* outputs[] = {&out_buf};

      // Checked explicitly rather than with assert(), so the verification
      // still runs when the app is built with NDEBUG.
      const auto output_matches_input = [&]() {
        return std::memcmp(out_buf.base(), in_buf.base(), total_size) == 0;
      };

      std::memset(out_buf.base(), 0, total_size);
      const double loop_t = benchmark([&]() { loop.evaluate(inputs, outputs); });
      if (!output_matches_input()) {
        std::cerr << std::endl << "loop pipeline produced incorrect output" << std::endl;
        return EXIT_FAILURE;
      }
      std::cout << total_size / (loop_t * 1e9) << ", ";

      std::memset(out_buf.base(), 0, total_size);
      const double no_loop_t = benchmark([&]() { no_loop.evaluate(inputs, outputs); });
      if (!output_matches_input()) {
        std::cerr << std::endl << "no loop pipeline produced incorrect output" << std::endl;
        return EXIT_FAILURE;
      }
      std::cout << total_size / (no_loop_t * 1e9) << ", ";

      std::cout << no_loop_t / loop_t << std::endl;
    }
    std::cout << std::endl;
  }

  return 0;
}
2 changes: 0 additions & 2 deletions src/pipeline.cc
Original file line number Diff line number Diff line change
Expand Up @@ -328,8 +328,6 @@ stmt build_pipeline(node_context& ctx, const std::vector<buffer_expr_ptr>& input
result = remove_checks().mutate(result);
}

print(std::cerr, result, &ctx);

return result;
}

Expand Down
17 changes: 1 addition & 16 deletions test/funcs.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,7 @@ void for_each_index(const raw_buffer& b, F&& f) {
for_each_index({b.dims, b.rank}, f);
}

// Copy from input to output.
//
// Copies the region described by `out`: for every index of dimension 1 of
// `out`, one row of `out.dim(0).extent()` elements is copied from `in`,
// starting at the minimum of both dimensions of `out`.
// NOTE(review): each row is copied as one contiguous run of bytes, which
// assumes dimension 0 is dense in both buffers - confirm with callers.
// Always returns 0.
// TODO: We should be able to just do this with raw_buffer and not make it a template.
template <typename T>
index_t copy(const buffer<const T>& in, const buffer<T>& out) {
  const T* src_elem = &in(out.dim(0).min(), out.dim(1).min());
  T* dst_elem = &out(out.dim(0).min(), out.dim(1).min());

  // The row size and the strides are expressed in bytes, so the pointers must
  // advance in bytes too. Doing this arithmetic on T* is only correct when
  // sizeof(T) == 1.
  const char* src = reinterpret_cast<const char*>(src_elem);
  char* dst = reinterpret_cast<char*>(dst_elem);
  const std::size_t row_bytes = out.dim(0).extent() * out.elem_size;
  for (index_t y = out.dim(1).begin(); y < out.dim(1).end(); ++y) {
    std::copy(src, src + row_bytes, dst);
    dst += out.dim(1).stride_bytes();
    src += in.dim(1).stride_bytes();
  }
  return 0;
}

// Like copy, but flips in the y dimension.
// Copy rows, where the output y is -y in the input.
template <typename T>
index_t flip_y(const buffer<const T>& in, const buffer<T>& out) {
assert(in.rank == 2);
Expand Down
60 changes: 0 additions & 60 deletions test/performance.cc

This file was deleted.

34 changes: 0 additions & 34 deletions test/test.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,40 +54,6 @@ class assert_stream {
#define ASSERT_LE(a, b) ASSERT(a <= b) << "\n" << #a << "=" << a << "\n" << #b << "=" << b << " "
#define ASSERT_GE(a, b) ASSERT(a >= b) << "\n" << #a << "=" << a << "\n" << #b << "=" << b << " "

// Benchmark a call.
//
// Calls `op` once as a warm-up, then times batches of back-to-back calls. The
// batch size starts at 1 and grows by at most 10x per trial until one batch
// runs for longer than `min_time_s`, or `max_trials` batches have been timed.
//
// Parameters:
//   op          callable taking no arguments.
//   min_time_s  a batch must take longer than this (in seconds) to end the search early.
//   max_trials  maximum number of timed batches.
//
// Returns the average time of one call to `op`, in seconds, measured over the
// last batch that was run (0 if `max_trials` is not positive).
template <class F>
double benchmark(F op, double min_time_s = 0.5, int max_trials = 10) {
  // steady_clock is monotonic. high_resolution_clock may be an alias of
  // system_clock, which can be adjusted while a measurement is in flight.
  using clock = std::chrono::steady_clock;

  // Warm-up call, not timed.
  op();

  double time_per_iteration_s = 0;
  long iterations = 1;
  for (int trial = 0; trial < max_trials; ++trial) {
    const auto t1 = clock::now();
    for (long j = 0; j < iterations; ++j) {
      op();
    }
    const auto t2 = clock::now();

    const double elapsed_s = std::chrono::duration<double>(t2 - t1).count();
    time_per_iteration_s = elapsed_s / iterations;
    if (elapsed_s > min_time_s) { break; }

    // Aim for a batch lasting twice the minimum time, but never shrink the
    // batch and never grow it by more than 10x in one step.
    long next_iterations = iterations * 10;
    if (time_per_iteration_s > 0) {
      // Compare in floating point first: when the clock reports a tiny (or
      // zero) time the target is huge, and converting it to long would be
      // undefined behavior.
      const double target = std::ceil((min_time_s * 2) / time_per_iteration_s);
      if (target < static_cast<double>(next_iterations)) {
        const long target_iterations = static_cast<long>(target);
        next_iterations = target_iterations > iterations ? target_iterations : iterations;
      }
    }
    iterations = next_iterations;
  }
  return time_per_iteration_s;
}

// Tricks the compiler into not stripping away dead objects.
//
// noinline on its own is not sufficient: a call to a function without side
// effects may still be optimized away. The empty asm statement receives the
// object's address and declares a memory clobber, so the compiler has to keep
// both the call and the object it refers to.
template <class T>
__attribute__((noinline)) void assert_used(const T& x) {
  const volatile void* address = &x;
  __asm__ __volatile__("" : : "r"(address) : "memory");
}

// Tricks the compiler into not constant folding the result of x.
//
// Returns `x` unchanged. noinline on its own does not stop interprocedural
// optimizations from seeing through the call, so the address of `x` is handed
// to an empty asm statement with a memory clobber: the compiler must then
// assume `x` may have been modified before it is returned.
template <class T>
__attribute__((noinline)) T not_constant(T x) {
  const volatile void* address = &x;
  __asm__ __volatile__("" : : "r"(address) : "memory");
  return x;
}

// This type generates compiler errors if it is copied.
struct move_only {
move_only() = default;
Expand Down

0 comments on commit 05198b9

Please sign in to comment.