Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bench cpp #9

Merged
merged 12 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@ name = "poc-kokkos-rs"
version = "0.1.0"
edition = "2021"

# OPTIMIZATION

[profile.release]
codegen-units = 1
lto = "fat"
panic = "abort"

# FEATURES

[features]
Expand Down Expand Up @@ -34,15 +41,23 @@ cxx-build = "*"
## misc

[[bench]]
name = "layout"
name = "view_init"
harness = false

[[bench]]
name = "view_init"
name = "view_access"
harness = false

## layout

[[bench]]
name = "view_access"
name = "layout-size"
path = "benches/layout/size.rs"
harness = false

[[bench]]
name = "layout-comparison"
path = "benches/layout/comparison.rs"
harness = false

## blas speedup measures
Expand Down
23 changes: 17 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,27 @@ cargo bench --bench bench_name
All results are compiled to the `target/criterion/` folder. The following
benchmarks are available:

- `layout`: Matrix-Vector product computation; This is used to put numbers on the
importance of data layout in memory.
- `view_init`: Compare initialization performances of regular vectors to [Views][view]; This
is used to spot potential scaling issues induced by the more complex structure of Views.
- `view_access`: Compare data access performances of regular vectors to [Views][view]; This
is used to spot potential scaling issues induced by the more complex structure of Views.
**Layout:**
- `layout-comparison`: Benchmark a matrix-matrix product three times, using the worst possible layout,
the usual layout, and then the optimal layout for the operation. This shows the importance of layout
selection for performance.
- `layout-size`: Benchmark a matrix-matrix product using the usual layout and the optimal layout,
over a range of sizes for the square matrices. This shows the influence of cache size on
the importance of layout.

**Computation:**
- `axpy` / `gemv` / `gemm`: Measure speedup on basic BLAS implementations by running the same kernel
in serial mode first, then using parallelization on CPU. _Meant to be executed using features_.
- `hardcoded_gemm`: Compute the same operations as the `gemm` benchmark, but using a hardcoded implementation
instead of methods from the PoC. Used to assess the additional cost induced by the library.

**Library overhead:**
- `view_init`: Compare the initialization performance of regular vectors and [Views][view]. This
is used to spot potential scaling issues induced by the more complex structure of Views.
- `view_access`: Compare the data access performance of regular vectors and [Views][view]. This
is used to spot potential scaling issues induced by the more complex structure of Views.

Additionally, Kokkos equivalents of the BLAS kernels can be found in the `benches/blas-speedup-kokkos/`
subdirectory. These are far from being the most optimized implementations; instead, they are written
as close counterparts to the Rust benchmarks.


### Examples
Expand Down
2 changes: 2 additions & 0 deletions benches/blas-speedup-kokkos/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# target folder
build/
18 changes: 18 additions & 0 deletions benches/blas-speedup-kokkos/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Build script for the Kokkos counterparts of the BLAS benchmarks.
#
# The Kokkos source tree is located through KOKKOS_INSTALL_FOLDER, which can be
# given either as a CMake variable (-DKOKKOS_INSTALL_FOLDER=/path/to/kokkos) or
# as an environment variable of the same name.
cmake_minimum_required(VERSION 3.16)

# set(CMAKE_C_COMPILER "/opt/homebrew/opt/llvm/bin/clang") # uncomment if using homebrew clang
# set(CMAKE_CXX_COMPILER "/opt/homebrew/opt/llvm/bin/clang++") # uncomment if using homebrew clang

# The benchmark sources require C++20; refuse to silently fall back to an
# older standard.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_BUILD_TYPE "Release")

project(KokkosBenchmarks)

# `$NAME` is not a CMake variable reference (only `${NAME}` and `$ENV{NAME}`
# are), so the location has to be resolved explicitly.
if(NOT DEFINED KOKKOS_INSTALL_FOLDER)
  if(DEFINED ENV{KOKKOS_INSTALL_FOLDER})
    set(KOKKOS_INSTALL_FOLDER "$ENV{KOKKOS_INSTALL_FOLDER}")
  else()
    message(FATAL_ERROR
      "KOKKOS_INSTALL_FOLDER is not set; pass -DKOKKOS_INSTALL_FOLDER=<path> "
      "or export it as an environment variable.")
  endif()
endif()

# The Kokkos tree lives outside of this source tree, hence the explicit
# binary directory (dep/kokkos).
add_subdirectory("${KOKKOS_INSTALL_FOLDER}" dep/kokkos) # add kokkos files dep

add_executable(gemm gemm.cpp)
add_executable(gemv gemv.cpp)
add_executable(axpy axpy.cpp)

target_link_libraries(gemm Kokkos::kokkos)
target_link_libraries(gemv Kokkos::kokkos)
target_link_libraries(axpy Kokkos::kokkos)
75 changes: 75 additions & 0 deletions benches/blas-speedup-kokkos/axpy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#include <execution>

#include <Kokkos_Core.hpp>
#include <cstdio>
#include <cstdint>
#include <cmath>
#include <random>

#define DATA_SIZE 20
#define N_REPEAT 100


// AXPY benchmark: runs y = alpha * x + y on two vectors of 2^DATA_SIZE doubles.
//
// The kernel is executed N_REPEAT times; the duration of each run is printed,
// followed by the average duration and its standard deviation (in seconds).
int main( int argc, char* argv[] )
{
    Kokkos::initialize(argc, argv);
    {
        // Readability
        using Vec = Kokkos::View<double*, Kokkos::LayoutLeft>; // technically layout doesn't matter here

        // declare data
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<> dis(0.0, 1.0);
        // exact integer power of two; avoids a round trip through double
        const uint64_t length = uint64_t{1} << DATA_SIZE;

        Vec x("vecx", length);
        Vec y("vecy", length);

        // fill with rand doubles
        const double alpha = dis(gen);
        // need to parallelize this if possible
        Kokkos::parallel_for("inits", length, KOKKOS_LAMBDA(const uint64_t ii) {
            // each iteration builds its own generator: the host-side one
            // declared above is deliberately not shared between iterations
            std::random_device rd;
            std::mt19937 gen(rd());
            std::uniform_real_distribution<> dis(0.0, 1.0);
            x(ii) = dis(gen);
            y(ii) = dis(gen);
        });
        // make sure initialization is over before the first timed run
        Kokkos::fence();

        // run the kernel N_REPEAT times

        std::chrono::duration<double> times[N_REPEAT];
        for (int idx = 0; idx < N_REPEAT; idx++) {

            const auto start{std::chrono::steady_clock::now()}; // start timer
            Kokkos::parallel_for("AXPY kernel", length, KOKKOS_LAMBDA(const uint64_t i) {
                // assign to y
                y(i) = alpha * x(i) + y(i);
            });
            // parallel_for may return before the kernel has completed,
            // depending on the backend; wait for it so that the measured
            // interval covers the whole kernel
            Kokkos::fence();
            const auto end{std::chrono::steady_clock::now()}; // end timer

            times[idx] = {end - start}; // save duration
            std::cout << "iteration " << idx << ": " << times[idx] << '\n'; // print duration
        }

        // process times
        double avg = 0.0;
        for (const auto& t : times) {
            avg += t.count();
        }
        avg /= static_cast<double>(N_REPEAT);
        printf("average time: %fs\n", avg);

        // population variance of the measured durations
        double variance = 0.0;
        for (const auto& t : times) {
            const double delta = t.count() - avg;
            variance += delta * delta;
        }
        variance /= static_cast<double>(N_REPEAT);
        const double stddev = sqrt(variance);
        printf("standard deviation: %.5fs\n", stddev);

    }
    Kokkos::finalize();
}
4 changes: 4 additions & 0 deletions benches/blas-speedup-kokkos/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
#
# Configure and build the Kokkos benchmarks (axpy, gemv, gemm) into build/,
# with the Kokkos OpenMP backend enabled.
# Must be run from the directory containing CMakeLists.txt.

# Abort on the first failing command, so that a failed configure step is not
# followed by a build of a missing or stale build tree.
set -euo pipefail

cmake -DKokkos_ENABLE_OPENMP=ON -B build/
cmake --build build --parallel
95 changes: 95 additions & 0 deletions benches/blas-speedup-kokkos/gemm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// THIS CODE IS MADE FOR COMPILATION USING A PROPER KOKKOS SETUP.
// REQUIRES C++20
// COMPILE USING OPENMP BACKEND TO HAVE SOMETHING COMPARABLE TO RAYON
//
// This file is here in order to provide a comparable implementation of the blas
// benchmarks using the Kokkos library. It is not by any means the best way to
// write such kernels.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <execution>
#include <random>

#include <Kokkos_Core.hpp>

#define DATA_SIZE 10
#define N_REPEAT 100

// GEMM benchmark: runs C = alpha * A * B + beta * C on square matrices of
// dimension 2^DATA_SIZE. A and C use LayoutRight, B uses LayoutLeft.
//
// The kernel is executed N_REPEAT times; the duration of each run is printed,
// followed by the average duration and its standard deviation (in seconds).
int main( int argc, char* argv[] )
{
    Kokkos::initialize(argc, argv);
    {
        // Readability
        using MatRight = Kokkos::View<double**, Kokkos::LayoutRight>;
        using MatLeft  = Kokkos::View<double**, Kokkos::LayoutLeft>;

        // declare data

        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<> dis(0.0, 1.0);
        // exact integer power of two; avoids a round trip through double
        const uint64_t length = uint64_t{1} << DATA_SIZE;

        // runtime dims
        MatRight A("matA", length, length);
        MatLeft B("matB", length, length);
        MatRight C("matC", length, length);

        // fill with rand doubles
        const double alpha = dis(gen);
        const double beta = dis(gen);
        // NOTE(review): the views are written from a serial host loop, which
        // assumes they are host-accessible (e.g. OpenMP backend) -- confirm
        // before enabling another backend.
        // Indices use the same unsigned type as `length` to avoid a
        // signed/unsigned comparison.
        for (uint64_t ii = 0; ii < length; ii++) {
            for (uint64_t jj = 0; jj < length; jj++) {
                A(ii,jj) = dis(gen);
                B(ii,jj) = dis(gen);
                C(ii,jj) = dis(gen);
            }
        }

        // run the kernel N_REPEAT times

        std::chrono::duration<double> times[N_REPEAT];
        for (int idx = 0; idx < N_REPEAT; idx++) {

            const auto start{std::chrono::steady_clock::now()}; // start timer
            Kokkos::parallel_for("GEMM kernel", length, KOKKOS_LAMBDA(const uint64_t i) {
                for (uint64_t j = 0; j < length; j++) {
                    // this computation is the most costly part of the kernel
                    // trying to turn this into a proper reduction significantly
                    // obfuscates the code.
                    // I think this is pretty interesting since it can be done with
                    // decent performance using just one line in Rust.
                    double AB_ij = 0.0;
                    for (uint64_t k = 0; k < length; k++) { AB_ij += A(i,k) * B(k,j); }
                    // assign to C
                    C(i, j) = alpha * AB_ij + beta * C(i, j);
                }
            });
            // parallel_for may return before the kernel has completed,
            // depending on the backend; wait for it so that the measured
            // interval covers the whole kernel
            Kokkos::fence();
            const auto end{std::chrono::steady_clock::now()}; // end timer

            times[idx] = {end - start}; // save duration
            std::cout << "iteration " << idx << ": " << times[idx] << '\n'; // print duration
        }

        // process times
        double avg = 0.0;
        for (const auto& t : times) {
            avg += t.count();
        }
        avg /= static_cast<double>(N_REPEAT);
        printf("average time: %fs\n", avg);

        // population variance of the measured durations
        double variance = 0.0;
        for (const auto& t : times) {
            const double delta = t.count() - avg;
            variance += delta * delta;
        }
        variance /= static_cast<double>(N_REPEAT);
        const double stddev = sqrt(variance);
        printf("standard deviation: %.5fs\n", stddev);

    }
    Kokkos::finalize();
}
93 changes: 93 additions & 0 deletions benches/blas-speedup-kokkos/gemv.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// THIS CODE IS MADE FOR COMPILATION USING A PROPER KOKKOS SETUP.
// REQUIRES C++20
// COMPILE USING OPENMP BACKEND TO HAVE SOMETHING COMPARABLE TO RAYON
//
// This file is here in order to provide a comparable implementation of the blas
// benchmarks using the Kokkos library. It is not by any means the best way to
// write such kernels.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <execution>
#include <random>

#include <Kokkos_Core.hpp>

#define DATA_SIZE 15
#define N_REPEAT 100


// GEMV benchmark: runs y = alpha * A * x + beta * y with a square LayoutRight
// matrix of dimension 2^DATA_SIZE and two vectors of the same length.
//
// The kernel is executed N_REPEAT times; the duration of each run is printed,
// followed by the average duration and its standard deviation (in seconds).
int main( int argc, char* argv[] )
{
    Kokkos::initialize(argc, argv);
    {
        // Readability
        using MatRight  = Kokkos::View<double**, Kokkos::LayoutRight>;
        using VecColumn = Kokkos::View<double*, Kokkos::LayoutLeft>; // technically layout doesn't matter here

        // declare data
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<> dis(0.0, 1.0);
        // exact integer power of two; avoids a round trip through double
        const uint64_t length = uint64_t{1} << DATA_SIZE;

        MatRight A("matA", length, length);
        VecColumn x("vecx", length);
        VecColumn y("vecy", length);

        // fill with rand doubles
        const double alpha = dis(gen);
        const double beta = dis(gen);
        // need to parallelize this if possible
        Kokkos::parallel_for("inits", length, KOKKOS_LAMBDA(const uint64_t ii) {
            // each iteration builds its own generator, used for x(ii), y(ii)
            // and the whole row ii of A: the host-side one declared above is
            // deliberately not shared between iterations
            std::random_device rd;
            std::mt19937 gen(rd());
            std::uniform_real_distribution<> dis(0.0, 1.0);
            x(ii) = dis(gen);
            y(ii) = dis(gen);
            // index uses the same unsigned type as `length` to avoid a
            // signed/unsigned comparison
            for (uint64_t jj = 0; jj < length; jj++) {
                A(ii,jj) = dis(gen);
            }
        });
        // make sure initialization is over before the first timed run
        Kokkos::fence();

        // run the kernel N_REPEAT times

        std::chrono::duration<double> times[N_REPEAT];
        for (int idx = 0; idx < N_REPEAT; idx++) {

            const auto start{std::chrono::steady_clock::now()}; // start timer
            Kokkos::parallel_for("GEMV kernel", length, KOKKOS_LAMBDA(const uint64_t i) {
                // compute (A*x)(i)
                double Ax_i = 0.0;
                for (uint64_t j = 0; j < length; j++) { Ax_i += A(i, j) * x(j); }
                // assign to y
                y(i) = alpha * Ax_i + beta * y(i);
            });
            // parallel_for may return before the kernel has completed,
            // depending on the backend; wait for it so that the measured
            // interval covers the whole kernel
            Kokkos::fence();
            const auto end{std::chrono::steady_clock::now()}; // end timer

            times[idx] = {end - start}; // save duration
            std::cout << "iteration " << idx << ": " << times[idx] << '\n'; // print duration
        }

        // process times
        double avg = 0.0;
        for (const auto& t : times) {
            avg += t.count();
        }
        avg /= static_cast<double>(N_REPEAT);
        printf("average time: %fs\n", avg);

        // population variance of the measured durations
        double variance = 0.0;
        for (const auto& t : times) {
            const double delta = t.count() - avg;
            variance += delta * delta;
        }
        variance /= static_cast<double>(N_REPEAT);
        const double stddev = sqrt(variance);
        printf("standard deviation: %.5fs\n", stddev);

    }
    Kokkos::finalize();
}
4 changes: 2 additions & 2 deletions benches/blas-speedup/gemm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ fn f1(
KernelArgs::Index1D(i) => {
// cols
for j in 0..length {
// b[j, k] because was init using a layout left
let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([j, k])).sum();
// all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft
let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
let val: f64 = alpha * ab_ij + beta * cc.get([i, j]);
cc.set([i, j], val);
}
Expand Down
Loading
Loading