-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* fixed error in serial gemm; this should also bring the serial performance on par with its hardcoded counterpart * added a kokkos impl of the gemm bench * added gemv * moved cpp benches * added axpy cpp * added cmake setup for kokkos benches * ignore build folder; assume CMakeLists.txt, build.sh unchanged * added build specs for slightly better perf (5%?) * abort on panic produces 20% more perf; may or may not revert this: are there situations where we want to catch a panic? * added layout bench using views; grouped layout bench in a folder * added a scaling comparison between layouts * update doc & readme
- Loading branch information
Showing
13 changed files
with
750 additions
and
140 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# target folder | ||
build/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
cmake_minimum_required(VERSION 3.16)

# set(CMAKE_C_COMPILER "/opt/homebrew/opt/llvm/bin/clang") # uncomment if using homebrew clang
# set(CMAKE_CXX_COMPILER "/opt/homebrew/opt/llvm/bin/clang++") # uncomment if using homebrew clang

# The benchmarks are written against C++20; fail at configure time instead of
# silently falling back to an older standard.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_BUILD_TYPE "Release")

project(KokkosBenchmarks)

# Location of the Kokkos source tree. CMake does not expand shell-style
# `$NAME`, so the path is taken from -DKOKKOS_INSTALL_FOLDER=<path> or, failing
# that, from the KOKKOS_INSTALL_FOLDER environment variable.
set(KOKKOS_INSTALL_FOLDER "$ENV{KOKKOS_INSTALL_FOLDER}" CACHE PATH
    "Path to the Kokkos source tree")
if(NOT KOKKOS_INSTALL_FOLDER)
  message(FATAL_ERROR
    "KOKKOS_INSTALL_FOLDER is not set: pass -DKOKKOS_INSTALL_FOLDER=<path> "
    "or export it in the environment")
endif()

# An explicit binary dir (dep/kokkos) is required because the Kokkos sources
# may live outside of this source tree.
add_subdirectory("${KOKKOS_INSTALL_FOLDER}" dep/kokkos) # add kokkos files dep

# One executable per benchmark
add_executable(gemm gemm.cpp)
add_executable(gemv gemv.cpp)
add_executable(axpy axpy.cpp)

target_link_libraries(gemm Kokkos::kokkos)
target_link_libraries(gemv Kokkos::kokkos)
target_link_libraries(axpy Kokkos::kokkos)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
#include <execution> | ||
|
||
#include <Kokkos_Core.hpp> | ||
#include <cstdio> | ||
#include <cstdint> | ||
#include <cmath> | ||
#include <random> | ||
|
||
#define DATA_SIZE 20 | ||
#define N_REPEAT 100 | ||
|
||
|
||
int main( int argc, char* argv[] ) | ||
{ | ||
Kokkos::initialize(argc, argv); | ||
{ | ||
// Readability | ||
typedef Kokkos::View<double*, Kokkos::LayoutLeft> Vec; // technically layout doesn't matter here | ||
|
||
// declare data | ||
std::random_device rd; | ||
std::mt19937 gen(rd()); | ||
std::uniform_real_distribution<> dis(0.0, 1.0); | ||
const uint64_t length = pow(2, DATA_SIZE); | ||
|
||
Vec x("vecx", length); | ||
Vec y("vecy", length); | ||
double alpha; | ||
|
||
// fill with rand doubles | ||
alpha = dis(gen); | ||
// need to parallelize this if possible | ||
Kokkos::parallel_for("inits", length, KOKKOS_LAMBDA(const uint64_t ii) { | ||
std::random_device rd; | ||
std::mt19937 gen(rd()); | ||
std::uniform_real_distribution<> dis(0.0, 1.0); | ||
x(ii) = dis(gen); | ||
y(ii) = dis(gen); | ||
}); | ||
|
||
// run the kernel N_REPEAT times | ||
|
||
std::chrono::duration<double> times[N_REPEAT]; | ||
for (int idx = 0; idx < N_REPEAT; idx++) { | ||
|
||
const auto start{std::chrono::steady_clock::now()}; // start timer | ||
Kokkos::parallel_for("GEMM kernel", length, KOKKOS_LAMBDA(const uint64_t i) { | ||
// assign to y | ||
y(i) = alpha * x(i) + y(i); | ||
}); | ||
const auto end{std::chrono::steady_clock::now()}; // end timer | ||
|
||
times[idx] = {end - start}; // save duration | ||
std::cout << "iteration " << idx << ": " << times[idx] << '\n'; // print duration | ||
} | ||
|
||
// process times | ||
double avg = 0.0; | ||
for (auto t : times) { | ||
avg += t.count(); | ||
} | ||
avg /= (double) N_REPEAT; | ||
printf("average time: %fs\n", avg); | ||
|
||
double variance = 0.0; | ||
for (auto t : times) { | ||
variance += pow(t.count() - avg, 2.0); | ||
} | ||
variance /= (double) N_REPEAT; | ||
double stddev = sqrt(variance); | ||
printf("standard deviation: %.5fs\n", stddev); | ||
|
||
} | ||
Kokkos::finalize(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/bin/bash
# Configure and build the Kokkos benchmarks with the OpenMP backend enabled.
# Must be run from the folder containing CMakeLists.txt; output goes to build/.

# Abort on the first failing command so that a failed configure step does not
# trigger a build of a stale or missing build tree.
set -e

cmake -DKokkos_ENABLE_OPENMP=ON -B build/
cmake --build build --parallel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
// THIS CODE IS MADE FOR COMPILATION USING A PROPER KOKKOS SETUP. | ||
// REQUIRES C++20 | ||
// COMPILE USING OPENMP BACKEND TO HAVE SOMETHING COMPARABLE TO RAYON | ||
// | ||
// This file is here in order to provide a comparable implementation of the blas | ||
// benchmarks using the Kokkos library. It is not by any means the best way to | ||
// write such kernels. | ||
|
||
#include <cmath> | ||
#include <cstdint> | ||
#include <cstdio> | ||
#include <execution> | ||
#include <random> | ||
|
||
#include <Kokkos_Core.hpp> | ||
|
||
#define DATA_SIZE 10 | ||
#define N_REPEAT 100 | ||
|
||
int main( int argc, char* argv[] ) | ||
{ | ||
Kokkos::initialize(argc, argv); | ||
{ | ||
// Readability | ||
typedef Kokkos::View<double**, Kokkos::LayoutRight> MatRight; | ||
typedef Kokkos::View<double**, Kokkos::LayoutLeft> MatLeft; | ||
|
||
// declare data | ||
|
||
std::random_device rd; | ||
std::mt19937 gen(rd()); | ||
std::uniform_real_distribution<> dis(0.0, 1.0); | ||
const uint64_t length = pow(2, DATA_SIZE); | ||
|
||
// runtime dims | ||
MatRight A("matA", length, length); | ||
MatLeft B("matB", length, length); | ||
MatRight C("matC", length, length); | ||
double alpha; | ||
double beta; | ||
|
||
// fill with rand doubles | ||
alpha = dis(gen); | ||
beta = dis(gen); | ||
for (int ii = 0; ii < length; ii++) { | ||
for (int jj = 0; jj < length; jj++) { | ||
A(ii,jj) = dis(gen); | ||
B(ii,jj) = dis(gen); | ||
C(ii,jj) = dis(gen); | ||
}} | ||
|
||
// run the kernel N_REPEAT times | ||
|
||
std::chrono::duration<double> times[N_REPEAT]; | ||
for (int idx = 0; idx < N_REPEAT; idx++) { | ||
|
||
const auto start{std::chrono::steady_clock::now()}; // start timer | ||
Kokkos::parallel_for("GEMM kernel", length, KOKKOS_LAMBDA(const uint64_t i) { | ||
for (uint64_t j = 0; j < length; j++) { | ||
// this computation is the most costly part of the kernel | ||
// trying to turn this into a proper reduction significantly | ||
// obfuscate the code. | ||
// I think this is pretty interesting since it can be done with | ||
// decent performances using just one line in Rust. | ||
double AB_ij = 0.0; | ||
for (uint64_t k = 0; k < length; k++) { AB_ij += A(i,k) * B(k,j); } | ||
// assign to C | ||
C(i, j) = alpha * AB_ij + beta * C(i, j); | ||
} | ||
}); | ||
const auto end{std::chrono::steady_clock::now()}; // end timer | ||
|
||
times[idx] = {end - start}; // save duration | ||
std::cout << "iteration " << idx << ": " << times[idx] << '\n'; // print duration | ||
} | ||
|
||
// process times | ||
double avg = 0.0; | ||
for (auto t : times) { | ||
avg += t.count(); | ||
} | ||
avg /= (double) N_REPEAT; | ||
printf("average time: %fs\n", avg); | ||
|
||
double variance = 0.0; | ||
for (auto t : times) { | ||
variance += pow(t.count() - avg, 2.0); | ||
} | ||
variance /= (double) N_REPEAT; | ||
double stddev = sqrt(variance); | ||
printf("standard deviation: %.5fs\n", stddev); | ||
|
||
} | ||
Kokkos::finalize(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
// THIS CODE IS MADE FOR COMPILATION USING A PROPER KOKKOS SETUP. | ||
// REQUIRES C++20 | ||
// COMPILE USING OPENMP BACKEND TO HAVE SOMETHING COMPARABLE TO RAYON | ||
// | ||
// This file is here in order to provide a comparable implementation of the blas | ||
// benchmarks using the Kokkos library. It is not by any means the best way to | ||
// write such kernels. | ||
|
||
#include <cmath> | ||
#include <cstdint> | ||
#include <cstdio> | ||
#include <execution> | ||
#include <random> | ||
|
||
#include <Kokkos_Core.hpp> | ||
|
||
#define DATA_SIZE 15 | ||
#define N_REPEAT 100 | ||
|
||
|
||
int main( int argc, char* argv[] ) | ||
{ | ||
Kokkos::initialize(argc, argv); | ||
{ | ||
// Readability | ||
typedef Kokkos::View<double**, Kokkos::LayoutRight> MatRight; | ||
typedef Kokkos::View<double*, Kokkos::LayoutLeft> VecColumn; // technically layout doesn't matter here | ||
|
||
// declare data | ||
std::random_device rd; | ||
std::mt19937 gen(rd()); | ||
std::uniform_real_distribution<> dis(0.0, 1.0); | ||
const uint64_t length = pow(2, DATA_SIZE); | ||
|
||
MatRight A("matA", length, length); | ||
VecColumn x("vecx", length); | ||
VecColumn y("vecy", length); | ||
double alpha; | ||
double beta; | ||
|
||
// fill with rand doubles | ||
alpha = dis(gen); | ||
beta = dis(gen); | ||
// need to parallelize this if possible | ||
Kokkos::parallel_for("inits", length, KOKKOS_LAMBDA(const uint64_t ii) { | ||
std::random_device rd; | ||
std::mt19937 gen(rd()); | ||
std::uniform_real_distribution<> dis(0.0, 1.0); | ||
x(ii) = dis(gen); | ||
y(ii) = dis(gen); | ||
for (int jj = 0; jj < length; jj++) { | ||
A(ii,jj) = dis(gen); | ||
} | ||
}); | ||
|
||
// run the kernel N_REPEAT times | ||
|
||
std::chrono::duration<double> times[N_REPEAT]; | ||
for (int idx = 0; idx < N_REPEAT; idx++) { | ||
|
||
const auto start{std::chrono::steady_clock::now()}; // start timer | ||
Kokkos::parallel_for("GEMM kernel", length, KOKKOS_LAMBDA(const uint64_t i) { | ||
// compute (A*x)(i) | ||
double Ax_i = 0.0; | ||
for (uint64_t j = 0; j < length; j++) { Ax_i += A(i, j) * x(j); } | ||
// assign to y | ||
y(i) = alpha * Ax_i + beta * y(i); | ||
}); | ||
const auto end{std::chrono::steady_clock::now()}; // end timer | ||
|
||
times[idx] = {end - start}; // save duration | ||
std::cout << "iteration " << idx << ": " << times[idx] << '\n'; // print duration | ||
} | ||
|
||
// process times | ||
double avg = 0.0; | ||
for (auto t : times) { | ||
avg += t.count(); | ||
} | ||
avg /= (double) N_REPEAT; | ||
printf("average time: %fs\n", avg); | ||
|
||
double variance = 0.0; | ||
for (auto t : times) { | ||
variance += pow(t.count() - avg, 2.0); | ||
} | ||
variance /= (double) N_REPEAT; | ||
double stddev = sqrt(variance); | ||
printf("standard deviation: %.5fs\n", stddev); | ||
|
||
} | ||
Kokkos::finalize(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.