Bench cpp (#9)

* fixed error in serial gemm this should also bring the serial performances on par with its hardcoded counterpart * added a kokkos impl of the gemm bench * added gemv * moved cpp benches * added axpy cpp * added cmake setup for kokkos benches * ignore build folder assume CMakeLists.txts, build.sh unchanged * added build specs for slightly better perf (5%?) * abort on panic produces 20% more perf may or may not revert this: are there situations where we want to catch a panic? * added layout bench using views grouped layout bench in a folder * added a scaling comparison between layouts * update doc & readme
imrn99 · Nov 27, 2023 · 4cc7cb1 · 4cc7cb1
1 parent d932467
commit 4cc7cb1
Show file tree

Hide file tree

Showing 13 changed files with 750 additions and 140 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -3,6 +3,13 @@ name = "poc-kokkos-rs"
 version = "0.1.0"
 edition = "2021"
 
+# OPTIMIZATION
+
+[profile.release]
+codegen-units = 1
+lto = "fat"
+panic = "abort"
+
 # FEATURES 
 
 [features]
@@ -34,15 +41,23 @@ cxx-build = "*"
 ## misc 
 
 [[bench]]
-name = "layout"
+name = "view_init"
 harness = false
 
 [[bench]]
-name = "view_init"
+name = "view_access"
 harness = false
 
+## layout 
+
 [[bench]]
-name = "view_access"
+name = "layout-size"
+path = "benches/layout/size.rs"
+harness = false
+
+[[bench]]
+name = "layout-comparison"
+path = "benches/layout/comparison.rs"
 harness = false
 
 ## blas speedup measures

diff --git a/README.md b/README.md
@@ -39,16 +39,27 @@ cargo bench --bench bench_name
 All results are compiled to the `target/criterion/` folder. The following
 benchmarks are available:
 
-- `layout`: Matrix-Vector product computation; This is used to put numbers on the
-  importance of data layout in memory.
-- `view_init`: Compare initialization performances of regular vectors to [Views][view]; This
-  is used to spot potential scaling issues induced by the more complex structure of Views.
-- `view_access`: Compare data access performances of regular vectors to [Views][view]; This
-  is used to spot potential scaling issues induced by the more complex structure of Views.
+**Layout:**
+- `layout-comparison`: Bench a Matrix-Matrix product three times, using the worst possible layout,
+  the usual layout, and then the optimal layout for the operation. This shows the importance of layout
+  selection for performances.
+- `layout-size`: Bench a Matrix-Matrix product using the usual layout and the optimal layout,
+  over a range of sizes for the square matrices. This shows the influence of cache size over
+  layout importance.
+**Computation:**
 - `axpy` / `gemv` / `gemm`: Measure speedup on basic BLAS implementations by running the same kernel
   in serial mode first, then using parallelization on CPU. _Meant to be executed using features_.
 - `hardcoded_gemm`: Compute the same operations as the `gemm` benchmark, but using a hardcoded implementation
   instead of methods from the PoC. Used to assess the additional cost induced by the library.
+**Library overhead:**
+- `view_init`: Compare initialization performances of regular vectors to [Views][view]; This
+  is used to spot potential scaling issues induced by the more complex structure of Views.
+- `view_access`: Compare data access performances of regular vectors to [Views][view]; This
+  is used to spot potential scaling issues induced by the more complex structure of Views.
+
+Additionally, Kokkos equivalents of the BLAS kernels can be found in the `blas-speedup-kokkos/`
+subdirectory. These are far from being the most optimized implementations; they are instead written
+as close-ish counterparts to the Rust benchmarks.
 
 
 ### Examples

diff --git a/benches/blas-speedup-kokkos/.gitignore b/benches/blas-speedup-kokkos/.gitignore
@@ -0,0 +1,2 @@
+# target folder
+build/
diff --git a/benches/blas-speedup-kokkos/CMakeLists.txt b/benches/blas-speedup-kokkos/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Build script for the Kokkos counterparts of the BLAS speedup benchmarks.
+# Produces three executables (gemm, gemv, axpy), each linked against Kokkos.
+cmake_minimum_required(VERSION 3.16)
+
+# set(CMAKE_C_COMPILER "/opt/homebrew/opt/llvm/bin/clang")     # uncomment if using homebrew clang
+# set(CMAKE_CXX_COMPILER "/opt/homebrew/opt/llvm/bin/clang++") # uncomment if using homebrew clang
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON) # fail at configure time instead of silently decaying to an older standard
+
+# Default to an optimized build, but let the caller override it with -DCMAKE_BUILD_TYPE=...
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+project(KokkosBenchmarks)
+
+# Location of the Kokkos source tree. A bare `$NAME` is not a CMake variable
+# reference, so resolve the path explicitly: -DKOKKOS_INSTALL_FOLDER=<path>
+# takes precedence, then the KOKKOS_INSTALL_FOLDER environment variable.
+if(NOT DEFINED KOKKOS_INSTALL_FOLDER)
+  set(KOKKOS_INSTALL_FOLDER "$ENV{KOKKOS_INSTALL_FOLDER}")
+endif()
+if(NOT KOKKOS_INSTALL_FOLDER)
+  message(FATAL_ERROR
+    "KOKKOS_INSTALL_FOLDER is not set: pass -DKOKKOS_INSTALL_FOLDER=<path to kokkos> "
+    "or export it in the environment.")
+endif()
+add_subdirectory("${KOKKOS_INSTALL_FOLDER}" dep/kokkos) # add kokkos files dep
+
+add_executable(gemm gemm.cpp)
+add_executable(gemv gemv.cpp)
+add_executable(axpy axpy.cpp)
+
+target_link_libraries(gemm Kokkos::kokkos)
+target_link_libraries(gemv Kokkos::kokkos)
+target_link_libraries(axpy Kokkos::kokkos)
diff --git a/benches/blas-speedup-kokkos/axpy.cpp b/benches/blas-speedup-kokkos/axpy.cpp
@@ -0,0 +1,75 @@
+#include <execution>
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <cstdint>
+#include <cmath>
+#include <random>
+
+#define DATA_SIZE 20
+#define N_REPEAT 100
+
+
+// AXPY benchmark: times N_REPEAT executions of y = alpha * x + y on two vectors
+// of 2^DATA_SIZE doubles, printing each duration followed by the mean and the
+// standard deviation of the measured times (in seconds).
+int main( int argc, char* argv[] )
+{
+    Kokkos::initialize(argc, argv);
+    {
+        // Readability
+        using Vec = Kokkos::View<double*, Kokkos::LayoutLeft>; // technically layout doesn't matter here
+
+        // declare data
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<> dis(0.0, 1.0);
+        // exact integer power of two; no round-trip through floating-point pow()
+        const uint64_t length = uint64_t{1} << DATA_SIZE;
+
+        Vec x("vecx", length);
+        Vec y("vecy", length);
+
+        // fill with rand doubles
+        const double alpha = dis(gen);
+        // every index builds its own generator, so iterations share no mutable state
+        Kokkos::parallel_for("inits", length, KOKKOS_LAMBDA(const uint64_t ii) {
+            std::random_device rd;
+            std::mt19937 gen(rd());
+            std::uniform_real_distribution<> dis(0.0, 1.0);
+            x(ii) = dis(gen);
+            y(ii) = dis(gen);
+        });
+        // make sure initialization is complete before the first timed iteration starts
+        Kokkos::fence();
+
+        // run the kernel N_REPEAT times
+
+        std::chrono::duration<double> times[N_REPEAT];
+        for (int idx = 0; idx < N_REPEAT; idx++) {
+
+            const auto start{std::chrono::steady_clock::now()};   // start timer
+            Kokkos::parallel_for("AXPY kernel", length, KOKKOS_LAMBDA(const uint64_t i) {
+                // assign to y
+                y(i) = alpha * x(i) + y(i);
+            });
+            // parallel_for is allowed to return before the kernel has finished;
+            // wait for completion so the whole execution is measured
+            Kokkos::fence();
+            const auto end{std::chrono::steady_clock::now()};     // end timer
+
+            times[idx] = end - start; // save duration
+            std::cout << "iteration " << idx << ": " << times[idx] << '\n'; // print duration
+        }
+
+        // process times
+        double avg = 0.0;
+        for (const auto& t : times) {
+            avg += t.count();
+        }
+        avg /= static_cast<double>(N_REPEAT);
+        printf("average time: %fs\n", avg);
+
+        // population variance of the samples (divides by N_REPEAT)
+        double variance = 0.0;
+        for (const auto& t : times) {
+            const double delta = t.count() - avg;
+            variance += delta * delta;
+        }
+        variance /= static_cast<double>(N_REPEAT);
+        const double stddev = sqrt(variance);
+        printf("standard deviation: %.5fs\n", stddev);
+
+    }
+    Kokkos::finalize();
+}
diff --git a/benches/blas-speedup-kokkos/build.sh b/benches/blas-speedup-kokkos/build.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+# Configure and build the Kokkos benchmarks with the OpenMP backend enabled.
+# Abort on the first failing command so a broken configure step is not
+# followed by a build of a stale or missing build tree.
+set -e
+
+cmake -DKokkos_ENABLE_OPENMP=ON -B build/
+cmake --build build --parallel
diff --git a/benches/blas-speedup-kokkos/gemm.cpp b/benches/blas-speedup-kokkos/gemm.cpp
@@ -0,0 +1,95 @@
+// THIS CODE IS MADE FOR COMPILATION USING A PROPER KOKKOS SETUP.
+// REQUIRES C++20
+// COMPILE USING OPENMP BACKEND TO HAVE SOMETHING COMPARABLE TO RAYON
+//
+// This file is here in order to provide a comparable implementation of the blas
+// benchmarks using the Kokkos library. It is not by any means the best way to 
+// write such kernels.
+
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <execution>
+#include <random>
+
+#include <Kokkos_Core.hpp>
+
+#define DATA_SIZE 10
+#define N_REPEAT 100
+
+// GEMM benchmark: times N_REPEAT executions of C = alpha * A * B + beta * C on
+// square matrices of dimension 2^DATA_SIZE, printing each duration followed by
+// the mean and the standard deviation of the measured times (in seconds).
+int main( int argc, char* argv[] )
+{
+    Kokkos::initialize(argc, argv);
+    {
+        // Readability
+        using MatRight = Kokkos::View<double**, Kokkos::LayoutRight>;
+        using MatLeft  = Kokkos::View<double**, Kokkos::LayoutLeft>;
+
+        // declare data
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<> dis(0.0, 1.0);
+        // exact integer power of two; no round-trip through floating-point pow()
+        const uint64_t length = uint64_t{1} << DATA_SIZE;
+
+        // runtime dims
+        // A is LayoutRight and B is LayoutLeft so that both A(i,k) and B(k,j)
+        // are walked contiguously along k in the inner product below
+        MatRight A("matA", length, length);
+        MatLeft B("matB", length, length);
+        MatRight C("matC", length, length);
+
+        // fill with rand doubles
+        const double alpha = dis(gen);
+        const double beta = dis(gen);
+        // indices use the same unsigned type as the bound (no signed/unsigned comparison)
+        for (uint64_t ii = 0; ii < length; ii++) {
+        for (uint64_t jj = 0; jj < length; jj++) {
+            A(ii,jj) = dis(gen);
+            B(ii,jj) = dis(gen);
+            C(ii,jj) = dis(gen);
+        }}
+
+        // run the kernel N_REPEAT times
+
+        std::chrono::duration<double> times[N_REPEAT];
+        for (int idx = 0; idx < N_REPEAT; idx++) {
+
+            const auto start{std::chrono::steady_clock::now()};   // start timer
+            Kokkos::parallel_for("GEMM kernel", length, KOKKOS_LAMBDA(const uint64_t i) {
+                for (uint64_t j = 0; j < length; j++) {
+                    // this computation is the most costly part of the kernel
+                    // trying to turn this into a proper reduction significantly
+                    // obfuscate the code.
+                    // I think this is pretty interesting since it can be done with
+                    // decent performances using just one line in Rust.
+                    double AB_ij = 0.0;
+                    for (uint64_t k = 0; k < length; k++) { AB_ij += A(i,k) * B(k,j); }
+                    // assign to C
+                    C(i, j) = alpha * AB_ij + beta * C(i, j);
+                }
+            });
+            // parallel_for is allowed to return before the kernel has finished;
+            // wait for completion so the whole execution is measured
+            Kokkos::fence();
+            const auto end{std::chrono::steady_clock::now()};     // end timer
+
+            times[idx] = end - start; // save duration
+            std::cout << "iteration " << idx << ": " << times[idx] << '\n'; // print duration
+        }
+
+        // process times
+        double avg = 0.0;
+        for (const auto& t : times) {
+            avg += t.count();
+        }
+        avg /= static_cast<double>(N_REPEAT);
+        printf("average time: %fs\n", avg);
+
+        // population variance of the samples (divides by N_REPEAT)
+        double variance = 0.0;
+        for (const auto& t : times) {
+            const double delta = t.count() - avg;
+            variance += delta * delta;
+        }
+        variance /= static_cast<double>(N_REPEAT);
+        const double stddev = sqrt(variance);
+        printf("standard deviation: %.5fs\n", stddev);
+
+    }
+    Kokkos::finalize();
+}
diff --git a/benches/blas-speedup-kokkos/gemv.cpp b/benches/blas-speedup-kokkos/gemv.cpp
@@ -0,0 +1,93 @@
+// THIS CODE IS MADE FOR COMPILATION USING A PROPER KOKKOS SETUP.
+// REQUIRES C++20
+// COMPILE USING OPENMP BACKEND TO HAVE SOMETHING COMPARABLE TO RAYON
+//
+// This file is here in order to provide a comparable implementation of the blas
+// benchmarks using the Kokkos library. It is not by any means the best way to 
+// write such kernels.
+
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <execution>
+#include <random>
+
+#include <Kokkos_Core.hpp>
+
+#define DATA_SIZE 15
+#define N_REPEAT 100
+
+
+// GEMV benchmark: times N_REPEAT executions of y = alpha * A * x + beta * y with
+// a square matrix of dimension 2^DATA_SIZE, printing each duration followed by
+// the mean and the standard deviation of the measured times (in seconds).
+int main( int argc, char* argv[] )
+{
+    Kokkos::initialize(argc, argv);
+    {
+        // Readability
+        using MatRight  = Kokkos::View<double**, Kokkos::LayoutRight>;
+        using VecColumn = Kokkos::View<double*, Kokkos::LayoutLeft>; // technically layout doesn't matter here
+
+        // declare data
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<> dis(0.0, 1.0);
+        // exact integer power of two; no round-trip through floating-point pow()
+        const uint64_t length = uint64_t{1} << DATA_SIZE;
+
+        MatRight A("matA", length, length);
+        VecColumn x("vecx", length);
+        VecColumn y("vecy", length);
+
+        // fill with rand doubles
+        const double alpha = dis(gen);
+        const double beta = dis(gen);
+        // every row index builds its own generator, so iterations share no mutable state
+        Kokkos::parallel_for("inits", length, KOKKOS_LAMBDA(const uint64_t ii) {
+            std::random_device rd;
+            std::mt19937 gen(rd());
+            std::uniform_real_distribution<> dis(0.0, 1.0);
+            x(ii) = dis(gen);
+            y(ii) = dis(gen);
+            // index uses the same unsigned type as the bound (no signed/unsigned comparison)
+            for (uint64_t jj = 0; jj < length; jj++) {
+                A(ii,jj) = dis(gen);
+            }
+        });
+        // make sure initialization is complete before the first timed iteration starts
+        Kokkos::fence();
+
+        // run the kernel N_REPEAT times
+
+        std::chrono::duration<double> times[N_REPEAT];
+        for (int idx = 0; idx < N_REPEAT; idx++) {
+
+            const auto start{std::chrono::steady_clock::now()};   // start timer
+            Kokkos::parallel_for("GEMV kernel", length, KOKKOS_LAMBDA(const uint64_t i) {
+                // compute (A*x)(i)
+                double Ax_i = 0.0;
+                for (uint64_t j = 0; j < length; j++) { Ax_i += A(i, j) * x(j); }
+                // assign to y
+                y(i) = alpha * Ax_i + beta * y(i);
+            });
+            // parallel_for is allowed to return before the kernel has finished;
+            // wait for completion so the whole execution is measured
+            Kokkos::fence();
+            const auto end{std::chrono::steady_clock::now()};     // end timer
+
+            times[idx] = end - start; // save duration
+            std::cout << "iteration " << idx << ": " << times[idx] << '\n'; // print duration
+        }
+
+        // process times
+        double avg = 0.0;
+        for (const auto& t : times) {
+            avg += t.count();
+        }
+        avg /= static_cast<double>(N_REPEAT);
+        printf("average time: %fs\n", avg);
+
+        // population variance of the samples (divides by N_REPEAT)
+        double variance = 0.0;
+        for (const auto& t : times) {
+            const double delta = t.count() - avg;
+            variance += delta * delta;
+        }
+        variance /= static_cast<double>(N_REPEAT);
+        const double stddev = sqrt(variance);
+        printf("standard deviation: %.5fs\n", stddev);
+
+    }
+    Kokkos::finalize();
+}
diff --git a/benches/blas-speedup/gemm.rs b/benches/blas-speedup/gemm.rs
@@ -41,8 +41,8 @@ fn f1(
         KernelArgs::Index1D(i) => {
             // cols
             for j in 0..length {
-                // b[j, k] because was init using a layout left
-                let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([j, k])).sum();
+                // all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft
+                let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
                 let val: f64 = alpha * ab_ij + beta * cc.get([i, j]);
                 cc.set([i, j], val);
             }