Dev more bench (#8)
* gemm skeleton

* completed gemm

speedup > n_cores ????

* hardcoded gemm bench

it seems the library creates much more overhead for serial execution
than parallel

* grouped blas speedup benches in a folder

* fixed bench paths

* update doc & readme

* fixed warnings when testing using parallel features
imrn99 authored Nov 23, 2023
1 parent c9b2c32 commit d932467
Showing 8 changed files with 415 additions and 26 deletions.
19 changes: 19 additions & 0 deletions Cargo.toml
@@ -23,12 +23,16 @@ num_cpus = {version = "1.0", optional=true}
[dev-dependencies]
criterion = { version = "*", features = ["html_reports"] }
rand = { version = "*", features = ["small_rng", "alloc"] }
atomic = {version = "0.5.3"}
rayon = {version = "*"}

[build-dependencies]
cxx-build = "*"

# BENCHMARKS

## misc

[[bench]]
name = "layout"
harness = false
@@ -41,10 +45,25 @@ harness = false
name = "view_access"
harness = false

## blas speedup measures

[[bench]]
name = "axpy"
path = "benches/blas-speedup/axpy.rs"
harness = false

[[bench]]
name = "gemv"
path = "benches/blas-speedup/gemv.rs"
harness = false

[[bench]]
name = "gemm"
path = "benches/blas-speedup/gemm.rs"
harness = false

## library overhead measures

[[bench]]
name = "hardcoded_gemm"
harness = false
64 changes: 48 additions & 16 deletions README.md
@@ -23,27 +23,66 @@ This makes limit-testing a fundamental part of the project.

## Quickstart

The PoC itself is a library, but you can run benchmarks and examples out of the box.

### Benchmarks

Benchmarks can be run using the following command:

```bash
# all benchmarks
cargo bench
# a specific benchmark
cargo bench --bench bench_name
```

All results are compiled to the `target/criterion/` folder. The following
benchmarks are available:

- `layout`: Matrix-Vector product computation; this is used to put numbers on the
importance of data layout in memory.
- `view_init`: Compare initialization performance of regular vectors to [Views][view]; this
is used to spot potential scaling issues induced by the more complex structure of Views.
- `view_access`: Compare data access performance of regular vectors to [Views][view]; this
is used to spot potential scaling issues induced by the more complex structure of Views.
- `axpy` / `gemv` / `gemm`: Measure speedup on basic BLAS implementations by running the same kernel
in serial mode first, then using parallelization on CPU. _Meant to be executed using features_.
- `hardcoded_gemm`: Compute the same operations as the `gemm` benchmark, but using a hardcoded
implementation instead of methods from the PoC. Used to assess the additional cost induced by the
library; a rough sketch of such a hardcoded kernel is given below.
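
For reference, the sketch below illustrates what such a hardcoded kernel boils down to: a plain
triple loop over row-major `Vec<f64>` buffers, with no View indirection. This is an illustration
only, not the exact code of the `hardcoded_gemm` benchmark, and the function name is arbitrary.

```rust
// Illustrative sketch: C = alpha * A * B + beta * C over plain row-major
// `Vec<f64>` storage, without the View abstraction of the PoC.
fn gemm_hardcoded(length: usize, aa: &[f64], bb: &[f64], cc: &mut [f64], alpha: f64, beta: f64) {
    for i in 0..length {
        for j in 0..length {
            // dot product of row i of A with column j of B
            let ab_ij: f64 = (0..length)
                .map(|k| aa[i * length + k] * bb[k * length + j])
                .sum();
            cc[i * length + j] = alpha * ab_ij + beta * cc[i * length + j];
        }
    }
}
```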


### Examples

```bash
cargo run --example hello-world
```

The following examples are available:

- `hello_world`: ...
- `hello_world_omp`: ...


### Documentation

Concise documentation can be generated and accessed using the following command:

```bash
cargo doc --open --no-deps
```

## Features

Using `features`, the crate can be compiled to use different backends for the execution of parallel sections.
These can also be enabled in benchmarks; a minimal usage sketch is given after the feature list below.

```bash
cargo build --features <FEATURE>
```

Available features:

- `rayon`: Uses the [rayon][2] crate to handle parallelization on CPU.
- `threads`: Uses [`std::thread`] methods to handle parallelization on CPU.
- `gpu`: Currently used as a way to gate GPU usage, as this cannot be done in pure Rust.
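
The sketch below shows how a kernel is dispatched through `parallel_for`, mirroring the API used by
the `gemm` benchmark; the view name, values, and the scaling kernel itself are made up for
illustration. With `ExecutionSpace::DeviceCPU`, the parallel backend actually used on CPU is the one
selected at compile time through the enabled feature.

```rust
use poc_kokkos_rs::{
    functor::KernelArgs,
    routines::{
        parallel_for,
        parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule},
    },
    view::{parameters::Layout, ViewOwned},
};

fn main() {
    let length = 512;
    // 2D view built from a flat buffer, using a row-major (right) layout
    let mut vv = ViewOwned::new_from_data(
        vec![1.0; length * length],
        Layout::Right,
        [length, length],
    );

    let execp = ExecutionPolicy {
        // `DeviceCPU` parallelizes on CPU using the backend enabled at
        // compile time (`rayon` or `threads`); `Serial` runs a plain loop.
        space: ExecutionSpace::DeviceCPU,
        range: RangePolicy::RangePolicy(0..length),
        schedule: Schedule::Static,
    };

    // scale every element of the view, one row per kernel invocation
    let scale_kernel = |arg: KernelArgs<1>| match arg {
        KernelArgs::Index1D(i) => {
            for j in 0..length {
                let val = 2.0 * vv.get([i, j]);
                vv.set([i, j], val);
            }
        }
        KernelArgs::IndexND(_) => unimplemented!(),
        KernelArgs::Handle => unimplemented!(),
    };
    parallel_for(execp, scale_kernel).unwrap();
}
```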

## Compilation

The build script will read the `CXX` environment variable to choose which C++ compiler to use
for Rust/C++ interop. Note that the crate itself does not currently use C++ code, only examples
do.

## References

@@ -54,16 +93,9 @@ cargo doc --open --no-deps
- `move` keyword semantics & implementation: [link][MOVE]


### Functor Implementation

- A very specific answer to a very specific rust-lang issue: [link][FNIMPL]



[1]: https://kokkos.github.io/kokkos-core-wiki/index.html
[2]: https://docs.rs/rayon/latest/rayon/

[NDARRAY]: https://docs.rs/ndarray/latest/ndarray/
[CONSTG]: https://doc.rust-lang.org/reference/items/generics.html
[FNIMPL]: https://github.com/rust-lang/rust/issues/29625#issuecomment-1692602873
[MOVE]: https://stackoverflow.com/questions/30288782/what-are-move-semantics-in-rust
File renamed without changes.
168 changes: 168 additions & 0 deletions benches/blas-speedup/gemm.rs
@@ -0,0 +1,168 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use poc_kokkos_rs::{
functor::KernelArgs,
routines::{
parallel_for,
parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule},
},
view::{parameters::Layout, ViewOwned},
};
use rand::{
distributions::{Distribution, Uniform},
rngs::SmallRng,
SeedableRng,
};

// Serial GEMM
fn f1(
length: usize,
aa_init: Vec<f64>,
bb_init: Vec<f64>,
cc_init: Vec<f64>,
alpha: f64,
beta: f64,
) {
let mut aa = ViewOwned::new_from_data(aa_init, Layout::Right, [length, length]);
let mut bb = ViewOwned::new_from_data(bb_init, Layout::Left, [length, length]); // optimal layout since we iterate inside columns :)
let mut cc = ViewOwned::new_from_data(cc_init, Layout::Right, [length, length]);
black_box(&mut aa);
black_box(&mut bb);
black_box(&mut cc);

let execp = ExecutionPolicy {
space: ExecutionSpace::Serial,
range: RangePolicy::RangePolicy(0..length),
schedule: Schedule::Static,
};

// C = alpha * A * B + beta * C
let gemm_kernel = |arg: KernelArgs<1>| match arg {
        // rows
KernelArgs::Index1D(i) => {
// cols
for j in 0..length {
                // b[j, k] because it was initialized using a left layout
let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([j, k])).sum();
let val: f64 = alpha * ab_ij + beta * cc.get([i, j]);
cc.set([i, j], val);
}
}
KernelArgs::IndexND(_) => unimplemented!(),
KernelArgs::Handle => unimplemented!(),
};
parallel_for(execp, gemm_kernel).unwrap();
black_box(&cc);
}

// DeviceCPU GEMM
fn f2(
length: usize,
aa_init: Vec<f64>,
bb_init: Vec<f64>,
cc_init: Vec<f64>,
alpha: f64,
beta: f64,
) {
let mut aa = ViewOwned::new_from_data(aa_init, Layout::Right, [length, length]);
let mut bb = ViewOwned::new_from_data(bb_init, Layout::Left, [length, length]); // optimal layout since we iterate inside columns :)
let mut cc = ViewOwned::new_from_data(cc_init, Layout::Right, [length, length]);
black_box(&mut aa);
black_box(&mut bb);
black_box(&mut cc);

let execp = ExecutionPolicy {
space: ExecutionSpace::DeviceCPU,
range: RangePolicy::RangePolicy(0..length),
schedule: Schedule::Static,
};

// C = alpha * A * B + beta * C
let gemm_kernel = |arg: KernelArgs<1>| match arg {
        // rows
KernelArgs::Index1D(i) => {
// cols
for j in 0..length {
                // for a fixed j, the b[k, j] values are adjacent in memory thanks to the left layout
let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
let val: f64 = alpha * ab_ij + beta * cc.get([i, j]);
cc.set([i, j], val);
}
}
KernelArgs::IndexND(_) => unimplemented!(),
KernelArgs::Handle => unimplemented!(),
};
parallel_for(execp, gemm_kernel).unwrap();
black_box(&cc);
}

pub fn criterion_benchmark(c: &mut Criterion) {
// Generate/Define the input
const DATA_SIZE: u32 = 10;
let length = 2_usize.pow(DATA_SIZE);
let seed: u64 = 9817498146784;
let mut rng = SmallRng::seed_from_u64(seed);
let range: Uniform<f64> = rand::distributions::Uniform::new(0.0, 100.0);
let aa_init: Vec<f64> = (0..length * length)
.map(|_| range.sample(&mut rng))
.collect();
let bb_init: Vec<f64> = (0..length * length)
.map(|_| range.sample(&mut rng))
.collect();
let cc_init: Vec<f64> = (0..length * length)
.map(|_| range.sample(&mut rng))
.collect();
let alpha: f64 = range.sample(&mut rng);
let beta: f64 = range.sample(&mut rng);

let mut group = c.benchmark_group("gemm");
group.bench_with_input(
BenchmarkId::new("exec-serial", ""),
&(
length,
aa_init.clone(),
bb_init.clone(),
cc_init.clone(),
alpha,
beta,
),
|b, (length, aa_init, bb_init, cc_init, alpha, beta)| {
b.iter(|| {
f1(
*length,
aa_init.clone(),
bb_init.clone(),
cc_init.clone(),
*alpha,
*beta,
)
})
},
);
group.bench_with_input(
BenchmarkId::new("exec-devicecpu", ""),
&(
length,
aa_init.clone(),
bb_init.clone(),
cc_init.clone(),
alpha,
beta,
),
|b, (length, aa_init, bb_init, cc_init, alpha, beta)| {
b.iter(|| {
f2(
*length,
aa_init.clone(),
bb_init.clone(),
cc_init.clone(),
*alpha,
*beta,
)
})
},
);
group.finish()
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
File renamed without changes.