View Implementation (#2)

* fixed hyperlink in doc * removed useless code from example * added base structure and traits for view * added view types * added view documentation * corrected types to use slices of generics scalar implementation using the base structure does not seem trivial * completed Dim/Stride traits with basic methods to use in View constructor * added macros for basic dim/stride implem * replaced dim/stride traits with SmallVec type generic traits made stride computing too much of a hassle * moved back data traits to view module * added stride computation for all layouts need to add unit test * right layout test * added left stride computation test * removed small vec to use const generics instead * renamed generics for consistency * removed data traits in favor of an enum * added basic dataless constructor * added mirror functions * docs * added Index<> implems for view backend * corrected(?) mirror creation functions * added assertions & comments * added limit case test for 1D views' stride * added basic code for benchmarking gemv operation * rewrote gemv benches to only use base types criterion produces a nice violin plot with performances depending on the layout * added initialization bench * added 2D view init to the bench no variation between cases appears when dim changes * added data access benchmark no performance loss caused by the convoluted Index<> implementation for views * rewrote access bench to use random accesses weird behavior on 3D unchecked accesses; cf. comments in code * access benchmark ok issue lies with the blackbox? * doc update
imrn99 · Oct 12, 2023 · 4bc33b2 · 4bc33b2
1 parent e2992db
commit 4bc33b2
Show file tree

Hide file tree

Showing 19 changed files with 1,387 additions and 87 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -3,10 +3,28 @@ name = "poc-kokkos-rs"
 version = "0.1.0"
 edition = "2021"
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+# DEPENDENCIES
 
 [dependencies]
 cxx = "*"
 
+[dev-dependencies]
+criterion = { version = "*", features = ["html_reports"] }
+rand = { version = "*", features = ["small_rng", "alloc"] }
+
 [build-dependencies]
 cxx-build = "*"
+
+# BENCHMARKS
+
+[[bench]]
+name = "gemv"
+harness = false
+
+[[bench]]
+name = "view_init"
+harness = false
+
+[[bench]]
+name = "view_access"
+harness = false
diff --git a/benches/gemv.rs b/benches/gemv.rs
@@ -0,0 +1,113 @@
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+
+// Currently a partial gemv
+// y = Ax / y = xA
+// instead of
+// y = s1*Au + s2*v
+
+// regular matrix-vector product (y = Ax); A is a flat vector of length * length values read row by row
+fn f1(size: u32) {
+    let length = 2_usize.pow(size); // vector length is 2^size
+    let x = vec![1.0; length];
+    #[allow(non_snake_case)] // A keeps the usual mathematical name of the matrix
+    let A = vec![1.0; length * length]; // allocated inside the function, hence part of the measured time
+    // in this case, we can use Rust's iterators directly to easily operate
+    // line by line: each chunk of `length` values is one row of A.
+    let y: Vec<f64> = A
+        .chunks(length)
+        .map(|row| row.iter().zip(x.iter()).map(|(r_i, x_i)| r_i * x_i).sum()) // dot product of one row with x
+        .collect();
+    black_box(y); // hand the result to criterion's black_box so it is not treated as unused
+}
+
+// regular matrix-vector product (y = Ax); same computation as the iterator version, using explicit indexes
+fn f1_b(size: u32) {
+    let length = 2_usize.pow(size); // vector length is 2^size
+    let x = vec![1.0; length];
+    #[allow(non_snake_case)] // A keeps the usual mathematical name of the matrix
+    let A = vec![1.0; length * length]; // flat storage; element (row, col) lives at row * length + col
+    // As a reference, an implementation using indexes
+    let mut y: Vec<f64> = vec![0.0; length];
+    // row and col indexes of the matrix; the inner loop walks one row of A contiguously
+    for row in 0..length {
+        for col in 0..length {
+            // using unchecked accesses to keep the comparison "fair", as iterators bypass bounds checks
+            // SAFETY: row < length and col < length, so every offset is within y, x and A
+            unsafe {
+                *y.get_unchecked_mut(row) +=
+                    A.get_unchecked(row * length + col) * x.get_unchecked(col)
+            }
+        }
+    }
+    black_box(y); // hand the result to criterion's black_box so it is not treated as unused
+}
+
+// regular vector-matrix product (y = xA) on a matrix stored row by row
+fn f2(size: u32) {
+    let length = 2_usize.pow(size); // vector length is 2^size
+    let x = vec![1.0; length];
+    #[allow(non_snake_case)] // A keeps the usual mathematical name of the matrix
+    let A = vec![1.0; length * length]; // flat storage; element (row, col) lives at row * length + col
+    // in the case of a vector-matrix product, the "row-first" layout (i.e. 2D LayoutRight)
+    // does not allow us to make use of Rust's iterators -> back to indexes
+    let mut y: Vec<f64> = vec![0.0; length];
+    // col and row indexes of the matrix; the inner loop moves through A with a stride of `length`
+    for col in 0..length {
+        for row in 0..length {
+            // using unchecked accesses to keep the comparison "fair", as iterators bypass bounds checks
+            // SAFETY: row < length and col < length, so every offset is within y, x and A
+            unsafe {
+                *y.get_unchecked_mut(col) +=
+                    x.get_unchecked(row) * A.get_unchecked(row * length + col)
+            }
+        }
+    }
+    black_box(y); // hand the result to criterion's black_box so it is not treated as unused
+}
+
+// vector-matrix product (y = xA) with an adapted layout: A is read column by column
+fn f3(size: u32) {
+    let length = 2_usize.pow(size); // vector length is 2^size
+    let x = vec![1.0; length];
+    #[allow(non_snake_case)] // A keeps the usual mathematical name of the matrix
+    let A = vec![1.0; length * length]; // every value is 1.0, so only the interpretation of the storage changes
+    // Thanks to the "column-first" layout (i.e. 2D LayoutLeft), we can use
+    // the iterators again: each chunk of `length` values is one column of A.
+    // The code is essentially the same as the matrix-vector product
+    let y: Vec<f64> = A
+        .chunks(length)
+        .map(|col| x.iter().zip(col.iter()).map(|(x_i, c_i)| x_i * c_i).sum()) // dot product of x with one column
+        .collect();
+    black_box(y); // hand the result to criterion's black_box so it is not treated as unused
+}
+
+pub fn criterion_benchmark(c: &mut Criterion) {
+    // Generate/Define the input; the value is an exponent, each benched function computes 2^data_size
+    let data_size: u32 = 11; // 2048 length vector, 2048*2048 matrix
+
+    let mut group = c.benchmark_group("gemv"); // all four variants are reported in the same group
+    group.bench_with_input(
+        BenchmarkId::new("Matrix-Vector Product (iterators)", ""),
+        &data_size,
+        |b, &n| b.iter(|| f1(n)), // the whole function, allocations included, is measured
+    );
+    group.bench_with_input(
+        BenchmarkId::new("Matrix-Vector Product (indexes)", ""),
+        &data_size,
+        |b, &n| b.iter(|| f1_b(n)),
+    );
+    group.bench_with_input(
+        BenchmarkId::new("Vector-Matrix Product (indexes)", ""),
+        &data_size,
+        |b, &n| b.iter(|| f2(n)),
+    );
+    group.bench_with_input(
+        BenchmarkId::new("Vector-Matrix Product w/ adapted layout (iterators)", ""),
+        &data_size,
+        |b, &n| b.iter(|| f3(n)),
+    );
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/benches/view_access.rs b/benches/view_access.rs
@@ -0,0 +1,150 @@
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use poc_kokkos_rs::view::{parameters::Layout, ViewOwned};
+use rand::prelude::*;
+
+// this bench is used to evaluate the cost of accessing views' data
+// all benched functions perform `length` accesses (2^11 = 2048 with the current input size).
+
+// 1D vector access: reads `length` elements of a plain Vec<f64> at the given indices
+fn f1(length: usize, indices: &[usize]) {
+    let y: Vec<f64> = vec![0.0; length]; // allocated on every call, hence part of the measured time
+    let idx = &indices[0..length]; // panics if fewer than `length` indices are provided
+
+    idx.iter().for_each(|i| {
+        let tmp = y[*i]; // regular (bounds-checked) indexing, unlike the 2D and 3D vector versions
+        black_box(tmp); // hand the value to criterion's black_box so the read is not treated as unused
+    })
+}
+
+// 1D view access: same reads as the 1D vector version, going through the view's Index implementation
+fn f1_b(length: usize, indices: &[usize]) {
+    let v_y: ViewOwned<'_, 1, f64> =
+        ViewOwned::new_from_data(vec![0.0; length], Layout::Right, [length]); // built on every call, hence part of the measured time
+    let idx = &indices[0..length]; // panics if fewer than `length` indices are provided
+
+    idx.iter().for_each(|i| {
+        let tmp = v_y[[*i]]; // views are indexed with an array holding one index per dimension
+        black_box(tmp); // hand the value to criterion's black_box so the read is not treated as unused
+    })
+}
+
+// 2D vector access: reads `length` elements of a flat Vec<f64> of length * length values, without bounds checks
+fn f2(length: usize, indices: &[(usize, usize)]) {
+    let y: Vec<f64> = vec![0.0; length * length]; // allocated on every call, hence part of the measured time
+    let idx = &indices[0..length]; // panics if fewer than `length` index pairs are provided
+
+    idx.iter().for_each(|(i, j)| {
+        let tmp = unsafe { y.get_unchecked(i * length + j) }; // SAFETY: requires i < length and j < length. NOTE(review): tmp is a &f64 here (no deref, unlike the 3D version), so the load itself may not be measured - confirm
+        black_box(tmp);
+    });
+}
+
+// 2D view access: same reads as the 2D vector version, going through the view's Index implementation
+fn f2_b(length: usize, indices: &[(usize, usize)]) {
+    let v_y: ViewOwned<'_, 2, f64> =
+        ViewOwned::new_from_data(vec![0.0; length * length], Layout::Right, [length, length]); // built on every call, hence part of the measured time
+    let idx = &indices[0..length]; // panics if fewer than `length` index pairs are provided
+
+    idx.iter().for_each(|(i, j)| {
+        let tmp = v_y[[*i, *j]]; // views are indexed with an array holding one index per dimension
+        black_box(tmp); // hand the value to criterion's black_box so the read is not treated as unused
+    })
+}
+
+// 3D vector access: reads `length` elements of a flat Vec<f64> of length^3 values, without bounds checks
+fn f3(length: usize, indices: &[(usize, usize, usize)]) {
+    let y: Vec<f64> = vec![0.0; length * length * length]; // NOTE(review): with length = 2048 this is 2^33 f64 values (64 GiB), requested on every call
+    let idx = &indices[0..length]; // panics if fewer than `length` index triplets are provided
+
+    idx.iter().for_each(|(i, j, k)| {
+        // WARNING
+        // For some reason, if the access is not dereferenced, it gets optimized away.
+        // You can verify it by running the benchmark twice:
+        // - once with the blackbox, without the deref operator *
+        // - once without the blackbox, with the deref operator *
+        // Both runs yield the same result;
+        // the blackbox is supposed to prevent this; it works in the 2D case, but not here.
+        let tmp = *unsafe { y.get_unchecked(i * length * length + j * length + k) }; // SAFETY: requires i, j and k to all be below length
+        black_box(tmp);
+    })
+}
+
+// 3D view access: same reads as the 3D vector version, going through the view's Index implementation
+fn f3_b(length: usize, indices: &[(usize, usize, usize)]) {
+    let v_y: ViewOwned<'_, 3, f64> = ViewOwned::new_from_data(
+        vec![0.0; length * length * length], // NOTE(review): with length = 2048 this is 2^33 f64 values (64 GiB), requested on every call
+        Layout::Right,
+        [length, length, length],
+    );
+    let idx = &indices[0..length]; // panics if fewer than `length` index triplets are provided
+
+    idx.iter().for_each(|(i, j, k)| {
+        let tmp = v_y[[*i, *j, *k]]; // views are indexed with an array holding one index per dimension
+        black_box(tmp); // hand the value to criterion's black_box so the read is not treated as unused
+    })
+}
+
+pub fn criterion_benchmark(c: &mut Criterion) {
+    // Generate/Define the input
+    const DATA_SIZE: u32 = 11; // exponent: 2048 length vector, 2048*2048 matrix, 2048^3 3D array
+    let length = 2_usize.pow(DATA_SIZE);
+    let mut rng = SmallRng::from_entropy(); // seeded from entropy: the index sets differ from one run to the next
+    let indices1: Vec<usize> = rand::seq::index::sample(&mut rng, length, length).into_vec(); // `length` distinct indices drawn from 0..length
+    let indices1b: Vec<usize> = rand::seq::index::sample(&mut rng, length, length).into_vec();
+    let indices1bb: Vec<usize> = rand::seq::index::sample(&mut rng, length, length).into_vec();
+
+    let indices2: Vec<(usize, usize)> = indices1 // 2D index pairs built from the first two index sets
+        .iter()
+        .zip(indices1b.iter())
+        .map(|(i1, i2)| (*i1, *i2))
+        .collect();
+
+    let indices3: Vec<(usize, usize, usize)> = indices1 // 3D index triplets built from the three index sets
+        .iter()
+        .zip(indices1b.iter())
+        .zip(indices1bb.iter())
+        .map(|((i1, i2), i3)| (*i1, *i2, *i3))
+        .collect();
+
+    let mut group1 = c.benchmark_group("1D access"); // one group per dimension: plain vector vs view
+    group1.bench_with_input(
+        BenchmarkId::new("Vector Access", ""),
+        &(length, indices1.clone()), // cloned because indices1 is moved into the next input
+        |b, (n, i)| b.iter(|| f1(*n, i)),
+    );
+    group1.bench_with_input(
+        BenchmarkId::new("View Access", ""),
+        &(length, indices1),
+        |b, (n, i)| b.iter(|| f1_b(*n, i)),
+    );
+    group1.finish();
+
+    let mut group2 = c.benchmark_group("2D access");
+    group2.bench_with_input(
+        BenchmarkId::new("Vector Access", ""),
+        &(length, (indices2.clone())), // cloned because indices2 is moved into the next input
+        |b, (n, i)| b.iter(|| f2(*n, i)),
+    );
+    group2.bench_with_input(
+        BenchmarkId::new("View Access", ""),
+        &(length, (indices2)),
+        |b, (n, i)| b.iter(|| f2_b(*n, i)),
+    );
+    group2.finish();
+
+    let mut group3 = c.benchmark_group("3D access");
+    group3.bench_with_input(
+        BenchmarkId::new("Vector Access", ""),
+        &(length, indices3.clone()), // cloned because indices3 is moved into the next input
+        |b, (n, i)| b.iter(|| f3(*n, i)),
+    );
+    group3.bench_with_input(
+        BenchmarkId::new("View Access", ""),
+        &(length, indices3),
+        |b, (n, i)| b.iter(|| f3_b(*n, i)),
+    );
+    group3.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);