Skip to content

Commit

Permalink
View Implementation (#2)
Browse files Browse the repository at this point in the history
* fixed hyperlink in doc

* removed useless code from example

* added base structure and traits for view

* added view types

* added view documentation

* corrected types to use slices of generics

scalar implementation using the base structure does not seem trivial

* completed Dim/Stride traits with basic methods to use in View
constructor

* added macros for basic dim/stride implem

* replaced dim/stride traits with SmallVec type

generic traits made stride computing too much of a hassle

* moved back data traits to view module

* added stride computation for all layout

need to add unit test

* right layout test

* added left stride calculation test

* removed small vec to use instead const generics

* renamed generics for consistency

* removed data traits in favor of an enum

* added basic dataless constructor

* added mirror functions

* docs

* added Index<> implems for view backend

* corrected(?) mirror creation functions

* added assertions & comments

* added limit case test for 1D views' stride

* added basic code for benchmarking gemv operation

* rewrote gemv benches to only use base types

criterion produces a nice violin plot with performances for
on the layout

* added initialization bench

* added 2D view init to the bench

no variation between cases appears when dim changes

* added data access benchmark

no performance loss caused by the convoluted Index<> implementation for
views

* rewrote access bench to use random accesses

weird behavior on 3D unchecked accesses; cf. comments in code

* access benchmark ok

issue lies with the blackbox?

* doc update
  • Loading branch information
imrn99 authored Oct 12, 2023
1 parent e2992db commit 4bc33b2
Show file tree
Hide file tree
Showing 19 changed files with 1,387 additions and 87 deletions.
624 changes: 624 additions & 0 deletions Cargo.lock

Large diffs are not rendered by default.

20 changes: 19 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,28 @@ name = "poc-kokkos-rs"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
# DEPENDENCIES

[dependencies]
cxx = "*"

[dev-dependencies]
criterion = { version = "*", features = ["html_reports"] }
rand = { version = "*", features = ["small_rng", "alloc"] }

[build-dependencies]
cxx-build = "*"

# BENCHMARKS

[[bench]]
name = "gemv"
harness = false

[[bench]]
name = "view_init"
harness = false

[[bench]]
name = "view_access"
harness = false
113 changes: 113 additions & 0 deletions benches/gemv.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

// Currently a partial gemv
// y = Ax / y = xA
// instead of
// y = s1*Au + s2*v

// Matrix-vector product `y = A x`, written with iterators.
//
// Builds a vector of `2^size` ones and a flat matrix of `2^size * 2^size`
// ones, then reduces every contiguous chunk of `2^size` elements (one row of
// the matrix) against the vector.
fn f1(size: u32) {
    let n = 2_usize.pow(size);
    let vector: Vec<f64> = vec![1.0; n];
    let matrix: Vec<f64> = vec![1.0; n * n];
    // dot product of a single matrix row with the vector
    let dot = |row: &[f64]| -> f64 {
        row.iter()
            .zip(vector.iter())
            .map(|(a_ij, x_j)| a_ij * x_j)
            .sum()
    };
    // the rows are contiguous, so `chunks` hands them out one by one
    let product: Vec<f64> = matrix.chunks(n).map(dot).collect();
    black_box(product);
}

// Matrix-vector product `y = A x`, written with explicit indexes.
//
// Reference implementation for the iterator-based variant: same operands
// (`2^size` ones for the vector, `2^size * 2^size` ones for the flat matrix),
// but every element is reached through its index.
fn f1_b(size: u32) {
    let n = 2_usize.pow(size);
    let vector: Vec<f64> = vec![1.0; n];
    let matrix: Vec<f64> = vec![1.0; n * n];
    let mut product: Vec<f64> = vec![0.0; n];
    for i in 0..n {
        // position of the first element of row `i` in the flat storage
        let row_start = i * n;
        for j in 0..n {
            // Unchecked accesses keep the comparison with the iterator
            // variant "fair", as iterators bypass the bounds checks.
            // SAFETY: `i < n` and `j < n`, hence `row_start + j < n * n`;
            // `product` and `vector` hold `n` elements, `matrix` holds `n * n`.
            unsafe {
                *product.get_unchecked_mut(i) +=
                    matrix.get_unchecked(row_start + j) * vector.get_unchecked(j);
            }
        }
    }
    black_box(product);
}

// Vector-matrix product `y = x A`, written with explicit indexes.
//
// The flat matrix is addressed as `row * n + col` (the "row-first" layout,
// i.e. 2D LayoutRight), so the elements of one column are not contiguous and
// cannot be handed out by slice iterators: the traversal goes back to indexes.
fn f2(size: u32) {
    let n = 2_usize.pow(size);
    let vector: Vec<f64> = vec![1.0; n];
    let matrix: Vec<f64> = vec![1.0; n * n];
    let mut product: Vec<f64> = vec![0.0; n];
    // outer loop over the columns of the matrix, inner loop over its rows
    for j in 0..n {
        for i in 0..n {
            // Unchecked accesses keep the comparison with the iterator
            // variants "fair", as iterators bypass the bounds checks.
            // SAFETY: `i < n` and `j < n`, hence `i * n + j < n * n`;
            // `product` and `vector` hold `n` elements, `matrix` holds `n * n`.
            unsafe {
                *product.get_unchecked_mut(j) +=
                    vector.get_unchecked(i) * matrix.get_unchecked(i * n + j);
            }
        }
    }
    black_box(product);
}

// Vector-matrix product `y = x A` with a layout adapted to the operation.
//
// Each contiguous chunk of `2^size` elements of the flat matrix is read as one
// *column* (column-first storage, i.e. 2D LayoutLeft). The iterators can
// therefore be used again, and the code is essentially the same as the
// matrix-vector product.
fn f3(size: u32) {
    let n = 2_usize.pow(size);
    let vector: Vec<f64> = vec![1.0; n];
    let matrix: Vec<f64> = vec![1.0; n * n];
    let mut product: Vec<f64> = Vec::with_capacity(n);
    for column in matrix.chunks(n) {
        // dot product of the vector with the current column
        let entry: f64 = vector
            .iter()
            .zip(column.iter())
            .map(|(x_i, c_i)| x_i * c_i)
            .sum();
        product.push(entry);
    }
    black_box(product);
}

pub fn criterion_benchmark(c: &mut Criterion) {
// Generate/Define the input
let data_size: u32 = 11; // 2048 length vector, 2048*2048 matrix

let mut group = c.benchmark_group("gemv");
group.bench_with_input(
BenchmarkId::new("Matrix-Vector Product (iterators)", ""),
&data_size,
|b, &n| b.iter(|| f1(n)),
);
group.bench_with_input(
BenchmarkId::new("Matrix-Vector Product (indexes)", ""),
&data_size,
|b, &n| b.iter(|| f1_b(n)),
);
group.bench_with_input(
BenchmarkId::new("Vector-Matrix Product (indexes)", ""),
&data_size,
|b, &n| b.iter(|| f2(n)),
);
group.bench_with_input(
BenchmarkId::new("Vector-Matrix Product w/ adapted layout (iterators)", ""),
&data_size,
|b, &n| b.iter(|| f3(n)),
);
group.finish();
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
150 changes: 150 additions & 0 deletions benches/view_access.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use poc_kokkos_rs::view::{parameters::Layout, ViewOwned};
use rand::prelude::*;

// This bench is used to evaluate the cost of accessing views' data.
// Each benched function performs `length` accesses (2^11 = 2048 with the
// current DATA_SIZE, i.e. on the order of 10^3).

// 1D access through a plain `Vec`: reads `length` elements of a zeroed buffer
// at the positions given by the first `length` entries of `indices`.
//
// Panics if `indices` holds fewer than `length` entries, or if one of the
// entries used is not a valid position in the buffer.
fn f1(length: usize, indices: &[usize]) {
    let data: Vec<f64> = vec![0.0; length];

    for &pos in &indices[0..length] {
        // regular (checked) indexing; the value read is handed to the black box
        let value = data[pos];
        black_box(value);
    }
}

// 1D access through a view: same access pattern as the plain `Vec` variant,
// but the zeroed buffer is wrapped in a `ViewOwned` of dimension `[length]`
// built with `Layout::Right`, and elements are read with the view's `[[i]]`
// indexing.
//
// Panics if `indices` holds fewer than `length` entries.
fn f1_b(length: usize, indices: &[usize]) {
    let view: ViewOwned<'_, 1, f64> =
        ViewOwned::new_from_data(vec![0.0; length], Layout::Right, [length]);

    for &pos in &indices[0..length] {
        // the value read is handed to the black box
        let value = view[[pos]];
        black_box(value);
    }
}

// 2D access through a plain `Vec`: reads `length` elements of a zeroed, flat
// `length * length` buffer, element `(i, j)` being located at `i * length + j`.
//
// Panics if `indices` holds fewer than `length` entries.
// Every `(i, j)` among the first `length` entries must satisfy `i < length`
// and `j < length`: the accesses are unchecked, so any other value is
// undefined behavior.
fn f2(length: usize, indices: &[(usize, usize)]) {
    let y: Vec<f64> = vec![0.0; length * length];
    let idx = &indices[0..length];

    idx.iter().for_each(|&(i, j)| {
        // The element is copied out (`*`) so that the black box receives the
        // value itself, like the other access routines of this bench do.
        // Passing only the reference does not by itself force the element to
        // be loaded, which would make this case measure something different.
        // SAFETY: per the contract above `i < length` and `j < length`, hence
        // `i * length + j < length * length`, the number of elements in `y`.
        let tmp = *unsafe { y.get_unchecked(i * length + j) };
        black_box(tmp);
    });
}

// 2D access through a view: same access pattern as the plain `Vec` variant,
// but the zeroed buffer is wrapped in a `ViewOwned` of dimensions
// `[length, length]` built with `Layout::Right`, and elements are read with
// the view's `[[i, j]]` indexing.
//
// Panics if `indices` holds fewer than `length` entries.
fn f2_b(length: usize, indices: &[(usize, usize)]) {
    let view: ViewOwned<'_, 2, f64> =
        ViewOwned::new_from_data(vec![0.0; length * length], Layout::Right, [length, length]);

    for &(row, col) in &indices[0..length] {
        // the value read is handed to the black box
        let value = view[[row, col]];
        black_box(value);
    }
}

// 3D access through a plain `Vec`: reads `length` elements of a zeroed, flat
// `length * length * length` buffer, element `(i, j, k)` being located at
// `i * length * length + j * length + k`.
//
// Panics if `indices` holds fewer than `length` entries.
// Every `(i, j, k)` among the first `length` entries must have all three
// components below `length`: the accesses are unchecked.
fn f3(length: usize, indices: &[(usize, usize, usize)]) {
    let data: Vec<f64> = vec![0.0; length * length * length];

    for &(i, j, k) in &indices[0..length] {
        // WARNING: keep the dereference `*` below.
        // For some reason, the access gets optimized away when it is not
        // dereferenced. This can be verified by running the benchmark twice:
        // - once with the black box, without the deref operator `*`
        // - once without the black box, with the deref operator `*`
        // Both runs yield the same result. The black box is supposed to
        // prevent this; it works in the 2D case, but not here.
        //
        // SAFETY: relies on the contract above; with `i`, `j`, `k` all below
        // `length`, the computed position is below `length^3`, the number of
        // elements in `data`.
        let value = *unsafe { data.get_unchecked(i * length * length + j * length + k) };
        black_box(value);
    }
}

// 3D access through a view: same access pattern as the plain `Vec` variant,
// but the zeroed buffer is wrapped in a `ViewOwned` of dimensions
// `[length, length, length]` built with `Layout::Right`, and elements are read
// with the view's `[[i, j, k]]` indexing.
//
// Panics if `indices` holds fewer than `length` entries.
fn f3_b(length: usize, indices: &[(usize, usize, usize)]) {
    let dims = [length, length, length];
    let view: ViewOwned<'_, 3, f64> =
        ViewOwned::new_from_data(vec![0.0; length * length * length], Layout::Right, dims);

    for &(i, j, k) in &indices[0..length] {
        // the value read is handed to the black box
        let value = view[[i, j, k]];
        black_box(value);
    }
}

// Registers the access benchmarks: one criterion group per rank (1D, 2D, 3D),
// each comparing accesses through a plain `Vec` ("Vector Access") with
// accesses through a view ("View Access") over the same index list.
pub fn criterion_benchmark(c: &mut Criterion) {
    // Generate/Define the input
    // size exponent: 2^11 = 2048 elements per dimension
    // (2048 length vector, 2048*2048 matrix, 2048*2048*2048 3D array)
    //
    // NOTE(review): with this value the 3D routines each request a buffer of
    // 2048^3 `f64` (64 GiB) on every call; confirm this is intended.
    const DATA_SIZE: u32 = 11;
    let length = 2_usize.pow(DATA_SIZE);
    let mut rng = SmallRng::from_entropy();
    // three index lists, one per dimension; each holds `length` distinct
    // values sampled from `0..length`
    let indices1: Vec<usize> = rand::seq::index::sample(&mut rng, length, length).into_vec();
    let indices1b: Vec<usize> = rand::seq::index::sample(&mut rng, length, length).into_vec();
    let indices1bb: Vec<usize> = rand::seq::index::sample(&mut rng, length, length).into_vec();

    // (i, j) pairs for the 2D cases
    let indices2: Vec<(usize, usize)> = indices1
        .iter()
        .copied()
        .zip(indices1b.iter().copied())
        .collect();

    // (i, j, k) triplets for the 3D cases
    let indices3: Vec<(usize, usize, usize)> = indices1
        .iter()
        .copied()
        .zip(indices1b.iter().copied())
        .zip(indices1bb.iter().copied())
        .map(|((i1, i2), i3)| (i1, i2, i3))
        .collect();

    // The index lists are only read by the benched routines, so they are
    // passed as borrowed slices instead of being cloned for each benchmark.
    let mut group1 = c.benchmark_group("1D access");
    group1.bench_with_input(
        BenchmarkId::new("Vector Access", ""),
        &(length, indices1.as_slice()),
        |b, (n, i)| b.iter(|| f1(*n, i)),
    );
    group1.bench_with_input(
        BenchmarkId::new("View Access", ""),
        &(length, indices1.as_slice()),
        |b, (n, i)| b.iter(|| f1_b(*n, i)),
    );
    group1.finish();

    let mut group2 = c.benchmark_group("2D access");
    group2.bench_with_input(
        BenchmarkId::new("Vector Access", ""),
        &(length, indices2.as_slice()),
        |b, (n, i)| b.iter(|| f2(*n, i)),
    );
    group2.bench_with_input(
        BenchmarkId::new("View Access", ""),
        &(length, indices2.as_slice()),
        |b, (n, i)| b.iter(|| f2_b(*n, i)),
    );
    group2.finish();

    let mut group3 = c.benchmark_group("3D access");
    group3.bench_with_input(
        BenchmarkId::new("Vector Access", ""),
        &(length, indices3.as_slice()),
        |b, (n, i)| b.iter(|| f3(*n, i)),
    );
    group3.bench_with_input(
        BenchmarkId::new("View Access", ""),
        &(length, indices3.as_slice()),
        |b, (n, i)| b.iter(|| f3_b(*n, i)),
    );
    group3.finish();
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
Loading

0 comments on commit 4bc33b2

Please sign in to comment.