Dev dispatch (#6)

* modified functor traits to handle backend specific closures should upcasting ever be stabilized, this version may compile * Serial code compiles again kernels are written in a Box, using an enum as argument this comes at a ~20% perf cost according to the mdrange_populate bench * rayon feature compiles need to write a bench with a simple parallel for to make sure the feature works * moved imports inside test functions to fix warnings these warnings are technically erroneous since the auto fix creates errors * added feature bench to check backend usage there is indeed a 2* speedup when using `--features rayon` * docs * docs again * update readme * new PartialEq impl for view data * added a raw_val method to views use for test assertions * moved flat index computation to an inline function * added feature-specific writing interface to views * update mirroring function * finished universal writing interface * updated feature bench; automatic kernel typing now works! * small doc update * modified layout bench to operate over a range of data sizes perf significantly worsens past the size of my L1 caches * new ForKernel type for std::thread usage * better kernel type definition * added cfg_if in dispatch code * added basic code for thread backend; currently doesn't work * indices dispatch ok; need to figure out how to share the kernel * kernel dispatch ok; need to fix lifetime issues * no more static lifetime using std threads; kernel dispatch using raw pointers produces sigsegv :) * small fixes * cloning kernel works with Box<impl ...>; impl trait in type aliases has yet to be stabilized, might switch to nightly * gated gpu impl behind a specific feature * removed DEPTH const generics from parallel_for method it's just bloat at this point * parallel_for works with all features! * feature docs
imrn99 · Nov 17, 2023 · 7544eb1 · 7544eb1
1 parent 11578a8
commit 7544eb1
Show file tree

Hide file tree

Showing 14 changed files with 746 additions and 238 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,20 +6,23 @@ edition = "2021"
 # FEATURES 
 
 [features]
-serial = []
-threads = []
-rayon = ["dep:rayon"]
+threads = ["dep:atomic", "dep:num_cpus"]
+rayon =   ["dep:atomic", "dep:num_cpus", "dep:rayon"]
+gpu =     ["dep:atomic"]
 
 # DEPENDENCIES
 
 [dependencies]
-cxx = "*"
-cfg-if = "*"
-rayon = {version = "*", optional=true}
+cxx      = "*"
+cfg-if   = "*"
+rayon    = {version = "*", optional=true}
+atomic   = {version = "0.5.3", optional=true}
+num_cpus = {version = "1.0", optional=true}
+#bytemuck = {version = "*", optional=true} # needed for atomic >= 0.6.0
 
 [dev-dependencies]
 criterion = { version = "*", features = ["html_reports"] }
-rand = { version = "*", features = ["small_rng", "alloc"] }
+rand      = { version = "*", features = ["small_rng", "alloc"] }
 
 [build-dependencies]
 cxx-build = "*"
@@ -41,3 +44,7 @@ harness = false
 [[bench]]
 name = "mdrange_populate"
 harness = false
+
+[[bench]]
+name = "feature"
+harness = false
diff --git a/README.md b/README.md
@@ -9,12 +9,16 @@ proof and verification of that statement.
 
 ## Scope of the Project
 
-The main focus of this Proof-of-Concept is the architecture and approach used by
+~~The main focus of this Proof-of-Concept is the architecture and approach used by
 Kokkos for data management. While multiple targets support (Serial, [rayon][2], OpenMP)
-could be interesting, it is not the priority. 
+could be interesting, it is not the priority.~~
+
+With a rudimentary data structure implementation done, the goal is now to write a simple
+program using a `parallel_for` statement with satisfactory portability as defined by Kokkos.
 
 Additionally, some features of Kokkos are not reproducible in Rust (GPU targeting, 
 templating); these create limits for the implementation that may or may not be bypassed.
+This makes limit-testing a fundamental part of the project.
 
 
 ## Quickstart
@@ -47,6 +51,7 @@ cargo doc --open --no-deps
 
 - `ndarray` Rust implementation: [link][NDARRAY]
 - Const generics documentation from The Rust Reference: [link][CONSTG]
+- `move` keyword semantic & implementation: [link][MOVE]
 
 
 ### Functor Implementation
@@ -60,4 +65,5 @@ cargo doc --open --no-deps
 
 [NDARRAY]: https://docs.rs/ndarray/latest/ndarray/
 [CONSTG]: https://doc.rust-lang.org/reference/items/generics.html
-[FNIMPL]: https://github.com/rust-lang/rust/issues/29625#issuecomment-1692602873
+[FNIMPL]: https://github.com/rust-lang/rust/issues/29625#issuecomment-1692602873
+[MOVE]: https://stackoverflow.com/questions/30288782/what-are-move-semantics-in-rust
diff --git a/benches/feature.rs b/benches/feature.rs
@@ -0,0 +1,52 @@
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use poc_kokkos_rs::{
+    functor::KernelArgs,
+    routines::{
+        parallel_for,
+        parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule},
+    },
+    view::{parameters::Layout, ViewOwned},
+};
+
+// this bench is used to assess whether the parallel_for routine
+// switches backend according to the enabled feature. It should be executed
+// multiple times by the user, each time with a different feature
+
+// 1D parallel_for init & populating
+fn f1(length: usize) {
+    let mut v_y = ViewOwned::new_from_data(vec![0.0; length], Layout::Right, [length]);
+    black_box(&mut v_y); // prevents the first init from being optimized away
+    let execp = ExecutionPolicy {
+        space: ExecutionSpace::DeviceCPU,
+        range: RangePolicy::RangePolicy(0..length),
+        schedule: Schedule::Static,
+    };
+
+    let kernel = |arg: KernelArgs<1>| match arg {
+        KernelArgs::Index1D(i) => {
+            v_y.set([i], 1.0);
+            black_box(&v_y[[i]]);
+        }
+        KernelArgs::IndexND(_) => unimplemented!(),
+        KernelArgs::Handle => unimplemented!(),
+    };
+    parallel_for(execp, kernel).unwrap();
+    black_box(&v_y);
+}
+
+pub fn criterion_benchmark(c: &mut Criterion) {
+    // Input size: `f1` is run on a 1D view of 2^DATA_SIZE elements
+    const DATA_SIZE: u32 = 20;
+    let length = 2_usize.pow(DATA_SIZE);
+
+    let mut group = c.benchmark_group("parallel_for");
+    group.bench_with_input(
+        BenchmarkId::new("feature-specific time", ""),
+        &(length),
+        |b, n| b.iter(|| f1(*n)),
+    );
+    group.finish()
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/benches/layout.rs b/benches/layout.rs
@@ -1,4 +1,4 @@
-use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
 
 // Currently a partial gemv showing the importance of layout
 // y = Ax / y = xA
@@ -82,31 +82,41 @@ fn f3(size: u32) {
 }
 
 pub fn criterion_benchmark(c: &mut Criterion) {
-    // Generate/Define the input
-    let data_size: u32 = 11; // 2048 length vector, 2048*2048 matrix
+    // 2^6..2^11 length vector, (2^6..2^11)*(2^6..2^11) square matrix
 
-    let mut group = c.benchmark_group("Layout Effect");
-    group.bench_with_input(
-        BenchmarkId::new("Matrix-Vector Product (iterators)", ""),
-        &data_size,
-        |b, &n| b.iter(|| f1(n)),
-    );
-    group.bench_with_input(
-        BenchmarkId::new("Matrix-Vector Product (indexes)", ""),
-        &data_size,
-        |b, &n| b.iter(|| f1_b(n)),
-    );
-    group.bench_with_input(
-        BenchmarkId::new("Vector-Matrix Product (indexes)", ""),
-        &data_size,
-        |b, &n| b.iter(|| f2(n)),
-    );
-    group.bench_with_input(
-        BenchmarkId::new("Vector-Matrix Product w/ adapted layout (iterators)", ""),
-        &data_size,
-        |b, &n| b.iter(|| f3(n)),
-    );
-    group.finish();
+    let mut group1 = c.benchmark_group("Matrix-Vector Product");
+    for data_size in 6..12 {
+        // f64 uses 8 bytes; we consider the input to be the length of the data along one dimension
+        group1.throughput(Throughput::Bytes((8 * 2_usize.pow(data_size)).pow(2) as u64));
+        group1.bench_with_input(
+            BenchmarkId::new("using iterators", ""),
+            &data_size,
+            |b, &n| b.iter(|| f1(n)),
+        );
+        group1.bench_with_input(
+            BenchmarkId::new("using indices", ""),
+            &data_size,
+            |b, &n| b.iter(|| f1_b(n)),
+        );
+    }
+    group1.finish();
+
+    let mut group2 = c.benchmark_group("Vector-Matrix Product");
+    for data_size in 6..12 {
+        group2.throughput(Throughput::Bytes((8 * 2_usize.pow(data_size)).pow(2) as u64));
+        group2.bench_with_input(
+            BenchmarkId::new("using regular layout & indices", ""),
+            &data_size,
+            |b, &n| b.iter(|| f2(n)),
+        );
+        group2.bench_with_input(
+            BenchmarkId::new("using adapted layout & iterators", ""),
+            &data_size,
+            |b, &n| b.iter(|| f3(n)),
+        );
+    }
+
+    group2.finish();
 }
 
 criterion_group!(benches, criterion_benchmark);

diff --git a/benches/mdrange_populate.rs b/benches/mdrange_populate.rs
@@ -1,7 +1,8 @@
 use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
 use poc_kokkos_rs::{
+    functor::KernelArgs,
     routines::{
-        parallel_for,
+        dispatch::serial,
         parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule},
     },
     view::{parameters::Layout, ViewOwned},
@@ -15,12 +16,11 @@ use poc_kokkos_rs::{
 fn f1(length: usize) {
     let mut v_y = ViewOwned::new_from_data(vec![0.0; length], Layout::Right, [length]);
     black_box(&mut v_y); // prevents the first init to be optimized away
-    (0..500).for_each(|_| {
-        (0..length).for_each(|i| {
-            v_y[[i]] = 1.0;
-        });
-        black_box(&v_y);
-    })
+
+    (0..length).for_each(|i| {
+        v_y[[i]] = 1.0;
+    });
+    black_box(&v_y);
 }
 
 // 1D parallel_for (serial) init & populating
@@ -34,11 +34,14 @@ fn f1_b(length: usize) {
         schedule: Schedule::Static,
     };
 
-    (0..500).for_each(|_| {
-        let execp_loc = execp.clone();
-        parallel_for::<0, 1, _>(execp_loc, |[i]| v_y[[i]] = 1.0).unwrap();
-        black_box(&v_y);
-    })
+    let kernel = Box::new(|arg: KernelArgs<1>| match arg {
+        KernelArgs::Index1D(_) => unimplemented!(),
+        KernelArgs::IndexND(indices) => v_y[indices] = 1.0,
+        KernelArgs::Handle => unimplemented!(),
+    });
+
+    serial(execp, kernel).unwrap();
+    black_box(&v_y);
 }
 
 // 3D regular for init & populating
@@ -74,7 +77,13 @@ fn f2_b(length: usize) {
         schedule: Schedule::Static,
     };
 
-    parallel_for::<0, 3, _>(execp, |indices| v_y[indices] = 1.0).unwrap();
+    let kernel = Box::new(|arg: KernelArgs<3>| match arg {
+        KernelArgs::Index1D(_) => unimplemented!(),
+        KernelArgs::IndexND(indices) => v_y[indices] = 1.0,
+        KernelArgs::Handle => unimplemented!(),
+    });
+
+    serial(execp, kernel).unwrap();
     black_box(&v_y);
 }
 
@@ -115,7 +124,13 @@ fn f3_b(length: usize) {
         schedule: Schedule::Static,
     };
 
-    parallel_for::<0, 5, _>(execp, |indices| v_y[indices] = 1.0).unwrap();
+    let kernel = Box::new(|arg: KernelArgs<5>| match arg {
+        KernelArgs::Index1D(_) => unimplemented!(),
+        KernelArgs::IndexND(indices) => v_y[indices] = 1.0,
+        KernelArgs::Handle => unimplemented!(),
+    });
+
+    serial(execp, kernel).unwrap();
     black_box(&v_y);
 }
 

diff --git a/build.rs b/build.rs
@@ -26,7 +26,7 @@ fn main() {
     match env::consts::OS {
         "macos" => {
             println!("cargo:rustc-link-arg=-L/opt/homebrew/opt/libomp/lib");
-            println!("cargo:rustc-link-arg=-ld_classic");
+            //println!("cargo:rustc-link-arg=-ld_classic");
             println!("cargo:rustc-link-arg=-lomp");
         }
         "linux" => {

diff --git a/src/functor.rs b/src/functor.rs
@@ -1,7 +1,38 @@
+//! functor & kernel related code
 //!
-//!
-//!
-//!
+//! This module contains all functor and kernel related code. Its content
+//! is highly dependent on the features enabled since the traits that a
+//! kernel must satisfy change entirely depending on the backend used.
+
+/// Kernel argument types
+///
+/// Until some work is done to have a better solution[^sol1][^sol2], this will
+/// be an enum and kernels will be written in an idiomatic way.
+///
+/// [^sol1]: Current tracking issue for upcasting implementation: <https://github.com/rust-lang/rust/issues/65991>
+///
+/// [^sol2]: Current tracking issue to allow impl trait usage in type aliases: <https://github.com/rust-lang/rust/issues/63063>
+pub enum KernelArgs<const N: usize> {
+    /// Arguments of a one-dimensional kernel (e.g. a RangePolicy).
+    Index1D(usize),
+    /// Arguments of an `N`-dimensional kernel (e.g. a MDRangePolicy).
+    IndexND([usize; N]),
+    /// Arguments of a team-based kernel.
+    Handle,
+}
+
+cfg_if::cfg_if! {
+    if #[cfg(feature = "rayon")] {
+        /// `rayon`-specific kernel type: a boxed `Fn` closure that is `Send + Sync`.
+        pub type ForKernelType<'a, const N: usize> = Box<dyn Fn(KernelArgs<N>) + Send + Sync + 'a>;
+    } else if #[cfg(feature = "threads")] {
+        /// `threads`-feature kernel type: a boxed `Fn` closure that is `Send`.
+        pub type ForKernelType<'a, const N: usize> = Box<dyn Fn(KernelArgs<N>) + Send + 'a>;
+    } else {
+        /// Fallback kernel type: the serial kernel type, used when neither feature above is enabled.
+        pub type ForKernelType<'a, const N: usize> = SerialForKernelType<'a, N>;
+    }
+}
 
-/// Functor trait. User can implement its own functor by implementing this trait.
-pub trait Functor<Args, Output>: Fn(Args) -> Output {}
+/// Serial kernel type: a boxed `FnMut` closure taking a [`KernelArgs`] value.
+pub type SerialForKernelType<'a, const N: usize> = Box<dyn FnMut(KernelArgs<N>) + 'a>;