-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* gemm skeleton * completed gemm speedup > n_cores ???? * hardcoded gemm bench it seems the library creates much more overhead for serial execution than parallel * grouped blas speedup benches in a folder * fixed bench paths * update doc & readme * fixed warnings when testing using parallel features
- Loading branch information
Showing
8 changed files
with
415 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; | ||
use poc_kokkos_rs::{ | ||
functor::KernelArgs, | ||
routines::{ | ||
parallel_for, | ||
parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule}, | ||
}, | ||
view::{parameters::Layout, ViewOwned}, | ||
}; | ||
use rand::{ | ||
distributions::{Distribution, Uniform}, | ||
rngs::SmallRng, | ||
SeedableRng, | ||
}; | ||
|
||
// Serial GEMM | ||
fn f1( | ||
length: usize, | ||
aa_init: Vec<f64>, | ||
bb_init: Vec<f64>, | ||
cc_init: Vec<f64>, | ||
alpha: f64, | ||
beta: f64, | ||
) { | ||
let mut aa = ViewOwned::new_from_data(aa_init, Layout::Right, [length, length]); | ||
let mut bb = ViewOwned::new_from_data(bb_init, Layout::Left, [length, length]); // optimal layout since we iterate inside columns :) | ||
let mut cc = ViewOwned::new_from_data(cc_init, Layout::Right, [length, length]); | ||
black_box(&mut aa); | ||
black_box(&mut bb); | ||
black_box(&mut cc); | ||
|
||
let execp = ExecutionPolicy { | ||
space: ExecutionSpace::Serial, | ||
range: RangePolicy::RangePolicy(0..length), | ||
schedule: Schedule::Static, | ||
}; | ||
|
||
// C = alpha * A * B + beta * C | ||
let gemm_kernel = |arg: KernelArgs<1>| match arg { | ||
// lines | ||
KernelArgs::Index1D(i) => { | ||
// cols | ||
for j in 0..length { | ||
// b[j, k] because was init using a layout left | ||
let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([j, k])).sum(); | ||
let val: f64 = alpha * ab_ij + beta * cc.get([i, j]); | ||
cc.set([i, j], val); | ||
} | ||
} | ||
KernelArgs::IndexND(_) => unimplemented!(), | ||
KernelArgs::Handle => unimplemented!(), | ||
}; | ||
parallel_for(execp, gemm_kernel).unwrap(); | ||
black_box(&cc); | ||
} | ||
|
||
// DeviceCPU GEMM | ||
fn f2( | ||
length: usize, | ||
aa_init: Vec<f64>, | ||
bb_init: Vec<f64>, | ||
cc_init: Vec<f64>, | ||
alpha: f64, | ||
beta: f64, | ||
) { | ||
let mut aa = ViewOwned::new_from_data(aa_init, Layout::Right, [length, length]); | ||
let mut bb = ViewOwned::new_from_data(bb_init, Layout::Left, [length, length]); // optimal layout since we iterate inside columns :) | ||
let mut cc = ViewOwned::new_from_data(cc_init, Layout::Right, [length, length]); | ||
black_box(&mut aa); | ||
black_box(&mut bb); | ||
black_box(&mut cc); | ||
|
||
let execp = ExecutionPolicy { | ||
space: ExecutionSpace::DeviceCPU, | ||
range: RangePolicy::RangePolicy(0..length), | ||
schedule: Schedule::Static, | ||
}; | ||
|
||
// C = alpha * A * B + beta * C | ||
let gemm_kernel = |arg: KernelArgs<1>| match arg { | ||
// lines | ||
KernelArgs::Index1D(i) => { | ||
// cols | ||
for j in 0..length { | ||
// all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft | ||
let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum(); | ||
let val: f64 = alpha * ab_ij + beta * cc.get([i, j]); | ||
cc.set([i, j], val); | ||
} | ||
} | ||
KernelArgs::IndexND(_) => unimplemented!(), | ||
KernelArgs::Handle => unimplemented!(), | ||
}; | ||
parallel_for(execp, gemm_kernel).unwrap(); | ||
black_box(&cc); | ||
} | ||
|
||
pub fn criterion_benchmark(c: &mut Criterion) { | ||
// Generate/Define the input | ||
const DATA_SIZE: u32 = 10; | ||
let length = 2_usize.pow(DATA_SIZE); | ||
let seed: u64 = 9817498146784; | ||
let mut rng = SmallRng::seed_from_u64(seed); | ||
let range: Uniform<f64> = rand::distributions::Uniform::new(0.0, 100.0); | ||
let aa_init: Vec<f64> = (0..length * length) | ||
.map(|_| range.sample(&mut rng)) | ||
.collect(); | ||
let bb_init: Vec<f64> = (0..length * length) | ||
.map(|_| range.sample(&mut rng)) | ||
.collect(); | ||
let cc_init: Vec<f64> = (0..length * length) | ||
.map(|_| range.sample(&mut rng)) | ||
.collect(); | ||
let alpha: f64 = range.sample(&mut rng); | ||
let beta: f64 = range.sample(&mut rng); | ||
|
||
let mut group = c.benchmark_group("gemm"); | ||
group.bench_with_input( | ||
BenchmarkId::new("exec-serial", ""), | ||
&( | ||
length, | ||
aa_init.clone(), | ||
bb_init.clone(), | ||
cc_init.clone(), | ||
alpha, | ||
beta, | ||
), | ||
|b, (length, aa_init, bb_init, cc_init, alpha, beta)| { | ||
b.iter(|| { | ||
f1( | ||
*length, | ||
aa_init.clone(), | ||
bb_init.clone(), | ||
cc_init.clone(), | ||
*alpha, | ||
*beta, | ||
) | ||
}) | ||
}, | ||
); | ||
group.bench_with_input( | ||
BenchmarkId::new("exec-devicecpu", ""), | ||
&( | ||
length, | ||
aa_init.clone(), | ||
bb_init.clone(), | ||
cc_init.clone(), | ||
alpha, | ||
beta, | ||
), | ||
|b, (length, aa_init, bb_init, cc_init, alpha, beta)| { | ||
b.iter(|| { | ||
f2( | ||
*length, | ||
aa_init.clone(), | ||
bb_init.clone(), | ||
cc_init.clone(), | ||
*alpha, | ||
*beta, | ||
) | ||
}) | ||
}, | ||
); | ||
group.finish() | ||
} | ||
|
||
criterion_group!(benches, criterion_benchmark); | ||
criterion_main!(benches); |
File renamed without changes.
Oops, something went wrong.