From a66e417b113dd7f4ef23c67892634aace762f64e Mon Sep 17 00:00:00 2001 From: louisfd Date: Mon, 3 Feb 2025 13:08:37 -0500 Subject: [PATCH 01/14] wip --- .../src/runtime_tests/memcpy_async.rs | 119 ++++++++++++++++++ crates/cubecl-core/src/runtime_tests/mod.rs | 2 + 2 files changed, 121 insertions(+) create mode 100644 crates/cubecl-core/src/runtime_tests/memcpy_async.rs diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs new file mode 100644 index 000000000..3e3541082 --- /dev/null +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -0,0 +1,119 @@ +use crate::{self as cubecl, as_bytes, runtime_tests::memcpy_async, Feature}; +use cubecl::prelude::*; +use pipeline::Pipeline; + +#[cube] +fn tile_computation( + lhs: Slice>, + rhs: Slice>, + mut out: SliceMut>, +) { + for i in 0..lhs.len() { + out[i] = Line::cast_from(F::new(10.)) * lhs[i] + rhs[i]; + } +} + +#[cube] +fn memcpy_sync(source: Slice>, mut destination: SliceMut>) { + for i in 0..source.len() { + destination[i] = source[i]; + } +} + +#[cube(launch)] +fn computation( + lhs: &Tensor>, + rhs: &Tensor>, + output: &mut Tensor>, +) { + let unit_id = UNIT_POS_X; + let mut lhs_smem = SharedMemory::::new_lined(16u32, 1u32); + let mut rhs_smem = SharedMemory::::new_lined(16u32, 1u32); + + let pipeline = Pipeline::new(); + + // Load Lhs to SMEM + pipeline.memcpy_async( + lhs.slice(unit_id * 2u32, unit_id * 2u32 + 2u32), + lhs_smem.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32), + ); + + // Load Rhs to SMEM + pipeline.memcpy_async( + rhs.slice(unit_id * 2u32, unit_id * 2u32 + 2u32), + rhs_smem.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32), + ); + + // sync + sync_units(); + + // Perform matmul on SMEM + tile_computation( + lhs_smem.slice(unit_id * 2u32, unit_id * 2u32 + 2u32), + rhs_smem.slice(unit_id * 2u32, unit_id * 2u32 + 2u32), + output.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32), + ); +} + +pub fn test_memcpy( + client: ComputeClient, +) { + // if !client.properties().feature_enabled(Feature::Pipeline) { + // // We can't execute the test, skip. + // return; + // } + + let lhs = client + .create(as_bytes![F: 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.]); + let rhs = client + .create(as_bytes![F: 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.]); + let output = client.empty(16 * core::mem::size_of::()); + + unsafe { + computation::launch::( + &client, + CubeCount::Static(1, 1, 1), + CubeDim::new(8, 1, 1), + TensorArg::from_raw_parts::(&lhs, &[4, 1], &[4, 4], 1), + TensorArg::from_raw_parts::(&rhs, &[4, 1], &[4, 4], 1), + TensorArg::from_raw_parts::(&output, &[4, 1], &[4, 4], 1), + ) + }; + + let actual = client.read_one(output.binding()); + let actual = F::from_bytes(&actual); + let expected = [ + F::new(0.0), + F::new(11.0), + F::new(22.0), + F::new(33.0), + F::new(44.0), + F::new(55.0), + F::new(66.0), + F::new(77.0), + F::new(88.0), + F::new(99.0), + F::new(110.0), + F::new(121.0), + F::new(132.0), + F::new(143.0), + F::new(154.0), + F::new(165.0), + ]; + + assert_eq!(actual, expected); +} + +#[allow(missing_docs)] +#[macro_export] +macro_rules! testgen_memcpy_async { + () => { + use super::*; + + #[test] + fn test_memcpy_async() { + let client = TestRuntime::client(&Default::default()); + cubecl_core::runtime_tests::memcpy_async::test_memcpy::(client); + } + }; +} diff --git a/crates/cubecl-core/src/runtime_tests/mod.rs b/crates/cubecl-core/src/runtime_tests/mod.rs index 2f37056dc..033a69fb2 100644 --- a/crates/cubecl-core/src/runtime_tests/mod.rs +++ b/crates/cubecl-core/src/runtime_tests/mod.rs @@ -10,6 +10,7 @@ pub mod different_rank; pub mod index; pub mod launch; pub mod line; +pub mod memcpy_async; pub mod metadata; pub mod pipeline; pub mod plane; @@ -83,6 +84,7 @@ macro_rules! testgen_float { cubecl_core::testgen_launch!(); cubecl_core::testgen_line!(); cubecl_core::testgen_pipeline!(); + cubecl_core::testgen_memcpy_async!(); cubecl_core::testgen_plane!(); cubecl_core::testgen_sequence!(); cubecl_core::testgen_slice!(); From 3ccb9de75568cb89c522f1fd9d27fd382659dd3d Mon Sep 17 00:00:00 2001 From: louisfd Date: Mon, 3 Feb 2025 15:42:55 -0500 Subject: [PATCH 02/14] wip --- .../src/runtime_tests/memcpy_async.rs | 73 +++++++++---------- 1 file changed, 33 insertions(+), 40 deletions(-) diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index 3e3541082..ce0f8cc3d 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -8,8 +8,13 @@ fn tile_computation( rhs: Slice>, mut out: SliceMut>, ) { - for i in 0..lhs.len() { - out[i] = Line::cast_from(F::new(10.)) * lhs[i] + rhs[i]; + if UNIT_POS_X == 0 { + for i in 0..2 { + out[i] = lhs[i]; + } + for i in 0..2 { + out[i + 2] = rhs[i]; + } } } @@ -26,33 +31,33 @@ fn computation( rhs: &Tensor>, output: &mut Tensor>, ) { - let unit_id = UNIT_POS_X; - let mut lhs_smem = SharedMemory::::new_lined(16u32, 1u32); - let mut rhs_smem = SharedMemory::::new_lined(16u32, 1u32); + let mut lhs_smem = SharedMemory::::new_lined(2u32, 1u32); + let mut rhs_smem = SharedMemory::::new_lined(2u32, 1u32); let pipeline = Pipeline::new(); - // Load Lhs to SMEM - pipeline.memcpy_async( - lhs.slice(unit_id * 2u32, unit_id * 2u32 + 2u32), - lhs_smem.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32), - ); + pipeline.producer_acquire(); - // Load Rhs to SMEM - pipeline.memcpy_async( - rhs.slice(unit_id * 2u32, unit_id * 2u32 + 2u32), - rhs_smem.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32), - ); + let start = 0u32; + let end = 2u32; + if UNIT_POS_X == 0 { + pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end)); + } else { + // pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); + } + + pipeline.producer_commit(); - // sync sync_units(); + pipeline.consumer_wait(); // Perform matmul on SMEM tile_computation( - lhs_smem.slice(unit_id * 2u32, unit_id * 2u32 + 2u32), - rhs_smem.slice(unit_id * 2u32, unit_id * 2u32 + 2u32), - output.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32), + lhs_smem.slice(start, end), + rhs_smem.slice(start, end), + output.slice_mut(start, end), ); + pipeline.consumer_release(); } pub fn test_memcpy( @@ -63,17 +68,15 @@ pub fn test_memcpy( // return; // } - let lhs = client - .create(as_bytes![F: 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.]); - let rhs = client - .create(as_bytes![F: 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.]); - let output = client.empty(16 * core::mem::size_of::()); + let lhs = client.create(as_bytes![F: 10., 11., 12., 13., 14., 15.]); + let rhs = client.create(as_bytes![F: 10., 11., 12., 13.]); + let output = client.empty(4 * core::mem::size_of::()); unsafe { computation::launch::( &client, CubeCount::Static(1, 1, 1), - CubeDim::new(8, 1, 1), + CubeDim::new(2, 1, 1), TensorArg::from_raw_parts::(&lhs, &[4, 1], &[4, 4], 1), TensorArg::from_raw_parts::(&rhs, &[4, 1], &[4, 4], 1), TensorArg::from_raw_parts::(&output, &[4, 1], &[4, 4], 1), @@ -83,22 +86,12 @@ pub fn test_memcpy( let actual = client.read_one(output.binding()); let actual = F::from_bytes(&actual); let expected = [ - F::new(0.0), + F::new(10.0), F::new(11.0), - F::new(22.0), - F::new(33.0), - F::new(44.0), - F::new(55.0), - F::new(66.0), - F::new(77.0), - F::new(88.0), - F::new(99.0), - F::new(110.0), - F::new(121.0), - F::new(132.0), - F::new(143.0), - F::new(154.0), - F::new(165.0), + F::new(12.0), + F::new(13.0), + F::new(14.0), + F::new(15.0), ]; assert_eq!(actual, expected); From 4359ac5baf25a5c0dda2b6fdae331d2981c990c5 Mon Sep 17 00:00:00 2001 From: louisfd Date: Mon, 3 Feb 2025 15:57:21 -0500 Subject: [PATCH 03/14] choosable pipeline steps --- crates/cubecl-core/src/frontend/pipeline.rs | 8 ++++---- crates/cubecl-core/src/runtime_tests/memcpy_async.rs | 2 +- crates/cubecl-core/src/runtime_tests/pipeline.rs | 4 ++-- crates/cubecl-cpp/src/shared/base.rs | 11 +++++++++-- crates/cubecl-cpp/src/shared/pipeline.rs | 8 ++++++-- crates/cubecl-ir/src/allocator.rs | 11 +++++++++-- crates/cubecl-ir/src/scope.rs | 4 ++-- crates/cubecl-ir/src/variable.rs | 2 +- 8 files changed, 34 insertions(+), 16 deletions(-) diff --git a/crates/cubecl-core/src/frontend/pipeline.rs b/crates/cubecl-core/src/frontend/pipeline.rs index 15bf05d9a..f46aa37a5 100644 --- a/crates/cubecl-core/src/frontend/pipeline.rs +++ b/crates/cubecl-core/src/frontend/pipeline.rs @@ -93,13 +93,13 @@ pub struct PipelineExpand { impl Default for Pipeline { fn default() -> Self { - Self::new() + Self::new(1) } } impl Pipeline { /// Create a pipeline instance - pub fn new() -> Self { + pub fn new(num_steps: u32) -> Self { Self { _c: PhantomData } } @@ -133,9 +133,9 @@ impl Pipeline { unexpanded!() } - pub fn __expand_new(scope: &mut Scope) -> PipelineExpand { + pub fn __expand_new(scope: &mut Scope, num_steps: u32) -> PipelineExpand { let elem = C::as_elem(scope); - let variable = scope.create_pipeline(Item::new(elem)); + let variable = scope.create_pipeline(Item::new(elem), num_steps as u8); PipelineExpand { elem: variable, _c: PhantomData, diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index ce0f8cc3d..209c5d0b6 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -34,7 +34,7 @@ fn computation( let mut lhs_smem = SharedMemory::::new_lined(2u32, 1u32); let mut rhs_smem = SharedMemory::::new_lined(2u32, 1u32); - let pipeline = Pipeline::new(); + let pipeline = Pipeline::new(1u32); pipeline.producer_acquire(); diff --git a/crates/cubecl-core/src/runtime_tests/pipeline.rs b/crates/cubecl-core/src/runtime_tests/pipeline.rs index 2c4b95500..a33ee9be1 100644 --- a/crates/cubecl-core/src/runtime_tests/pipeline.rs +++ b/crates/cubecl-core/src/runtime_tests/pipeline.rs @@ -12,7 +12,7 @@ fn pipelined_sum( let smem_size = 2 * batch_len; let num_batches = input.len() / batch_len; let mut shared_memory = SharedMemory::::new_lined(smem_size, input.line_size()); - let pipeline = Pipeline::new(); + let pipeline = Pipeline::new(2u32); let mut sum = Line::::empty(input.line_size()).fill(F::new(0.)); @@ -63,7 +63,7 @@ fn pipelined_sum( #[cube(launch)] pub fn async_copy_test(input: &Array>, output: &mut Array>) { - let pipeline = pipeline::Pipeline::::new(); + let pipeline = pipeline::Pipeline::::new(2u32); let mut smem = SharedMemory::::new_lined(1u32, 1u32); if UNIT_POS == 0 { diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs index ad3bf35e1..8746efb15 100644 --- a/crates/cubecl-cpp/src/shared/base.rs +++ b/crates/cubecl-cpp/src/shared/base.rs @@ -1043,14 +1043,21 @@ impl CppCompiler { frag: self.compile_matrix(mat), } } - gpu::VariableKind::Pipeline { id, item } => { + gpu::VariableKind::Pipeline { + id, + item, + num_steps, + } => { self.pipeline = true; let pipeline = Variable::Pipeline { id, item: self.compile_item(item), }; if !self.pipelines.iter().any(|s| s.pipeline_id() == id) { - self.pipelines.push(PipelineOps::Init { pipeline }); + self.pipelines.push(PipelineOps::Init { + pipeline, + num_steps, + }); } pipeline } diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs index 34e25e430..54bc2db8d 100644 --- a/crates/cubecl-cpp/src/shared/pipeline.rs +++ b/crates/cubecl-cpp/src/shared/pipeline.rs @@ -6,6 +6,7 @@ use super::{Component, Dialect, Variable}; pub enum PipelineOps { Init { pipeline: Variable, + num_steps: u8, }, MemCopyAsync { pipeline: Variable, @@ -53,11 +54,14 @@ impl Display for PipelineOps { cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline}); ") } - PipelineOps::Init { pipeline } => { + PipelineOps::Init { + pipeline, + num_steps, + } => { write!( f, " -__shared__ cuda::pipeline_shared_state {pipeline}_state; +__shared__ cuda::pipeline_shared_state {pipeline}_state; auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state); " ) diff --git a/crates/cubecl-ir/src/allocator.rs b/crates/cubecl-ir/src/allocator.rs index 90d6907e0..6d6173579 100644 --- a/crates/cubecl-ir/src/allocator.rs +++ b/crates/cubecl-ir/src/allocator.rs @@ -96,9 +96,16 @@ impl Allocator { ExpandElement::Plain(variable) } - pub fn create_pipeline(&self, item: Item) -> ExpandElement { + pub fn create_pipeline(&self, item: Item, num_steps: u8) -> ExpandElement { let id = self.new_local_index(); - let variable = Variable::new(VariableKind::Pipeline { id, item }, item); + let variable = Variable::new( + VariableKind::Pipeline { + id, + item, + num_steps, + }, + item, + ); ExpandElement::Plain(variable) } diff --git a/crates/cubecl-ir/src/scope.rs b/crates/cubecl-ir/src/scope.rs index a533c8fc9..876c2df34 100644 --- a/crates/cubecl-ir/src/scope.rs +++ b/crates/cubecl-ir/src/scope.rs @@ -110,8 +110,8 @@ impl Scope { } /// Create a new pipeline element. - pub fn create_pipeline(&mut self, item: Item) -> ExpandElement { - let pipeline = self.allocator.create_pipeline(item); + pub fn create_pipeline(&mut self, item: Item, num_steps: u8) -> ExpandElement { + let pipeline = self.allocator.create_pipeline(item, num_steps); self.add_pipeline(*pipeline); pipeline } diff --git a/crates/cubecl-ir/src/variable.rs b/crates/cubecl-ir/src/variable.rs index 36317aa95..530faa26e 100644 --- a/crates/cubecl-ir/src/variable.rs +++ b/crates/cubecl-ir/src/variable.rs @@ -59,7 +59,7 @@ pub enum VariableKind { Matrix { id: Id, mat: Matrix }, Slice { id: Id }, Builtin(Builtin), - Pipeline { id: Id, item: Item }, + Pipeline { id: Id, item: Item, num_steps: u8 }, } #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] From 1b3d940f45602131f588e18bf16cc4886aa70e33 Mon Sep 17 00:00:00 2001 From: louisfd Date: Tue, 4 Feb 2025 09:22:26 -0500 Subject: [PATCH 04/14] fix pipeline --- crates/cubecl-core/src/frontend/context.rs | 159 ++++++++++++++++++ crates/cubecl-core/src/frontend/pipeline.rs | 27 ++- .../src/runtime_tests/memcpy_async.rs | 114 +++++++++---- crates/cubecl-cpp/src/shared/base.rs | 4 +- crates/cubecl-cpp/src/shared/pipeline.rs | 8 +- crates/cubecl-ir/src/allocator.rs | 4 +- crates/cubecl-ir/src/scope.rs | 4 +- crates/cubecl-ir/src/variable.rs | 2 +- 8 files changed, 271 insertions(+), 51 deletions(-) create mode 100644 crates/cubecl-core/src/frontend/context.rs diff --git a/crates/cubecl-core/src/frontend/context.rs b/crates/cubecl-core/src/frontend/context.rs new file mode 100644 index 000000000..284e14712 --- /dev/null +++ b/crates/cubecl-core/src/frontend/context.rs @@ -0,0 +1,159 @@ +use crate::ir::Id; +use crate::ir::{self, Elem, Instruction, Item, Scope, Variable, VariableKind}; +use alloc::rc::Rc; +use core::cell::RefCell; +use cubecl_ir::ExpandElement; +use cubecl_runtime::debug::DebugLogger; +use std::any::TypeId; +use std::collections::HashMap; + +pub struct CubeContext { + pub root: Rc>, + pub scope: Rc>, + pub debug_enabled: bool, + pub typemap: Rc>>, +} + +impl Default for CubeContext { + fn default() -> Self { + Self::root() + } +} + +impl CubeContext { + /// Create a new cube context, with a root scope + /// A root scope is at the root of a compute shader + /// Therefore there is one cube context per shader + /// The allocator will define the strategy for creating local intermediates and mutable variables + pub fn root() -> CubeContext { + let root = Rc::new(RefCell::new(Scope::root())); + let typemap = Rc::new(RefCell::new(HashMap::new())); + let scope = root.clone(); + + Self { + scope, + root, + debug_enabled: DebugLogger::default().is_activated(), + typemap, + } + } + + pub fn register>(&mut self, op: O) { + self.scope.borrow_mut().register(op) + } + + /// Resolve the element type of the given generic type. + pub fn resolve_elem(&self) -> Option { + let map = self.typemap.borrow(); + let result = map.get(&TypeId::of::()); + + result.cloned() + } + + /// Register the element type for the given generic type. + pub fn register_elem(&mut self, elem: Elem) { + let mut map = self.typemap.borrow_mut(); + + map.insert(TypeId::of::(), elem); + } + + pub fn child(&mut self) -> CubeContext { + let scope = self.scope.borrow_mut().child(); + + Self { + scope: Rc::new(RefCell::new(scope)), + root: self.root.clone(), + debug_enabled: self.debug_enabled, + typemap: self.typemap.clone(), + } + } + + pub fn into_scope(self) -> Scope { + core::mem::drop(self.root); + + Rc::into_inner(self.scope) + .expect("Only one reference") + .into_inner() + } + + /// Create a new mutable local variable. + pub fn create_local_mut(&mut self, item: Item) -> ExpandElement { + let local = self.scope.borrow().allocator.create_local_mut(item); + self.scope.borrow_mut().add_local_mut(*local); + local + } + + /// Create a new immutable local variable. + pub fn create_local(&mut self, item: Item) -> ExpandElement { + self.scope.borrow().allocator.create_local(item) + } + + /// Create a new immutable local binding that must never be a reused variable, regardless of + /// allocator + pub fn create_local_restricted(&mut self, item: Item) -> ExpandElement { + self.scope.borrow().allocator.create_local_restricted(item) + } + + /// Create a new matrix element. + pub fn create_matrix(&mut self, matrix: ir::Matrix) -> ExpandElement { + let matrix = self.scope.borrow().allocator.create_matrix(matrix); + self.scope.borrow_mut().add_matrix(*matrix); + matrix + } + + /// Create a new pipeline element. + pub fn create_pipeline(&mut self, item: Item, num_stages: u8) -> ExpandElement { + let pipeline = self + .scope + .borrow() + .allocator + .create_pipeline(item, num_stages); + self.scope.borrow_mut().add_pipeline(*pipeline); + pipeline + } + + /// Create a new slice element. + pub fn create_slice(&mut self, item: Item) -> ExpandElement { + let slice = self.scope.borrow().allocator.create_slice(item); + self.scope.borrow_mut().add_slice(*slice); + slice + } + + pub fn create_shared(&mut self, item: Item, size: u32) -> ExpandElement { + ExpandElement::Plain(self.root.borrow_mut().create_shared(item, size)) + } + + pub fn create_local_array(&mut self, item: Item, size: u32) -> ExpandElement { + let local_array: ExpandElement = + self.root.borrow().allocator.create_local_array(item, size); + self.root.borrow_mut().add_local_array(*local_array); + local_array + } + + pub fn create_const_array(&mut self, item: Item, data: Vec) -> ExpandElement { + ExpandElement::Plain(self.root.borrow_mut().create_const_array(item, data)) + } + + /// Obtain the index-th input + pub fn input(&mut self, id: Id, item: Item) -> ExpandElement { + ExpandElement::Plain(crate::ir::Variable::new( + VariableKind::GlobalInputArray(id), + item, + )) + } + + /// Obtain the index-th output + pub fn output(&mut self, id: Id, item: Item) -> ExpandElement { + let var = crate::ir::Variable::new(VariableKind::GlobalOutputArray(id), item); + self.scope.borrow_mut().write_global_custom(var); + ExpandElement::Plain(var) + } + + /// Obtain the index-th scalar + pub fn scalar(&self, id: Id, elem: Elem) -> ExpandElement { + ExpandElement::Plain(crate::ir::Variable::new( + VariableKind::GlobalScalar(id), + Item::new(elem), + )) + } +} diff --git a/crates/cubecl-core/src/frontend/pipeline.rs b/crates/cubecl-core/src/frontend/pipeline.rs index f46aa37a5..847fa3248 100644 --- a/crates/cubecl-core/src/frontend/pipeline.rs +++ b/crates/cubecl-core/src/frontend/pipeline.rs @@ -76,14 +76,33 @@ use crate::{ unexpanded, }; -use super::{CubePrimitive, ExpandElementTyped, Line, Slice, SliceMut}; +use super::{ + CubePrimitive, CubeType, ExpandElementTyped, Init, IntoRuntime, Line, Slice, SliceMut, +}; /// A mechanism for managing a sequence of `memcpy_async` /// For now, it only works at the Cube scope +#[derive(Clone, Copy)] pub struct Pipeline { _c: PhantomData, } +impl IntoRuntime for Pipeline { + fn __expand_runtime_method(self, _scope: &mut Scope) -> Self::ExpandType { + panic!("Doesn't exist at runtime") + } +} + +impl CubeType for Pipeline { + type ExpandType = PipelineExpand; +} + +impl Init for PipelineExpand { + fn init(self, _scope: &mut Scope) -> Self { + self + } +} + #[derive(Clone)] /// Expand type of [Pipeline] pub struct PipelineExpand { @@ -99,7 +118,7 @@ impl Default for Pipeline { impl Pipeline { /// Create a pipeline instance - pub fn new(num_steps: u32) -> Self { + pub fn new(_num_stages: u32) -> Self { Self { _c: PhantomData } } @@ -133,9 +152,9 @@ impl Pipeline { unexpanded!() } - pub fn __expand_new(scope: &mut Scope, num_steps: u32) -> PipelineExpand { + pub fn __expand_new(scope: &mut Scope, num_stages: u32) -> PipelineExpand { let elem = C::as_elem(scope); - let variable = scope.create_pipeline(Item::new(elem), num_steps as u8); + let variable = scope.create_pipeline(Item::new(elem), num_stages as u8); PipelineExpand { elem: variable, _c: PhantomData, diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index 209c5d0b6..013c78063 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -3,19 +3,52 @@ use cubecl::prelude::*; use pipeline::Pipeline; #[cube] -fn tile_computation( +fn tile_computation_1( lhs: Slice>, rhs: Slice>, mut out: SliceMut>, ) { - if UNIT_POS_X == 0 { - for i in 0..2 { - out[i] = lhs[i]; - } - for i in 0..2 { - out[i + 2] = rhs[i]; - } + for i in 0..2 { + out[i] = Line::cast_from(10u32) * lhs[i]; + } + for i in 0..2 { + out[i] += rhs[i]; + } +} + +#[cube] +fn tile_computation_2( + lhs: &SharedMemory>, + rhs: &SharedMemory>, + out: &mut Tensor>, + start: u32, + end: u32, +) { + for i in start..end { + out[i] = Line::cast_from(10u32) * lhs[i]; + } + for i in start..end { + out[i] += rhs[i]; + } +} + +#[cube] +fn tile_computation_3( + lhs: &SharedMemory>, + rhs: &SharedMemory>, + out: &mut Tensor>, + start: u32, + end: u32, + pipeline: Pipeline, +) { + pipeline.consumer_wait(); + for i in start..end { + out[i] = Line::cast_from(10u32) * lhs[i]; + } + for i in start..end { + out[i] += rhs[i]; } + pipeline.consumer_release(); } #[cube] @@ -31,33 +64,47 @@ fn computation( rhs: &Tensor>, output: &mut Tensor>, ) { - let mut lhs_smem = SharedMemory::::new_lined(2u32, 1u32); - let mut rhs_smem = SharedMemory::::new_lined(2u32, 1u32); + let mut lhs_smem = SharedMemory::::new_lined(4u32, 1u32); + let mut rhs_smem = SharedMemory::::new_lined(4u32, 1u32); - let pipeline = Pipeline::new(1u32); + let pipeline = Pipeline::new(2u32); - pipeline.producer_acquire(); + let start = UNIT_POS_X * 2u32; + let end = start + 2u32; - let start = 0u32; - let end = 2u32; - if UNIT_POS_X == 0 { - pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end)); - } else { - // pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); - } + pipeline.producer_acquire(); + pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end)); + // memcpy_sync(lhs.slice(start, end), lhs_smem.slice_mut(start, end)); + pipeline.producer_commit(); + pipeline.producer_acquire(); + pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); + // memcpy_sync(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); pipeline.producer_commit(); - sync_units(); + // tile_computation_0: inline + // pipeline.consumer_wait(); + // for i in start..end { + // output[i] = Line::cast_from(10u32) * lhs[i]; + // } + // for i in start..end { + // output[i] += rhs[i]; + // } + // pipeline.consumer_release(); - pipeline.consumer_wait(); - // Perform matmul on SMEM - tile_computation( - lhs_smem.slice(start, end), - rhs_smem.slice(start, end), - output.slice_mut(start, end), - ); - pipeline.consumer_release(); + // pipeline.consumer_wait(); + // tile_computation_1( + // lhs_smem.slice(start, end), + // rhs_smem.slice(start, end), + // output.slice_mut(start, end), + // ); + // pipeline.consumer_release(); + + // pipeline.consumer_wait(); + // tile_computation_2(&lhs_smem, &rhs_smem, output, start, end); + // pipeline.consumer_release(); + + tile_computation_3(&lhs_smem, &rhs_smem, output, start, end, pipeline); } pub fn test_memcpy( @@ -68,7 +115,7 @@ pub fn test_memcpy( // return; // } - let lhs = client.create(as_bytes![F: 10., 11., 12., 13., 14., 15.]); + let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]); let rhs = client.create(as_bytes![F: 10., 11., 12., 13.]); let output = client.empty(4 * core::mem::size_of::()); @@ -85,14 +132,7 @@ pub fn test_memcpy( let actual = client.read_one(output.binding()); let actual = F::from_bytes(&actual); - let expected = [ - F::new(10.0), - F::new(11.0), - F::new(12.0), - F::new(13.0), - F::new(14.0), - F::new(15.0), - ]; + let expected = [F::new(110.0), F::new(121.0), F::new(132.0), F::new(143.0)]; assert_eq!(actual, expected); } diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs index 8746efb15..e583ddae8 100644 --- a/crates/cubecl-cpp/src/shared/base.rs +++ b/crates/cubecl-cpp/src/shared/base.rs @@ -1046,7 +1046,7 @@ impl CppCompiler { gpu::VariableKind::Pipeline { id, item, - num_steps, + num_stages, } => { self.pipeline = true; let pipeline = Variable::Pipeline { @@ -1056,7 +1056,7 @@ impl CppCompiler { if !self.pipelines.iter().any(|s| s.pipeline_id() == id) { self.pipelines.push(PipelineOps::Init { pipeline, - num_steps, + num_stages, }); } pipeline diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs index 54bc2db8d..3f7e81afa 100644 --- a/crates/cubecl-cpp/src/shared/pipeline.rs +++ b/crates/cubecl-cpp/src/shared/pipeline.rs @@ -6,7 +6,7 @@ use super::{Component, Dialect, Variable}; pub enum PipelineOps { Init { pipeline: Variable, - num_steps: u8, + num_stages: u8, }, MemCopyAsync { pipeline: Variable, @@ -56,12 +56,14 @@ cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {sour } PipelineOps::Init { pipeline, - num_steps, + num_stages, } => { + // cuda::thread_scope::thread_scope_block -> sync only within thread_block, no gmem sync + // cooperative_groups::this_thread_block() -> each thread in threadblock has its pipeline instance write!( f, " -__shared__ cuda::pipeline_shared_state {pipeline}_state; +__shared__ cuda::pipeline_shared_state {pipeline}_state; auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state); " ) diff --git a/crates/cubecl-ir/src/allocator.rs b/crates/cubecl-ir/src/allocator.rs index 6d6173579..6a4115d5b 100644 --- a/crates/cubecl-ir/src/allocator.rs +++ b/crates/cubecl-ir/src/allocator.rs @@ -96,13 +96,13 @@ impl Allocator { ExpandElement::Plain(variable) } - pub fn create_pipeline(&self, item: Item, num_steps: u8) -> ExpandElement { + pub fn create_pipeline(&self, item: Item, num_stages: u8) -> ExpandElement { let id = self.new_local_index(); let variable = Variable::new( VariableKind::Pipeline { id, item, - num_steps, + num_stages, }, item, ); diff --git a/crates/cubecl-ir/src/scope.rs b/crates/cubecl-ir/src/scope.rs index 876c2df34..a3f0e4540 100644 --- a/crates/cubecl-ir/src/scope.rs +++ b/crates/cubecl-ir/src/scope.rs @@ -110,8 +110,8 @@ impl Scope { } /// Create a new pipeline element. - pub fn create_pipeline(&mut self, item: Item, num_steps: u8) -> ExpandElement { - let pipeline = self.allocator.create_pipeline(item, num_steps); + pub fn create_pipeline(&mut self, item: Item, num_stages: u8) -> ExpandElement { + let pipeline = self.allocator.create_pipeline(item, num_stages); self.add_pipeline(*pipeline); pipeline } diff --git a/crates/cubecl-ir/src/variable.rs b/crates/cubecl-ir/src/variable.rs index 530faa26e..c1c3e850a 100644 --- a/crates/cubecl-ir/src/variable.rs +++ b/crates/cubecl-ir/src/variable.rs @@ -59,7 +59,7 @@ pub enum VariableKind { Matrix { id: Id, mat: Matrix }, Slice { id: Id }, Builtin(Builtin), - Pipeline { id: Id, item: Item, num_steps: u8 }, + Pipeline { id: Id, item: Item, num_stages: u8 }, } #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] From e1a4b5e24974fab720a9a1ae67db31ed18d6251d Mon Sep 17 00:00:00 2001 From: louisfd Date: Tue, 4 Feb 2025 11:11:15 -0500 Subject: [PATCH 05/14] wip --- .../src/runtime_tests/memcpy_async.rs | 32 +++++++++++-------- crates/cubecl-cpp/src/shared/pipeline.rs | 6 ++-- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index 013c78063..5518b50a6 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -61,11 +61,11 @@ fn memcpy_sync(source: Slice>, mut destination: SliceMut( lhs: &Tensor>, - rhs: &Tensor>, + // rhs: &Tensor>, output: &mut Tensor>, ) { let mut lhs_smem = SharedMemory::::new_lined(4u32, 1u32); - let mut rhs_smem = SharedMemory::::new_lined(4u32, 1u32); + // let mut rhs_smem = SharedMemory::::new_lined(4u32, 1u32); let pipeline = Pipeline::new(2u32); @@ -74,21 +74,24 @@ fn computation( pipeline.producer_acquire(); pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end)); - // memcpy_sync(lhs.slice(start, end), lhs_smem.slice_mut(start, end)); pipeline.producer_commit(); - pipeline.producer_acquire(); - pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); - // memcpy_sync(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); - pipeline.producer_commit(); + pipeline.consumer_wait(); + for i in start..end { + output[i] = lhs_smem[i]; + } + pipeline.consumer_release(); + + // pipeline.producer_acquire(); + // pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); + // // memcpy_sync(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); + // pipeline.producer_commit(); // tile_computation_0: inline + // pipeline.consumer_wait(); // for i in start..end { - // output[i] = Line::cast_from(10u32) * lhs[i]; - // } - // for i in start..end { - // output[i] += rhs[i]; + // output[i] += rhs_smem[i]; // } // pipeline.consumer_release(); @@ -104,7 +107,7 @@ fn computation( // tile_computation_2(&lhs_smem, &rhs_smem, output, start, end); // pipeline.consumer_release(); - tile_computation_3(&lhs_smem, &rhs_smem, output, start, end, pipeline); + // tile_computation_3(&lhs_smem, &rhs_smem, output, start, end, pipeline); } pub fn test_memcpy( @@ -123,9 +126,9 @@ pub fn test_memcpy( computation::launch::( &client, CubeCount::Static(1, 1, 1), - CubeDim::new(2, 1, 1), + CubeDim::new(1, 1, 1), TensorArg::from_raw_parts::(&lhs, &[4, 1], &[4, 4], 1), - TensorArg::from_raw_parts::(&rhs, &[4, 1], &[4, 4], 1), + // TensorArg::from_raw_parts::(&rhs, &[4, 1], &[4, 4], 1), TensorArg::from_raw_parts::(&output, &[4, 1], &[4, 4], 1), ) }; @@ -135,6 +138,7 @@ pub fn test_memcpy( let expected = [F::new(110.0), F::new(121.0), F::new(132.0), F::new(143.0)]; assert_eq!(actual, expected); + assert!(false); } #[allow(missing_docs)] diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs index 3f7e81afa..921efb224 100644 --- a/crates/cubecl-cpp/src/shared/pipeline.rs +++ b/crates/cubecl-cpp/src/shared/pipeline.rs @@ -51,7 +51,7 @@ impl Display for PipelineOps { let item = source.item(); let size = item.elem().size() * item.vectorization; write!(f, " -cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline}); +cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline}); ") } PipelineOps::Init { @@ -63,8 +63,8 @@ cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {sour write!( f, " -__shared__ cuda::pipeline_shared_state {pipeline}_state; -auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state); +__shared__ cuda::pipeline_shared_state {pipeline}_state; +auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state); " ) } From a12fbc692147f9725034debeb9c5ee05f8ea10e1 Mon Sep 17 00:00:00 2001 From: louisfd Date: Tue, 4 Feb 2025 11:13:51 -0500 Subject: [PATCH 06/14] cleanup example --- .../src/runtime_tests/memcpy_async.rs | 95 +------------------ 1 file changed, 3 insertions(+), 92 deletions(-) diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index 5518b50a6..911d6612d 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -1,56 +1,7 @@ -use crate::{self as cubecl, as_bytes, runtime_tests::memcpy_async, Feature}; +use crate::{self as cubecl, as_bytes}; use cubecl::prelude::*; use pipeline::Pipeline; -#[cube] -fn tile_computation_1( - lhs: Slice>, - rhs: Slice>, - mut out: SliceMut>, -) { - for i in 0..2 { - out[i] = Line::cast_from(10u32) * lhs[i]; - } - for i in 0..2 { - out[i] += rhs[i]; - } -} - -#[cube] -fn tile_computation_2( - lhs: &SharedMemory>, - rhs: &SharedMemory>, - out: &mut Tensor>, - start: u32, - end: u32, -) { - for i in start..end { - out[i] = Line::cast_from(10u32) * lhs[i]; - } - for i in start..end { - out[i] += rhs[i]; - } -} - -#[cube] -fn tile_computation_3( - lhs: &SharedMemory>, - rhs: &SharedMemory>, - out: &mut Tensor>, - start: u32, - end: u32, - pipeline: Pipeline, -) { - pipeline.consumer_wait(); - for i in start..end { - out[i] = Line::cast_from(10u32) * lhs[i]; - } - for i in start..end { - out[i] += rhs[i]; - } - pipeline.consumer_release(); -} - #[cube] fn memcpy_sync(source: Slice>, mut destination: SliceMut>) { for i in 0..source.len() { @@ -59,13 +10,8 @@ fn memcpy_sync(source: Slice>, mut destination: SliceMut( - lhs: &Tensor>, - // rhs: &Tensor>, - output: &mut Tensor>, -) { +fn computation(lhs: &Tensor>, output: &mut Tensor>) { let mut lhs_smem = SharedMemory::::new_lined(4u32, 1u32); - // let mut rhs_smem = SharedMemory::::new_lined(4u32, 1u32); let pipeline = Pipeline::new(2u32); @@ -81,45 +27,12 @@ fn computation( output[i] = lhs_smem[i]; } pipeline.consumer_release(); - - // pipeline.producer_acquire(); - // pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); - // // memcpy_sync(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); - // pipeline.producer_commit(); - - // tile_computation_0: inline - - // pipeline.consumer_wait(); - // for i in start..end { - // output[i] += rhs_smem[i]; - // } - // pipeline.consumer_release(); - - // pipeline.consumer_wait(); - // tile_computation_1( - // lhs_smem.slice(start, end), - // rhs_smem.slice(start, end), - // output.slice_mut(start, end), - // ); - // pipeline.consumer_release(); - - // pipeline.consumer_wait(); - // tile_computation_2(&lhs_smem, &rhs_smem, output, start, end); - // pipeline.consumer_release(); - - // tile_computation_3(&lhs_smem, &rhs_smem, output, start, end, pipeline); } pub fn test_memcpy( client: ComputeClient, ) { - // if !client.properties().feature_enabled(Feature::Pipeline) { - // // We can't execute the test, skip. - // return; - // } - let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]); - let rhs = client.create(as_bytes![F: 10., 11., 12., 13.]); let output = client.empty(4 * core::mem::size_of::()); unsafe { @@ -128,17 +41,15 @@ pub fn test_memcpy( CubeCount::Static(1, 1, 1), CubeDim::new(1, 1, 1), TensorArg::from_raw_parts::(&lhs, &[4, 1], &[4, 4], 1), - // TensorArg::from_raw_parts::(&rhs, &[4, 1], &[4, 4], 1), TensorArg::from_raw_parts::(&output, &[4, 1], &[4, 4], 1), ) }; let actual = client.read_one(output.binding()); let actual = F::from_bytes(&actual); - let expected = [F::new(110.0), F::new(121.0), F::new(132.0), F::new(143.0)]; + let expected = [F::new(10.0), F::new(11.0), F::new(12.0), F::new(13.0)]; assert_eq!(actual, expected); - assert!(false); } #[allow(missing_docs)] From c6de6142127231cf6752080c238fb9ef93df9786 Mon Sep 17 00:00:00 2001 From: louisfd Date: Tue, 4 Feb 2025 11:44:39 -0500 Subject: [PATCH 07/14] pipeline group enum --- crates/cubecl-core/src/frontend/pipeline.rs | 24 +++++++-- .../src/runtime_tests/memcpy_async.rs | 4 +- .../cubecl-core/src/runtime_tests/pipeline.rs | 4 +- crates/cubecl-cpp/src/shared/base.rs | 25 ++++++--- crates/cubecl-cpp/src/shared/pipeline.rs | 52 ++++++++++++++----- crates/cubecl-ir/src/allocator.rs | 5 +- crates/cubecl-ir/src/scope.rs | 11 +++- crates/cubecl-ir/src/variable.rs | 44 ++++++++++++---- 8 files changed, 128 insertions(+), 41 deletions(-) diff --git a/crates/cubecl-core/src/frontend/pipeline.rs b/crates/cubecl-core/src/frontend/pipeline.rs index 847fa3248..f30f4cdb8 100644 --- a/crates/cubecl-core/src/frontend/pipeline.rs +++ b/crates/cubecl-core/src/frontend/pipeline.rs @@ -112,13 +112,18 @@ pub struct PipelineExpand { impl Default for Pipeline { fn default() -> Self { - Self::new(1) + Self::new(1, PipelineGroup::Unit) } } +pub enum PipelineGroup { + Unit, + Cube, +} + impl Pipeline { /// Create a pipeline instance - pub fn new(_num_stages: u32) -> Self { + pub fn new(_num_stages: u32, _group: PipelineGroup) -> Self { Self { _c: PhantomData } } @@ -152,9 +157,20 @@ impl Pipeline { unexpanded!() } - pub fn __expand_new(scope: &mut Scope, num_stages: u32) -> PipelineExpand { + pub fn __expand_new( + scope: &mut Scope, + num_stages: u32, + pipeline_group: PipelineGroup, + ) -> PipelineExpand { let elem = C::as_elem(scope); - let variable = scope.create_pipeline(Item::new(elem), num_stages as u8); + let variable = scope.create_pipeline( + Item::new(elem), + num_stages as u8, + match pipeline_group { + PipelineGroup::Unit => 0, + PipelineGroup::Cube => 1, + }, + ); PipelineExpand { elem: variable, _c: PhantomData, diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index 911d6612d..658283a4c 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -1,6 +1,6 @@ use crate::{self as cubecl, as_bytes}; use cubecl::prelude::*; -use pipeline::Pipeline; +use pipeline::{Pipeline, PipelineGroup}; #[cube] fn memcpy_sync(source: Slice>, mut destination: SliceMut>) { @@ -13,7 +13,7 @@ fn memcpy_sync(source: Slice>, mut destination: SliceMut(lhs: &Tensor>, output: &mut Tensor>) { let mut lhs_smem = SharedMemory::::new_lined(4u32, 1u32); - let pipeline = Pipeline::new(2u32); + let pipeline = Pipeline::new(2u32, PipelineGroup::Unit); let start = UNIT_POS_X * 2u32; let end = start + 2u32; diff --git a/crates/cubecl-core/src/runtime_tests/pipeline.rs b/crates/cubecl-core/src/runtime_tests/pipeline.rs index a33ee9be1..315a9ec10 100644 --- a/crates/cubecl-core/src/runtime_tests/pipeline.rs +++ b/crates/cubecl-core/src/runtime_tests/pipeline.rs @@ -12,7 +12,7 @@ fn pipelined_sum( let smem_size = 2 * batch_len; let num_batches = input.len() / batch_len; let mut shared_memory = SharedMemory::::new_lined(smem_size, input.line_size()); - let pipeline = Pipeline::new(2u32); + let pipeline = Pipeline::new(2u32, pipeline::PipelineGroup::Cube); let mut sum = Line::::empty(input.line_size()).fill(F::new(0.)); @@ -63,7 +63,7 @@ fn pipelined_sum( #[cube(launch)] pub fn async_copy_test(input: &Array>, output: &mut Array>) { - let pipeline = pipeline::Pipeline::::new(2u32); + let pipeline = pipeline::Pipeline::::new(2u32, pipeline::PipelineGroup::Cube); let mut smem = SharedMemory::::new_lined(1u32, 1u32); if UNIT_POS == 0 { diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs index e583ddae8..03f57d870 100644 --- a/crates/cubecl-cpp/src/shared/base.rs +++ b/crates/cubecl-cpp/src/shared/base.rs @@ -357,13 +357,22 @@ impl CppCompiler { source, destination, } => { - instructions.push(Instruction::Pipeline( - super::pipeline::PipelineOps::MemCopyAsync { - pipeline: self.compile_variable(pipeline), - source: self.compile_variable(source), - destination: self.compile_variable(destination), - }, - )); + if let gpu::VariableKind::Pipeline { + id: _, + item: _, + num_stages: _, + pipeline_group, + } = pipeline.kind + { + instructions.push(Instruction::Pipeline( + super::pipeline::PipelineOps::MemCopyAsync { + pipeline: self.compile_variable(pipeline), + source: self.compile_variable(source), + destination: self.compile_variable(destination), + pipeline_group, + }, + )); + } } gpu::PipelineOps::ProducerAcquire { pipeline } => instructions.push( Instruction::Pipeline(super::pipeline::PipelineOps::ProducerAcquire { @@ -1047,6 +1056,7 @@ impl CppCompiler { id, item, num_stages, + pipeline_group, } => { self.pipeline = true; let pipeline = Variable::Pipeline { @@ -1057,6 +1067,7 @@ impl CppCompiler { self.pipelines.push(PipelineOps::Init { pipeline, num_stages, + pipeline_group, }); } pipeline diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs index 921efb224..63fe3166d 100644 --- a/crates/cubecl-cpp/src/shared/pipeline.rs +++ b/crates/cubecl-cpp/src/shared/pipeline.rs @@ -7,11 +7,13 @@ pub enum PipelineOps { Init { pipeline: Variable, num_stages: u8, + pipeline_group: u8, }, MemCopyAsync { pipeline: Variable, source: Variable, destination: Variable, + pipeline_group: u8, }, ProducerAcquire { pipeline: Variable, @@ -47,27 +49,49 @@ impl Display for PipelineOps { pipeline, source, destination, + pipeline_group, } => { let item = source.item(); let size = item.elem().size() * item.vectorization; - write!(f, " -cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline}); - ") + match pipeline_group { + 0 => { + write!(f, " + cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline}); + ") + } + 1 => { + write!(f, " + cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline}); + ") + } + _ => unreachable!(), + } } PipelineOps::Init { pipeline, num_stages, - } => { - // cuda::thread_scope::thread_scope_block -> sync only within thread_block, no gmem sync - // cooperative_groups::this_thread_block() -> each thread in threadblock has its pipeline instance - write!( - f, - " -__shared__ cuda::pipeline_shared_state {pipeline}_state; -auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state); - " - ) - } + pipeline_group, + } => match pipeline_group { + 0 => { + write!( + f, + " + __shared__ cuda::pipeline_shared_state {pipeline}_state; + auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state); + " + ) + } + 1 => { + write!( + f, + " + __shared__ cuda::pipeline_shared_state {pipeline}_state; + auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state); + " + ) + } + _ => unreachable!(), + }, PipelineOps::ProducerAcquire { pipeline } => { write!( f, diff --git a/crates/cubecl-ir/src/allocator.rs b/crates/cubecl-ir/src/allocator.rs index 6a4115d5b..fcbe86503 100644 --- a/crates/cubecl-ir/src/allocator.rs +++ b/crates/cubecl-ir/src/allocator.rs @@ -4,6 +4,8 @@ use core::cell::RefCell; use hashbrown::HashMap; use portable_atomic::{AtomicU32, Ordering}; +use crate::pipeline; + use super::{Item, Matrix, Variable, VariableKind}; /// An allocator for local variables of a kernel. @@ -96,13 +98,14 @@ impl Allocator { ExpandElement::Plain(variable) } - pub fn create_pipeline(&self, item: Item, num_stages: u8) -> ExpandElement { + pub fn create_pipeline(&self, item: Item, num_stages: u8, pipeline_group: u8) -> ExpandElement { let id = self.new_local_index(); let variable = Variable::new( VariableKind::Pipeline { id, item, num_stages, + pipeline_group, }, item, ); diff --git a/crates/cubecl-ir/src/scope.rs b/crates/cubecl-ir/src/scope.rs index a3f0e4540..751d5a37e 100644 --- a/crates/cubecl-ir/src/scope.rs +++ b/crates/cubecl-ir/src/scope.rs @@ -110,8 +110,15 @@ impl Scope { } /// Create a new pipeline element. - pub fn create_pipeline(&mut self, item: Item, num_stages: u8) -> ExpandElement { - let pipeline = self.allocator.create_pipeline(item, num_stages); + pub fn create_pipeline( + &mut self, + item: Item, + num_stages: u8, + pipeline_group: u8, + ) -> ExpandElement { + let pipeline = self + .allocator + .create_pipeline(item, num_stages, pipeline_group); self.add_pipeline(*pipeline); pipeline } diff --git a/crates/cubecl-ir/src/variable.rs b/crates/cubecl-ir/src/variable.rs index c1c3e850a..d44c9e9a1 100644 --- a/crates/cubecl-ir/src/variable.rs +++ b/crates/cubecl-ir/src/variable.rs @@ -49,17 +49,43 @@ pub enum VariableKind { GlobalInputArray(Id), GlobalOutputArray(Id), GlobalScalar(Id), - LocalArray { id: Id, length: u32 }, - LocalMut { id: Id }, - LocalConst { id: Id }, - Versioned { id: Id, version: u16 }, + LocalArray { + id: Id, + length: u32, + }, + LocalMut { + id: Id, + }, + LocalConst { + id: Id, + }, + Versioned { + id: Id, + version: u16, + }, ConstantScalar(ConstantScalarValue), - ConstantArray { id: Id, length: u32 }, - SharedMemory { id: Id, length: u32 }, - Matrix { id: Id, mat: Matrix }, - Slice { id: Id }, + ConstantArray { + id: Id, + length: u32, + }, + SharedMemory { + id: Id, + length: u32, + }, + Matrix { + id: Id, + mat: Matrix, + }, + Slice { + id: Id, + }, Builtin(Builtin), - Pipeline { id: Id, item: Item, num_stages: u8 }, + Pipeline { + id: Id, + item: Item, + num_stages: u8, + pipeline_group: u8, + }, } #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] From 2b6b65f74e77100a44db9cd88b947e5a6131719f Mon Sep 17 00:00:00 2001 From: louisfd Date: Tue, 4 Feb 2025 12:05:40 -0500 Subject: [PATCH 08/14] works --- .../src/runtime_tests/memcpy_async.rs | 82 ++++++++++++++++--- crates/cubecl-cpp/src/shared/pipeline.rs | 2 +- 2 files changed, 71 insertions(+), 13 deletions(-) diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index 658283a4c..919cc341e 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -2,44 +2,67 @@ use crate::{self as cubecl, as_bytes}; use cubecl::prelude::*; use pipeline::{Pipeline, PipelineGroup}; -#[cube] -fn memcpy_sync(source: Slice>, mut destination: SliceMut>) { - for i in 0..source.len() { - destination[i] = source[i]; +// TODO delete +// #[cube] +// fn memcpy_sync(source: Slice>, mut destination: SliceMut>) { +// for i in 0..source.len() { +// destination[i] = source[i]; +// } +// } + +#[cube(launch)] +fn one_load(lhs: &Tensor>, output: &mut Tensor>) { + let mut lhs_smem = SharedMemory::::new_lined(4u32, 1u32); + + let pipeline = Pipeline::new(1u32, PipelineGroup::Unit); + + let start = UNIT_POS_X * 2u32; + let end = start + 2u32; + + pipeline.producer_acquire(); + pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end)); + pipeline.producer_commit(); + + pipeline.consumer_wait(); + for i in start..end { + output[i] = lhs_smem[i]; } + pipeline.consumer_release(); } #[cube(launch)] -fn computation(lhs: &Tensor>, output: &mut Tensor>) { +fn two_loads(lhs: &Tensor>, rhs: &Tensor>, output: &mut Tensor>) { let mut lhs_smem = SharedMemory::::new_lined(4u32, 1u32); + let mut rhs_smem = SharedMemory::::new_lined(4u32, 1u32); - let pipeline = Pipeline::new(2u32, PipelineGroup::Unit); + let pipeline = Pipeline::new(1u32, PipelineGroup::Unit); let start = UNIT_POS_X * 2u32; let end = start + 2u32; pipeline.producer_acquire(); pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end)); + pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); pipeline.producer_commit(); pipeline.consumer_wait(); for i in start..end { - output[i] = lhs_smem[i]; + output[i] = lhs_smem[i] + rhs_smem[i]; } pipeline.consumer_release(); } -pub fn test_memcpy( +pub fn test_memcpy_one_load( client: ComputeClient, ) { let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]); let output = client.empty(4 * core::mem::size_of::()); unsafe { - computation::launch::( + one_load::launch::( &client, CubeCount::Static(1, 1, 1), - CubeDim::new(1, 1, 1), + CubeDim::new(2, 1, 1), TensorArg::from_raw_parts::(&lhs, &[4, 1], &[4, 4], 1), TensorArg::from_raw_parts::(&output, &[4, 1], &[4, 4], 1), ) @@ -52,6 +75,31 @@ pub fn test_memcpy( assert_eq!(actual, expected); } +pub fn test_memcpy_two_loads( + client: ComputeClient, +) { + let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]); + let rhs = client.create(as_bytes![F: 20., 21., 22., 23.]); + let output = client.empty(4 * core::mem::size_of::()); + + unsafe { + two_loads::launch::( + &client, + CubeCount::Static(1, 1, 1), + CubeDim::new(2, 1, 1), + TensorArg::from_raw_parts::(&lhs, &[4, 1], &[4, 4], 1), + TensorArg::from_raw_parts::(&rhs, &[4, 1], &[4, 4], 1), + TensorArg::from_raw_parts::(&output, &[4, 1], &[4, 4], 1), + ) + }; + + let actual = client.read_one(output.binding()); + let actual = F::from_bytes(&actual); + let expected = [F::new(30.0), F::new(32.0), F::new(34.0), F::new(36.0)]; + + assert_eq!(actual, expected); +} + #[allow(missing_docs)] #[macro_export] macro_rules! testgen_memcpy_async { @@ -59,9 +107,19 @@ macro_rules! testgen_memcpy_async { use super::*; #[test] - fn test_memcpy_async() { + fn test_memcpy_async_one_load() { + let client = TestRuntime::client(&Default::default()); + cubecl_core::runtime_tests::memcpy_async::test_memcpy_one_load::( + client, + ); + } + + #[test] + fn test_memcpy_async_two_loads() { let client = TestRuntime::client(&Default::default()); - cubecl_core::runtime_tests::memcpy_async::test_memcpy::(client); + cubecl_core::runtime_tests::memcpy_async::test_memcpy_two_loads::( + client, + ); } }; } diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs index 63fe3166d..000d53e8c 100644 --- a/crates/cubecl-cpp/src/shared/pipeline.rs +++ b/crates/cubecl-cpp/src/shared/pipeline.rs @@ -76,7 +76,7 @@ impl Display for PipelineOps { write!( f, " - __shared__ cuda::pipeline_shared_state {pipeline}_state; + cuda::pipeline_shared_state {pipeline}_state; auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state); " ) From 4bef2d0fdc79e8d82e37e925b68ade5a8344359a Mon Sep 17 00:00:00 2001 From: louisfd Date: Wed, 5 Feb 2025 09:12:57 -0500 Subject: [PATCH 09/14] tests --- .../src/runtime_tests/memcpy_async.rs | 162 ++++++++++++++---- crates/cubecl-cpp/src/shared/pipeline.rs | 18 +- crates/cubecl-ir/src/allocator.rs | 2 - 3 files changed, 142 insertions(+), 40 deletions(-) diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index 919cc341e..0b656ccd1 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -2,14 +2,6 @@ use crate::{self as cubecl, as_bytes}; use cubecl::prelude::*; use pipeline::{Pipeline, PipelineGroup}; -// TODO delete -// #[cube] -// fn memcpy_sync(source: Slice>, mut destination: SliceMut>) { -// for i in 0..source.len() { -// destination[i] = source[i]; -// } -// } - #[cube(launch)] fn one_load(lhs: &Tensor>, output: &mut Tensor>) { let mut lhs_smem = SharedMemory::::new_lined(4u32, 1u32); @@ -31,14 +23,19 @@ fn one_load(lhs: &Tensor>, output: &mut Tensor>) { } #[cube(launch)] -fn two_loads(lhs: &Tensor>, rhs: &Tensor>, output: &mut Tensor>) { - let mut lhs_smem = SharedMemory::::new_lined(4u32, 1u32); - let mut rhs_smem = SharedMemory::::new_lined(4u32, 1u32); +fn two_loads( + lhs: &Tensor>, + rhs: &Tensor>, + output: &mut Tensor>, + #[comptime] num_data: u32, // should be even +) { + let mut lhs_smem = SharedMemory::::new_lined(num_data, 1u32); + let mut rhs_smem = SharedMemory::::new_lined(num_data, 1u32); let pipeline = Pipeline::new(1u32, PipelineGroup::Unit); - let start = UNIT_POS_X * 2u32; - let end = start + 2u32; + let start = UNIT_POS_X * num_data / 2; + let end = start + num_data / 2; pipeline.producer_acquire(); pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end)); @@ -46,10 +43,57 @@ fn two_loads(lhs: &Tensor>, rhs: &Tensor>, output: &mu pipeline.producer_commit(); pipeline.consumer_wait(); + let mut dot = Line::cast_from(0u32); for i in start..end { - output[i] = lhs_smem[i] + rhs_smem[i]; + dot += lhs_smem[i] * rhs_smem[i]; } pipeline.consumer_release(); + + output[UNIT_POS_X] = dot; +} + +#[cube(launch)] +fn two_independant_loads( + lhs: &Tensor>, + rhs: &Tensor>, + output: &mut Tensor>, + #[comptime] num_data: u32, +) { + let mut lhs_smem = SharedMemory::::new_lined(num_data, 1u32); + let mut rhs_smem = SharedMemory::::new_lined(num_data, 1u32); + + let pipeline = Pipeline::new(2u32, PipelineGroup::Unit); + + let start = UNIT_POS_X * num_data / 2; + let end = start + num_data / 2; + + for i in start..end { + lhs_smem[i] = Line::cast_from(0u32); + rhs_smem[i] = Line::cast_from(0u32); + output[i] = Line::cast_from(0u32); + } + + pipeline.producer_acquire(); + pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end)); + pipeline.producer_commit(); + + pipeline.producer_acquire(); + pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end)); + pipeline.producer_commit(); + + let mut dot = Line::cast_from(0u32); + + pipeline.consumer_wait(); + pipeline.consumer_wait(); + pipeline.consumer_wait(); + for i in start..end { + dot += lhs_smem[i] * rhs_smem[i]; + } + pipeline.consumer_release(); + pipeline.consumer_release(); + pipeline.consumer_release(); + + output[UNIT_POS_X] = dot; } pub fn test_memcpy_one_load( @@ -76,30 +120,80 @@ pub fn test_memcpy_one_load( } pub fn test_memcpy_two_loads( + independant: bool, client: ComputeClient, ) { - let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]); - let rhs = client.create(as_bytes![F: 20., 21., 22., 23.]); - let output = client.empty(4 * core::mem::size_of::()); - - unsafe { - two_loads::launch::( - &client, - CubeCount::Static(1, 1, 1), - CubeDim::new(2, 1, 1), - TensorArg::from_raw_parts::(&lhs, &[4, 1], &[4, 4], 1), - TensorArg::from_raw_parts::(&rhs, &[4, 1], &[4, 4], 1), - TensorArg::from_raw_parts::(&output, &[4, 1], &[4, 4], 1), - ) - }; + let num_data = 4; + let lhs_data = generate_random_data(num_data, 42); + let rhs_data = generate_random_data(num_data, 43); + + let lhs = client.create(F::as_bytes(&lhs_data)); + let rhs = client.create(F::as_bytes(&rhs_data)); + let output = client.empty(2 * core::mem::size_of::()); + + if independant { + unsafe { + two_independant_loads::launch::( + &client, + CubeCount::Static(1, 1, 1), + CubeDim::new(2, 1, 1), + TensorArg::from_raw_parts::(&lhs, &[1], &[num_data], 1), + TensorArg::from_raw_parts::(&rhs, &[1], &[num_data], 1), + TensorArg::from_raw_parts::(&output, &[1], &[2], 1), + num_data as u32, + ) + }; + } else { + unsafe { + two_loads::launch::( + &client, + CubeCount::Static(1, 1, 1), + CubeDim::new(2, 1, 1), + TensorArg::from_raw_parts::(&lhs, &[1], &[num_data], 1), + TensorArg::from_raw_parts::(&rhs, &[1], &[num_data], 1), + TensorArg::from_raw_parts::(&output, &[1], &[2], 1), + num_data as u32, + ) + }; + } let actual = client.read_one(output.binding()); let actual = F::from_bytes(&actual); - let expected = [F::new(30.0), F::new(32.0), F::new(34.0), F::new(36.0)]; + + let middle = num_data / 2; + let expected = [ + dot(&lhs_data[..middle], &rhs_data[..middle]), + dot(&lhs_data[middle..], &rhs_data[middle..]), + ]; assert_eq!(actual, expected); } +fn dot(vec1: &[F], vec2: &[F]) -> F { + let mut sum = F::from_int(0); + for i in 0..vec1.len() { + sum += vec1[i] * vec2[i]; + } + sum +} + +// TODO tmp +pub(crate) fn generate_random_data( + num_elements: usize, + mut seed: u64, +) -> Vec { + fn lcg(seed: &mut u64) -> f32 { + const A: u64 = 1664525; + const C: u64 = 1013904223; + const M: f64 = 2u64.pow(32) as f64; + + *seed = (A.wrapping_mul(*seed).wrapping_add(C)) % (1u64 << 32); + (*seed as f64 / M * 2.0 - 1.0) as f32 + } + + (0..num_elements).map(|_| F::new(lcg(&mut seed))).collect() +} + #[allow(missing_docs)] #[macro_export] macro_rules! testgen_memcpy_async { @@ -118,6 +212,16 @@ macro_rules! testgen_memcpy_async { fn test_memcpy_async_two_loads() { let client = TestRuntime::client(&Default::default()); cubecl_core::runtime_tests::memcpy_async::test_memcpy_two_loads::( + false, + client, + ); + } + + #[test] + fn test_memcpy_async_two_independant_loads() { + let client = TestRuntime::client(&Default::default()); + cubecl_core::runtime_tests::memcpy_async::test_memcpy_two_loads::( + true, client, ); } diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs index 000d53e8c..bdf2b926a 100644 --- a/crates/cubecl-cpp/src/shared/pipeline.rs +++ b/crates/cubecl-cpp/src/shared/pipeline.rs @@ -56,12 +56,12 @@ impl Display for PipelineOps { match pipeline_group { 0 => { write!(f, " - cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline}); +cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline}); ") } 1 => { write!(f, " - cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline}); +cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline}); ") } _ => unreachable!(), @@ -74,19 +74,19 @@ impl Display for PipelineOps { } => match pipeline_group { 0 => { write!( - f, - " - cuda::pipeline_shared_state {pipeline}_state; - auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state); + f, " - ) +cuda::pipeline_shared_state {pipeline}_state; +auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state); + " + ) } 1 => { write!( f, " - __shared__ cuda::pipeline_shared_state {pipeline}_state; - auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state); +__shared__ cuda::pipeline_shared_state {pipeline}_state; +auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state); " ) } diff --git a/crates/cubecl-ir/src/allocator.rs b/crates/cubecl-ir/src/allocator.rs index fcbe86503..83675f417 100644 --- a/crates/cubecl-ir/src/allocator.rs +++ b/crates/cubecl-ir/src/allocator.rs @@ -4,8 +4,6 @@ use core::cell::RefCell; use hashbrown::HashMap; use portable_atomic::{AtomicU32, Ordering}; -use crate::pipeline; - use super::{Item, Matrix, Variable, VariableKind}; /// An allocator for local variables of a kernel. From be16046d0153fdd6137ed642c42d943a93cf3aaf Mon Sep 17 00:00:00 2001 From: louisfd Date: Wed, 5 Feb 2025 09:17:56 -0500 Subject: [PATCH 10/14] cleaning --- crates/cubecl-core/src/frontend/context.rs | 159 ------------------ .../src/runtime_tests/memcpy_async.rs | 21 +-- 2 files changed, 2 insertions(+), 178 deletions(-) delete mode 100644 crates/cubecl-core/src/frontend/context.rs diff --git a/crates/cubecl-core/src/frontend/context.rs b/crates/cubecl-core/src/frontend/context.rs deleted file mode 100644 index 284e14712..000000000 --- a/crates/cubecl-core/src/frontend/context.rs +++ /dev/null @@ -1,159 +0,0 @@ -use crate::ir::Id; -use crate::ir::{self, Elem, Instruction, Item, Scope, Variable, VariableKind}; -use alloc::rc::Rc; -use core::cell::RefCell; -use cubecl_ir::ExpandElement; -use cubecl_runtime::debug::DebugLogger; -use std::any::TypeId; -use std::collections::HashMap; - -pub struct CubeContext { - pub root: Rc>, - pub scope: Rc>, - pub debug_enabled: bool, - pub typemap: Rc>>, -} - -impl Default for CubeContext { - fn default() -> Self { - Self::root() - } -} - -impl CubeContext { - /// Create a new cube context, with a root scope - /// A root scope is at the root of a compute shader - /// Therefore there is one cube context per shader - /// The allocator will define the strategy for creating local intermediates and mutable variables - pub fn root() -> CubeContext { - let root = Rc::new(RefCell::new(Scope::root())); - let typemap = Rc::new(RefCell::new(HashMap::new())); - let scope = root.clone(); - - Self { - scope, - root, - debug_enabled: DebugLogger::default().is_activated(), - typemap, - } - } - - pub fn register>(&mut self, op: O) { - self.scope.borrow_mut().register(op) - } - - /// Resolve the element type of the given generic type. - pub fn resolve_elem(&self) -> Option { - let map = self.typemap.borrow(); - let result = map.get(&TypeId::of::()); - - result.cloned() - } - - /// Register the element type for the given generic type. - pub fn register_elem(&mut self, elem: Elem) { - let mut map = self.typemap.borrow_mut(); - - map.insert(TypeId::of::(), elem); - } - - pub fn child(&mut self) -> CubeContext { - let scope = self.scope.borrow_mut().child(); - - Self { - scope: Rc::new(RefCell::new(scope)), - root: self.root.clone(), - debug_enabled: self.debug_enabled, - typemap: self.typemap.clone(), - } - } - - pub fn into_scope(self) -> Scope { - core::mem::drop(self.root); - - Rc::into_inner(self.scope) - .expect("Only one reference") - .into_inner() - } - - /// Create a new mutable local variable. - pub fn create_local_mut(&mut self, item: Item) -> ExpandElement { - let local = self.scope.borrow().allocator.create_local_mut(item); - self.scope.borrow_mut().add_local_mut(*local); - local - } - - /// Create a new immutable local variable. - pub fn create_local(&mut self, item: Item) -> ExpandElement { - self.scope.borrow().allocator.create_local(item) - } - - /// Create a new immutable local binding that must never be a reused variable, regardless of - /// allocator - pub fn create_local_restricted(&mut self, item: Item) -> ExpandElement { - self.scope.borrow().allocator.create_local_restricted(item) - } - - /// Create a new matrix element. - pub fn create_matrix(&mut self, matrix: ir::Matrix) -> ExpandElement { - let matrix = self.scope.borrow().allocator.create_matrix(matrix); - self.scope.borrow_mut().add_matrix(*matrix); - matrix - } - - /// Create a new pipeline element. - pub fn create_pipeline(&mut self, item: Item, num_stages: u8) -> ExpandElement { - let pipeline = self - .scope - .borrow() - .allocator - .create_pipeline(item, num_stages); - self.scope.borrow_mut().add_pipeline(*pipeline); - pipeline - } - - /// Create a new slice element. - pub fn create_slice(&mut self, item: Item) -> ExpandElement { - let slice = self.scope.borrow().allocator.create_slice(item); - self.scope.borrow_mut().add_slice(*slice); - slice - } - - pub fn create_shared(&mut self, item: Item, size: u32) -> ExpandElement { - ExpandElement::Plain(self.root.borrow_mut().create_shared(item, size)) - } - - pub fn create_local_array(&mut self, item: Item, size: u32) -> ExpandElement { - let local_array: ExpandElement = - self.root.borrow().allocator.create_local_array(item, size); - self.root.borrow_mut().add_local_array(*local_array); - local_array - } - - pub fn create_const_array(&mut self, item: Item, data: Vec) -> ExpandElement { - ExpandElement::Plain(self.root.borrow_mut().create_const_array(item, data)) - } - - /// Obtain the index-th input - pub fn input(&mut self, id: Id, item: Item) -> ExpandElement { - ExpandElement::Plain(crate::ir::Variable::new( - VariableKind::GlobalInputArray(id), - item, - )) - } - - /// Obtain the index-th output - pub fn output(&mut self, id: Id, item: Item) -> ExpandElement { - let var = crate::ir::Variable::new(VariableKind::GlobalOutputArray(id), item); - self.scope.borrow_mut().write_global_custom(var); - ExpandElement::Plain(var) - } - - /// Obtain the index-th scalar - pub fn scalar(&self, id: Id, elem: Elem) -> ExpandElement { - ExpandElement::Plain(crate::ir::Variable::new( - VariableKind::GlobalScalar(id), - Item::new(elem), - )) - } -} diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index 0b656ccd1..3e01770d2 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -124,8 +124,8 @@ pub fn test_memcpy_two_loads( client: ComputeClient, ) { let num_data = 4; - let lhs_data = generate_random_data(num_data, 42); - let rhs_data = generate_random_data(num_data, 43); + let lhs_data: Vec = (0..num_data).map(|i| F::new(i as f32)).collect(); + let rhs_data: Vec = (0..num_data).map(|i| F::new(i as f32)).collect(); let lhs = client.create(F::as_bytes(&lhs_data)); let rhs = client.create(F::as_bytes(&rhs_data)); @@ -177,23 +177,6 @@ fn dot(vec1: &[F], vec2: &[F]) -> F { sum } -// TODO tmp -pub(crate) fn generate_random_data( - num_elements: usize, - mut seed: u64, -) -> Vec { - fn lcg(seed: &mut u64) -> f32 { - const A: u64 = 1664525; - const C: u64 = 1013904223; - const M: f64 = 2u64.pow(32) as f64; - - *seed = (A.wrapping_mul(*seed).wrapping_add(C)) % (1u64 << 32); - (*seed as f64 / M * 2.0 - 1.0) as f32 - } - - (0..num_elements).map(|_| F::new(lcg(&mut seed))).collect() -} - #[allow(missing_docs)] #[macro_export] macro_rules! testgen_memcpy_async { From db3cb51903b1446d1a47b8aedccbc7b6f0c99962 Mon Sep 17 00:00:00 2001 From: louisfd Date: Wed, 5 Feb 2025 09:35:53 -0500 Subject: [PATCH 11/14] unit pipeline group --- crates/cubecl-core/src/runtime_tests/pipeline.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/cubecl-core/src/runtime_tests/pipeline.rs b/crates/cubecl-core/src/runtime_tests/pipeline.rs index 315a9ec10..9e754c47e 100644 --- a/crates/cubecl-core/src/runtime_tests/pipeline.rs +++ b/crates/cubecl-core/src/runtime_tests/pipeline.rs @@ -12,7 +12,7 @@ fn pipelined_sum( let smem_size = 2 * batch_len; let num_batches = input.len() / batch_len; let mut shared_memory = SharedMemory::::new_lined(smem_size, input.line_size()); - let pipeline = Pipeline::new(2u32, pipeline::PipelineGroup::Cube); + let pipeline = Pipeline::new(2u32, pipeline::PipelineGroup::Unit); let mut sum = Line::::empty(input.line_size()).fill(F::new(0.)); @@ -63,7 +63,7 @@ fn pipelined_sum( #[cube(launch)] pub fn async_copy_test(input: &Array>, output: &mut Array>) { - let pipeline = pipeline::Pipeline::::new(2u32, pipeline::PipelineGroup::Cube); + let pipeline = pipeline::Pipeline::::new(2u32, pipeline::PipelineGroup::Unit); let mut smem = SharedMemory::::new_lined(1u32, 1u32); if UNIT_POS == 0 { From da5ea7b049c9a4b02ce2b22830bd8c4de67baa79 Mon Sep 17 00:00:00 2001 From: louisfd Date: Wed, 5 Feb 2025 09:40:00 -0500 Subject: [PATCH 12/14] remove pipeline group --- crates/cubecl-core/src/frontend/pipeline.rs | 15 +------ crates/cubecl-cpp/src/shared/base.rs | 4 -- crates/cubecl-cpp/src/shared/pipeline.rs | 40 ++++--------------- crates/cubecl-ir/src/allocator.rs | 3 +- crates/cubecl-ir/src/scope.rs | 11 +----- crates/cubecl-ir/src/variable.rs | 44 +++++---------------- 6 files changed, 21 insertions(+), 96 deletions(-) diff --git a/crates/cubecl-core/src/frontend/pipeline.rs b/crates/cubecl-core/src/frontend/pipeline.rs index f30f4cdb8..7aff0d532 100644 --- a/crates/cubecl-core/src/frontend/pipeline.rs +++ b/crates/cubecl-core/src/frontend/pipeline.rs @@ -157,20 +157,9 @@ impl Pipeline { unexpanded!() } - pub fn __expand_new( - scope: &mut Scope, - num_stages: u32, - pipeline_group: PipelineGroup, - ) -> PipelineExpand { + pub fn __expand_new(scope: &mut Scope, num_stages: u32) -> PipelineExpand { let elem = C::as_elem(scope); - let variable = scope.create_pipeline( - Item::new(elem), - num_stages as u8, - match pipeline_group { - PipelineGroup::Unit => 0, - PipelineGroup::Cube => 1, - }, - ); + let variable = scope.create_pipeline(Item::new(elem), num_stages as u8); PipelineExpand { elem: variable, _c: PhantomData, diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs index 03f57d870..bab3bc642 100644 --- a/crates/cubecl-cpp/src/shared/base.rs +++ b/crates/cubecl-cpp/src/shared/base.rs @@ -361,7 +361,6 @@ impl CppCompiler { id: _, item: _, num_stages: _, - pipeline_group, } = pipeline.kind { instructions.push(Instruction::Pipeline( @@ -369,7 +368,6 @@ impl CppCompiler { pipeline: self.compile_variable(pipeline), source: self.compile_variable(source), destination: self.compile_variable(destination), - pipeline_group, }, )); } @@ -1056,7 +1054,6 @@ impl CppCompiler { id, item, num_stages, - pipeline_group, } => { self.pipeline = true; let pipeline = Variable::Pipeline { @@ -1067,7 +1064,6 @@ impl CppCompiler { self.pipelines.push(PipelineOps::Init { pipeline, num_stages, - pipeline_group, }); } pipeline diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs index bdf2b926a..9423dad40 100644 --- a/crates/cubecl-cpp/src/shared/pipeline.rs +++ b/crates/cubecl-cpp/src/shared/pipeline.rs @@ -7,13 +7,11 @@ pub enum PipelineOps { Init { pipeline: Variable, num_stages: u8, - pipeline_group: u8, }, MemCopyAsync { pipeline: Variable, source: Variable, destination: Variable, - pipeline_group: u8, }, ProducerAcquire { pipeline: Variable, @@ -49,49 +47,25 @@ impl Display for PipelineOps { pipeline, source, destination, - pipeline_group, } => { let item = source.item(); let size = item.elem().size() * item.vectorization; - match pipeline_group { - 0 => { - write!(f, " + write!(f, " cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline}); ") - } - 1 => { - write!(f, " -cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline}); - ") - } - _ => unreachable!(), - } } PipelineOps::Init { pipeline, num_stages, - pipeline_group, - } => match pipeline_group { - 0 => { - write!( - f, - " + } => { + write!( + f, + " cuda::pipeline_shared_state {pipeline}_state; auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state); " - ) - } - 1 => { - write!( - f, - " -__shared__ cuda::pipeline_shared_state {pipeline}_state; -auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state); - " - ) - } - _ => unreachable!(), - }, + ) + } PipelineOps::ProducerAcquire { pipeline } => { write!( f, diff --git a/crates/cubecl-ir/src/allocator.rs b/crates/cubecl-ir/src/allocator.rs index 83675f417..6a4115d5b 100644 --- a/crates/cubecl-ir/src/allocator.rs +++ b/crates/cubecl-ir/src/allocator.rs @@ -96,14 +96,13 @@ impl Allocator { ExpandElement::Plain(variable) } - pub fn create_pipeline(&self, item: Item, num_stages: u8, pipeline_group: u8) -> ExpandElement { + pub fn create_pipeline(&self, item: Item, num_stages: u8) -> ExpandElement { let id = self.new_local_index(); let variable = Variable::new( VariableKind::Pipeline { id, item, num_stages, - pipeline_group, }, item, ); diff --git a/crates/cubecl-ir/src/scope.rs b/crates/cubecl-ir/src/scope.rs index 751d5a37e..a3f0e4540 100644 --- a/crates/cubecl-ir/src/scope.rs +++ b/crates/cubecl-ir/src/scope.rs @@ -110,15 +110,8 @@ impl Scope { } /// Create a new pipeline element. - pub fn create_pipeline( - &mut self, - item: Item, - num_stages: u8, - pipeline_group: u8, - ) -> ExpandElement { - let pipeline = self - .allocator - .create_pipeline(item, num_stages, pipeline_group); + pub fn create_pipeline(&mut self, item: Item, num_stages: u8) -> ExpandElement { + let pipeline = self.allocator.create_pipeline(item, num_stages); self.add_pipeline(*pipeline); pipeline } diff --git a/crates/cubecl-ir/src/variable.rs b/crates/cubecl-ir/src/variable.rs index d44c9e9a1..c1c3e850a 100644 --- a/crates/cubecl-ir/src/variable.rs +++ b/crates/cubecl-ir/src/variable.rs @@ -49,43 +49,17 @@ pub enum VariableKind { GlobalInputArray(Id), GlobalOutputArray(Id), GlobalScalar(Id), - LocalArray { - id: Id, - length: u32, - }, - LocalMut { - id: Id, - }, - LocalConst { - id: Id, - }, - Versioned { - id: Id, - version: u16, - }, + LocalArray { id: Id, length: u32 }, + LocalMut { id: Id }, + LocalConst { id: Id }, + Versioned { id: Id, version: u16 }, ConstantScalar(ConstantScalarValue), - ConstantArray { - id: Id, - length: u32, - }, - SharedMemory { - id: Id, - length: u32, - }, - Matrix { - id: Id, - mat: Matrix, - }, - Slice { - id: Id, - }, + ConstantArray { id: Id, length: u32 }, + SharedMemory { id: Id, length: u32 }, + Matrix { id: Id, mat: Matrix }, + Slice { id: Id }, Builtin(Builtin), - Pipeline { - id: Id, - item: Item, - num_stages: u8, - pipeline_group: u8, - }, + Pipeline { id: Id, item: Item, num_stages: u8 }, } #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] From cc88ab8d5fe0d732f773a1c009f79c2e63c6fbdc Mon Sep 17 00:00:00 2001 From: louisfd Date: Wed, 5 Feb 2025 09:44:44 -0500 Subject: [PATCH 13/14] finish removing pipeline group --- crates/cubecl-core/src/frontend/pipeline.rs | 9 ++------ .../src/runtime_tests/memcpy_async.rs | 8 +++---- .../cubecl-core/src/runtime_tests/pipeline.rs | 4 ++-- crates/cubecl-cpp/src/shared/base.rs | 21 +++++++------------ crates/cubecl-cpp/src/shared/pipeline.rs | 4 ++-- 5 files changed, 17 insertions(+), 29 deletions(-) diff --git a/crates/cubecl-core/src/frontend/pipeline.rs b/crates/cubecl-core/src/frontend/pipeline.rs index 7aff0d532..847fa3248 100644 --- a/crates/cubecl-core/src/frontend/pipeline.rs +++ b/crates/cubecl-core/src/frontend/pipeline.rs @@ -112,18 +112,13 @@ pub struct PipelineExpand { impl Default for Pipeline { fn default() -> Self { - Self::new(1, PipelineGroup::Unit) + Self::new(1) } } -pub enum PipelineGroup { - Unit, - Cube, -} - impl Pipeline { /// Create a pipeline instance - pub fn new(_num_stages: u32, _group: PipelineGroup) -> Self { + pub fn new(_num_stages: u32) -> Self { Self { _c: PhantomData } } diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index 3e01770d2..b389e4373 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -1,12 +1,12 @@ use crate::{self as cubecl, as_bytes}; use cubecl::prelude::*; -use pipeline::{Pipeline, PipelineGroup}; +use pipeline::Pipeline; #[cube(launch)] fn one_load(lhs: &Tensor>, output: &mut Tensor>) { let mut lhs_smem = SharedMemory::::new_lined(4u32, 1u32); - let pipeline = Pipeline::new(1u32, PipelineGroup::Unit); + let pipeline = Pipeline::new(1u32); let start = UNIT_POS_X * 2u32; let end = start + 2u32; @@ -32,7 +32,7 @@ fn two_loads( let mut lhs_smem = SharedMemory::::new_lined(num_data, 1u32); let mut rhs_smem = SharedMemory::::new_lined(num_data, 1u32); - let pipeline = Pipeline::new(1u32, PipelineGroup::Unit); + let pipeline = Pipeline::new(1u32); let start = UNIT_POS_X * num_data / 2; let end = start + num_data / 2; @@ -62,7 +62,7 @@ fn two_independant_loads( let mut lhs_smem = SharedMemory::::new_lined(num_data, 1u32); let mut rhs_smem = SharedMemory::::new_lined(num_data, 1u32); - let pipeline = Pipeline::new(2u32, PipelineGroup::Unit); + let pipeline = Pipeline::new(2u32); let start = UNIT_POS_X * num_data / 2; let end = start + num_data / 2; diff --git a/crates/cubecl-core/src/runtime_tests/pipeline.rs b/crates/cubecl-core/src/runtime_tests/pipeline.rs index 9e754c47e..a33ee9be1 100644 --- a/crates/cubecl-core/src/runtime_tests/pipeline.rs +++ b/crates/cubecl-core/src/runtime_tests/pipeline.rs @@ -12,7 +12,7 @@ fn pipelined_sum( let smem_size = 2 * batch_len; let num_batches = input.len() / batch_len; let mut shared_memory = SharedMemory::::new_lined(smem_size, input.line_size()); - let pipeline = Pipeline::new(2u32, pipeline::PipelineGroup::Unit); + let pipeline = Pipeline::new(2u32); let mut sum = Line::::empty(input.line_size()).fill(F::new(0.)); @@ -63,7 +63,7 @@ fn pipelined_sum( #[cube(launch)] pub fn async_copy_test(input: &Array>, output: &mut Array>) { - let pipeline = pipeline::Pipeline::::new(2u32, pipeline::PipelineGroup::Unit); + let pipeline = pipeline::Pipeline::::new(2u32); let mut smem = SharedMemory::::new_lined(1u32, 1u32); if UNIT_POS == 0 { diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs index bab3bc642..e583ddae8 100644 --- a/crates/cubecl-cpp/src/shared/base.rs +++ b/crates/cubecl-cpp/src/shared/base.rs @@ -357,20 +357,13 @@ impl CppCompiler { source, destination, } => { - if let gpu::VariableKind::Pipeline { - id: _, - item: _, - num_stages: _, - } = pipeline.kind - { - instructions.push(Instruction::Pipeline( - super::pipeline::PipelineOps::MemCopyAsync { - pipeline: self.compile_variable(pipeline), - source: self.compile_variable(source), - destination: self.compile_variable(destination), - }, - )); - } + instructions.push(Instruction::Pipeline( + super::pipeline::PipelineOps::MemCopyAsync { + pipeline: self.compile_variable(pipeline), + source: self.compile_variable(source), + destination: self.compile_variable(destination), + }, + )); } gpu::PipelineOps::ProducerAcquire { pipeline } => instructions.push( Instruction::Pipeline(super::pipeline::PipelineOps::ProducerAcquire { diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs index 9423dad40..844039b69 100644 --- a/crates/cubecl-cpp/src/shared/pipeline.rs +++ b/crates/cubecl-cpp/src/shared/pipeline.rs @@ -52,7 +52,7 @@ impl Display for PipelineOps { let size = item.elem().size() * item.vectorization; write!(f, " cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline}); - ") + ") } PipelineOps::Init { pipeline, @@ -63,7 +63,7 @@ cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, { " cuda::pipeline_shared_state {pipeline}_state; auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state); - " + " ) } PipelineOps::ProducerAcquire { pipeline } => { From 63f92f1e0d282891f3bb63c3a7744c8c1d1add79 Mon Sep 17 00:00:00 2001 From: louisfd Date: Wed, 5 Feb 2025 09:51:45 -0500 Subject: [PATCH 14/14] add guards --- crates/cubecl-core/src/runtime_tests/memcpy_async.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs index b389e4373..9936176af 100644 --- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs +++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs @@ -1,4 +1,4 @@ -use crate::{self as cubecl, as_bytes}; +use crate::{self as cubecl, as_bytes, Feature}; use cubecl::prelude::*; use pipeline::Pipeline; @@ -99,6 +99,11 @@ fn two_independant_loads( pub fn test_memcpy_one_load( client: ComputeClient, ) { + if !client.properties().feature_enabled(Feature::Pipeline) { + // We can't execute the test, skip. + return; + } + let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]); let output = client.empty(4 * core::mem::size_of::()); @@ -123,6 +128,11 @@ pub fn test_memcpy_two_loads( independant: bool, client: ComputeClient, ) { + if !client.properties().feature_enabled(Feature::Pipeline) { + // We can't execute the test, skip. + return; + } + let num_data = 4; let lhs_data: Vec = (0..num_data).map(|i| F::new(i as f32)).collect(); let rhs_data: Vec = (0..num_data).map(|i| F::new(i as f32)).collect();