From a66e417b113dd7f4ef23c67892634aace762f64e Mon Sep 17 00:00:00 2001
From: louisfd <louisfd94@gmail.com>
Date: Mon, 3 Feb 2025 13:08:37 -0500
Subject: [PATCH 01/14] wip

---
 .../src/runtime_tests/memcpy_async.rs         | 119 ++++++++++++++++++
 crates/cubecl-core/src/runtime_tests/mod.rs   |   2 +
 2 files changed, 121 insertions(+)
 create mode 100644 crates/cubecl-core/src/runtime_tests/memcpy_async.rs
diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
new file mode 100644
index 000000000..3e3541082
--- /dev/null
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -0,0 +1,119 @@
+use crate::{self as cubecl, as_bytes, runtime_tests::memcpy_async, Feature};
+use cubecl::prelude::*;
+use pipeline::Pipeline;
+
+#[cube]
+fn tile_computation<F: Float>(
+    lhs: Slice<Line<F>>,
+    rhs: Slice<Line<F>>,
+    mut out: SliceMut<Line<F>>,
+) {
+    for i in 0..lhs.len() {
+        out[i] = Line::cast_from(F::new(10.)) * lhs[i] + rhs[i];
+    }
+}
+
+#[cube]
+fn memcpy_sync<F: Float>(source: Slice<Line<F>>, mut destination: SliceMut<Line<F>>) {
+    for i in 0..source.len() {
+        destination[i] = source[i];
+    }
+}
+
+#[cube(launch)]
+fn computation<F: Float>(
+    lhs: &Tensor<Line<F>>,
+    rhs: &Tensor<Line<F>>,
+    output: &mut Tensor<Line<F>>,
+) {
+    let unit_id = UNIT_POS_X;
+    let mut lhs_smem = SharedMemory::<F>::new_lined(16u32, 1u32);
+    let mut rhs_smem = SharedMemory::<F>::new_lined(16u32, 1u32);
+
+    let pipeline = Pipeline::new();
+
+    // Load Lhs to SMEM
+    pipeline.memcpy_async(
+        lhs.slice(unit_id * 2u32, unit_id * 2u32 + 2u32),
+        lhs_smem.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32),
+    );
+
+    // Load Rhs to SMEM
+    pipeline.memcpy_async(
+        rhs.slice(unit_id * 2u32, unit_id * 2u32 + 2u32),
+        rhs_smem.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32),
+    );
+
+    // sync
+    sync_units();
+
+    // Perform matmul on SMEM
+    tile_computation(
+        lhs_smem.slice(unit_id * 2u32, unit_id * 2u32 + 2u32),
+        rhs_smem.slice(unit_id * 2u32, unit_id * 2u32 + 2u32),
+        output.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32),
+    );
+}
+
+pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
+    client: ComputeClient<R::Server, R::Channel>,
+) {
+    // if !client.properties().feature_enabled(Feature::Pipeline) {
+    //     // We can't execute the test, skip.
+    //     return;
+    // }
+
+    let lhs = client
+        .create(as_bytes![F: 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.]);
+    let rhs = client
+        .create(as_bytes![F: 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.]);
+    let output = client.empty(16 * core::mem::size_of::<F>());
+
+    unsafe {
+        computation::launch::<F, R>(
+            &client,
+            CubeCount::Static(1, 1, 1),
+            CubeDim::new(8, 1, 1),
+            TensorArg::from_raw_parts::<F>(&lhs, &[4, 1], &[4, 4], 1),
+            TensorArg::from_raw_parts::<F>(&rhs, &[4, 1], &[4, 4], 1),
+            TensorArg::from_raw_parts::<F>(&output, &[4, 1], &[4, 4], 1),
+        )
+    };
+
+    let actual = client.read_one(output.binding());
+    let actual = F::from_bytes(&actual);
+    let expected = [
+        F::new(0.0),
+        F::new(11.0),
+        F::new(22.0),
+        F::new(33.0),
+        F::new(44.0),
+        F::new(55.0),
+        F::new(66.0),
+        F::new(77.0),
+        F::new(88.0),
+        F::new(99.0),
+        F::new(110.0),
+        F::new(121.0),
+        F::new(132.0),
+        F::new(143.0),
+        F::new(154.0),
+        F::new(165.0),
+    ];
+
+    assert_eq!(actual, expected);
+}
+
+#[allow(missing_docs)]
+#[macro_export]
+macro_rules! testgen_memcpy_async {
+    () => {
+        use super::*;
+
+        #[test]
+        fn test_memcpy_async() {
+            let client = TestRuntime::client(&Default::default());
+            cubecl_core::runtime_tests::memcpy_async::test_memcpy::<TestRuntime, FloatType>(client);
+        }
+    };
+}
diff --git a/crates/cubecl-core/src/runtime_tests/mod.rs b/crates/cubecl-core/src/runtime_tests/mod.rs
index 2f37056dc..033a69fb2 100644
--- a/crates/cubecl-core/src/runtime_tests/mod.rs
+++ b/crates/cubecl-core/src/runtime_tests/mod.rs
@@ -10,6 +10,7 @@ pub mod different_rank;
 pub mod index;
 pub mod launch;
 pub mod line;
+pub mod memcpy_async;
 pub mod metadata;
 pub mod pipeline;
 pub mod plane;
@@ -83,6 +84,7 @@ macro_rules! testgen_float {
         cubecl_core::testgen_launch!();
         cubecl_core::testgen_line!();
         cubecl_core::testgen_pipeline!();
+        cubecl_core::testgen_memcpy_async!();
         cubecl_core::testgen_plane!();
         cubecl_core::testgen_sequence!();
         cubecl_core::testgen_slice!();

From 3ccb9de75568cb89c522f1fd9d27fd382659dd3d Mon Sep 17 00:00:00 2001
From: louisfd <louisfd@gmail.com>
Date: Mon, 3 Feb 2025 15:42:55 -0500
Subject: [PATCH 02/14] wip

---
 .../src/runtime_tests/memcpy_async.rs         | 73 +++++++++----------
 1 file changed, 33 insertions(+), 40 deletions(-)

diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index 3e3541082..ce0f8cc3d 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -8,8 +8,13 @@ fn tile_computation<F: Float>(
     rhs: Slice<Line<F>>,
     mut out: SliceMut<Line<F>>,
 ) {
-    for i in 0..lhs.len() {
-        out[i] = Line::cast_from(F::new(10.)) * lhs[i] + rhs[i];
+    if UNIT_POS_X == 0 {
+        for i in 0..2 {
+            out[i] = lhs[i];
+        }
+        for i in 0..2 {
+            out[i + 2] = rhs[i];
+        }
     }
 }
 
@@ -26,33 +31,33 @@ fn computation<F: Float>(
     rhs: &Tensor<Line<F>>,
     output: &mut Tensor<Line<F>>,
 ) {
-    let unit_id = UNIT_POS_X;
-    let mut lhs_smem = SharedMemory::<F>::new_lined(16u32, 1u32);
-    let mut rhs_smem = SharedMemory::<F>::new_lined(16u32, 1u32);
+    let mut lhs_smem = SharedMemory::<F>::new_lined(2u32, 1u32);
+    let mut rhs_smem = SharedMemory::<F>::new_lined(2u32, 1u32);
 
     let pipeline = Pipeline::new();
 
-    // Load Lhs to SMEM
-    pipeline.memcpy_async(
-        lhs.slice(unit_id * 2u32, unit_id * 2u32 + 2u32),
-        lhs_smem.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32),
-    );
+    pipeline.producer_acquire();
 
-    // Load Rhs to SMEM
-    pipeline.memcpy_async(
-        rhs.slice(unit_id * 2u32, unit_id * 2u32 + 2u32),
-        rhs_smem.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32),
-    );
+    let start = 0u32;
+    let end = 2u32;
+    if UNIT_POS_X == 0 {
+        pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end));
+    } else {
+        // pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
+    }
+
+    pipeline.producer_commit();
 
-    // sync
     sync_units();
 
+    pipeline.consumer_wait();
     // Perform matmul on SMEM
     tile_computation(
-        lhs_smem.slice(unit_id * 2u32, unit_id * 2u32 + 2u32),
-        rhs_smem.slice(unit_id * 2u32, unit_id * 2u32 + 2u32),
-        output.slice_mut(unit_id * 2u32, unit_id * 2u32 + 2u32),
+        lhs_smem.slice(start, end),
+        rhs_smem.slice(start, end),
+        output.slice_mut(start, end),
     );
+    pipeline.consumer_release();
 }
 
 pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
@@ -63,17 +68,15 @@ pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
     //     return;
     // }
 
-    let lhs = client
-        .create(as_bytes![F: 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.]);
-    let rhs = client
-        .create(as_bytes![F: 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.]);
-    let output = client.empty(16 * core::mem::size_of::<F>());
+    let lhs = client.create(as_bytes![F: 10., 11., 12., 13., 14., 15.]);
+    let rhs = client.create(as_bytes![F: 10., 11., 12., 13.]);
+    let output = client.empty(4 * core::mem::size_of::<F>());
 
     unsafe {
         computation::launch::<F, R>(
             &client,
             CubeCount::Static(1, 1, 1),
-            CubeDim::new(8, 1, 1),
+            CubeDim::new(2, 1, 1),
             TensorArg::from_raw_parts::<F>(&lhs, &[4, 1], &[4, 4], 1),
             TensorArg::from_raw_parts::<F>(&rhs, &[4, 1], &[4, 4], 1),
             TensorArg::from_raw_parts::<F>(&output, &[4, 1], &[4, 4], 1),
@@ -83,22 +86,12 @@ pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
     let actual = client.read_one(output.binding());
     let actual = F::from_bytes(&actual);
     let expected = [
-        F::new(0.0),
+        F::new(10.0),
         F::new(11.0),
-        F::new(22.0),
-        F::new(33.0),
-        F::new(44.0),
-        F::new(55.0),
-        F::new(66.0),
-        F::new(77.0),
-        F::new(88.0),
-        F::new(99.0),
-        F::new(110.0),
-        F::new(121.0),
-        F::new(132.0),
-        F::new(143.0),
-        F::new(154.0),
-        F::new(165.0),
+        F::new(12.0),
+        F::new(13.0),
+        F::new(14.0),
+        F::new(15.0),
     ];
 
     assert_eq!(actual, expected);

From 4359ac5baf25a5c0dda2b6fdae331d2981c990c5 Mon Sep 17 00:00:00 2001
From: louisfd <louisfd94@gmail.com>
Date: Mon, 3 Feb 2025 15:57:21 -0500
Subject: [PATCH 03/14] choosable pipeline steps

---
 crates/cubecl-core/src/frontend/pipeline.rs          |  8 ++++----
 crates/cubecl-core/src/runtime_tests/memcpy_async.rs |  2 +-
 crates/cubecl-core/src/runtime_tests/pipeline.rs     |  4 ++--
 crates/cubecl-cpp/src/shared/base.rs                 | 11 +++++++++--
 crates/cubecl-cpp/src/shared/pipeline.rs             |  8 ++++++--
 crates/cubecl-ir/src/allocator.rs                    | 11 +++++++++--
 crates/cubecl-ir/src/scope.rs                        |  4 ++--
 crates/cubecl-ir/src/variable.rs                     |  2 +-
 8 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/crates/cubecl-core/src/frontend/pipeline.rs b/crates/cubecl-core/src/frontend/pipeline.rs
index 15bf05d9a..f46aa37a5 100644
--- a/crates/cubecl-core/src/frontend/pipeline.rs
+++ b/crates/cubecl-core/src/frontend/pipeline.rs
@@ -93,13 +93,13 @@ pub struct PipelineExpand<C: CubePrimitive> {
 
 impl<C: CubePrimitive> Default for Pipeline<C> {
     fn default() -> Self {
-        Self::new()
+        Self::new(1)
     }
 }
 
 impl<C: CubePrimitive> Pipeline<C> {
     /// Create a pipeline instance
-    pub fn new() -> Self {
+    pub fn new(num_steps: u32) -> Self {
         Self { _c: PhantomData }
     }
 
@@ -133,9 +133,9 @@ impl<C: CubePrimitive> Pipeline<C> {
         unexpanded!()
     }
 
-    pub fn __expand_new(scope: &mut Scope) -> PipelineExpand<C> {
+    pub fn __expand_new(scope: &mut Scope, num_steps: u32) -> PipelineExpand<C> {
         let elem = C::as_elem(scope);
-        let variable = scope.create_pipeline(Item::new(elem));
+        let variable = scope.create_pipeline(Item::new(elem), num_steps as u8);
         PipelineExpand {
             elem: variable,
             _c: PhantomData,
diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index ce0f8cc3d..209c5d0b6 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -34,7 +34,7 @@ fn computation<F: Float>(
     let mut lhs_smem = SharedMemory::<F>::new_lined(2u32, 1u32);
     let mut rhs_smem = SharedMemory::<F>::new_lined(2u32, 1u32);
 
-    let pipeline = Pipeline::new();
+    let pipeline = Pipeline::new(1u32);
 
     pipeline.producer_acquire();
 
diff --git a/crates/cubecl-core/src/runtime_tests/pipeline.rs b/crates/cubecl-core/src/runtime_tests/pipeline.rs
index 2c4b95500..a33ee9be1 100644
--- a/crates/cubecl-core/src/runtime_tests/pipeline.rs
+++ b/crates/cubecl-core/src/runtime_tests/pipeline.rs
@@ -12,7 +12,7 @@ fn pipelined_sum<F: Float>(
     let smem_size = 2 * batch_len;
     let num_batches = input.len() / batch_len;
     let mut shared_memory = SharedMemory::<F>::new_lined(smem_size, input.line_size());
-    let pipeline = Pipeline::new();
+    let pipeline = Pipeline::new(2u32);
 
     let mut sum = Line::<F>::empty(input.line_size()).fill(F::new(0.));
 
@@ -63,7 +63,7 @@ fn pipelined_sum<F: Float>(
 
 #[cube(launch)]
 pub fn async_copy_test<F: Float>(input: &Array<Line<F>>, output: &mut Array<Line<F>>) {
-    let pipeline = pipeline::Pipeline::<F>::new();
+    let pipeline = pipeline::Pipeline::<F>::new(2u32);
     let mut smem = SharedMemory::<F>::new_lined(1u32, 1u32);
 
     if UNIT_POS == 0 {
diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs
index ad3bf35e1..8746efb15 100644
--- a/crates/cubecl-cpp/src/shared/base.rs
+++ b/crates/cubecl-cpp/src/shared/base.rs
@@ -1043,14 +1043,21 @@ impl<D: Dialect> CppCompiler<D> {
                     frag: self.compile_matrix(mat),
                 }
             }
-            gpu::VariableKind::Pipeline { id, item } => {
+            gpu::VariableKind::Pipeline {
+                id,
+                item,
+                num_steps,
+            } => {
                 self.pipeline = true;
                 let pipeline = Variable::Pipeline {
                     id,
                     item: self.compile_item(item),
                 };
                 if !self.pipelines.iter().any(|s| s.pipeline_id() == id) {
-                    self.pipelines.push(PipelineOps::Init { pipeline });
+                    self.pipelines.push(PipelineOps::Init {
+                        pipeline,
+                        num_steps,
+                    });
                 }
                 pipeline
             }
diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs
index 34e25e430..54bc2db8d 100644
--- a/crates/cubecl-cpp/src/shared/pipeline.rs
+++ b/crates/cubecl-cpp/src/shared/pipeline.rs
@@ -6,6 +6,7 @@ use super::{Component, Dialect, Variable};
 pub enum PipelineOps<D: Dialect> {
     Init {
         pipeline: Variable<D>,
+        num_steps: u8,
     },
     MemCopyAsync {
         pipeline: Variable<D>,
@@ -53,11 +54,14 @@ impl<D: Dialect> Display for PipelineOps<D> {
 cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline});
                 ")
             }
-            PipelineOps::Init { pipeline } => {
+            PipelineOps::Init {
+                pipeline,
+                num_steps,
+            } => {
                 write!(
                     f,
                     "
-__shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, 2> {pipeline}_state;
+__shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_steps}> {pipeline}_state;
 auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state);
                 "
                 )
diff --git a/crates/cubecl-ir/src/allocator.rs b/crates/cubecl-ir/src/allocator.rs
index 90d6907e0..6d6173579 100644
--- a/crates/cubecl-ir/src/allocator.rs
+++ b/crates/cubecl-ir/src/allocator.rs
@@ -96,9 +96,16 @@ impl Allocator {
         ExpandElement::Plain(variable)
     }
 
-    pub fn create_pipeline(&self, item: Item) -> ExpandElement {
+    pub fn create_pipeline(&self, item: Item, num_steps: u8) -> ExpandElement {
         let id = self.new_local_index();
-        let variable = Variable::new(VariableKind::Pipeline { id, item }, item);
+        let variable = Variable::new(
+            VariableKind::Pipeline {
+                id,
+                item,
+                num_steps,
+            },
+            item,
+        );
         ExpandElement::Plain(variable)
     }
 
diff --git a/crates/cubecl-ir/src/scope.rs b/crates/cubecl-ir/src/scope.rs
index a533c8fc9..876c2df34 100644
--- a/crates/cubecl-ir/src/scope.rs
+++ b/crates/cubecl-ir/src/scope.rs
@@ -110,8 +110,8 @@ impl Scope {
     }
 
     /// Create a new pipeline element.
-    pub fn create_pipeline(&mut self, item: Item) -> ExpandElement {
-        let pipeline = self.allocator.create_pipeline(item);
+    pub fn create_pipeline(&mut self, item: Item, num_steps: u8) -> ExpandElement {
+        let pipeline = self.allocator.create_pipeline(item, num_steps);
         self.add_pipeline(*pipeline);
         pipeline
     }
diff --git a/crates/cubecl-ir/src/variable.rs b/crates/cubecl-ir/src/variable.rs
index 36317aa95..530faa26e 100644
--- a/crates/cubecl-ir/src/variable.rs
+++ b/crates/cubecl-ir/src/variable.rs
@@ -59,7 +59,7 @@ pub enum VariableKind {
     Matrix { id: Id, mat: Matrix },
     Slice { id: Id },
     Builtin(Builtin),
-    Pipeline { id: Id, item: Item },
+    Pipeline { id: Id, item: Item, num_steps: u8 },
 }
 
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]

From 1b3d940f45602131f588e18bf16cc4886aa70e33 Mon Sep 17 00:00:00 2001
From: louisfd <louisfd@gmail.com>
Date: Tue, 4 Feb 2025 09:22:26 -0500
Subject: [PATCH 04/14] fix pipeline

---
 crates/cubecl-core/src/frontend/context.rs    | 159 ++++++++++++++++++
 crates/cubecl-core/src/frontend/pipeline.rs   |  27 ++-
 .../src/runtime_tests/memcpy_async.rs         | 114 +++++++++----
 crates/cubecl-cpp/src/shared/base.rs          |   4 +-
 crates/cubecl-cpp/src/shared/pipeline.rs      |   8 +-
 crates/cubecl-ir/src/allocator.rs             |   4 +-
 crates/cubecl-ir/src/scope.rs                 |   4 +-
 crates/cubecl-ir/src/variable.rs              |   2 +-
 8 files changed, 271 insertions(+), 51 deletions(-)
 create mode 100644 crates/cubecl-core/src/frontend/context.rs

diff --git a/crates/cubecl-core/src/frontend/context.rs b/crates/cubecl-core/src/frontend/context.rs
new file mode 100644
index 000000000..284e14712
--- /dev/null
+++ b/crates/cubecl-core/src/frontend/context.rs
@@ -0,0 +1,159 @@
+use crate::ir::Id;
+use crate::ir::{self, Elem, Instruction, Item, Scope, Variable, VariableKind};
+use alloc::rc::Rc;
+use core::cell::RefCell;
+use cubecl_ir::ExpandElement;
+use cubecl_runtime::debug::DebugLogger;
+use std::any::TypeId;
+use std::collections::HashMap;
+
+pub struct CubeContext {
+    pub root: Rc<RefCell<Scope>>,
+    pub scope: Rc<RefCell<Scope>>,
+    pub debug_enabled: bool,
+    pub typemap: Rc<RefCell<HashMap<TypeId, Elem>>>,
+}
+
+impl Default for CubeContext {
+    fn default() -> Self {
+        Self::root()
+    }
+}
+
+impl CubeContext {
+    /// Create a new cube context, with a root scope
+    /// A root scope is at the root of a compute shader
+    /// Therefore there is one cube context per shader
+    /// The allocator will define the strategy for creating local intermediates and mutable variables
+    pub fn root() -> CubeContext {
+        let root = Rc::new(RefCell::new(Scope::root()));
+        let typemap = Rc::new(RefCell::new(HashMap::new()));
+        let scope = root.clone();
+
+        Self {
+            scope,
+            root,
+            debug_enabled: DebugLogger::default().is_activated(),
+            typemap,
+        }
+    }
+
+    pub fn register<O: Into<Instruction>>(&mut self, op: O) {
+        self.scope.borrow_mut().register(op)
+    }
+
+    /// Resolve the element type of the given generic type.
+    pub fn resolve_elem<T: 'static>(&self) -> Option<Elem> {
+        let map = self.typemap.borrow();
+        let result = map.get(&TypeId::of::<T>());
+
+        result.cloned()
+    }
+
+    /// Register the element type for the given generic type.
+    pub fn register_elem<T: 'static>(&mut self, elem: Elem) {
+        let mut map = self.typemap.borrow_mut();
+
+        map.insert(TypeId::of::<T>(), elem);
+    }
+
+    pub fn child(&mut self) -> CubeContext {
+        let scope = self.scope.borrow_mut().child();
+
+        Self {
+            scope: Rc::new(RefCell::new(scope)),
+            root: self.root.clone(),
+            debug_enabled: self.debug_enabled,
+            typemap: self.typemap.clone(),
+        }
+    }
+
+    pub fn into_scope(self) -> Scope {
+        core::mem::drop(self.root);
+
+        Rc::into_inner(self.scope)
+            .expect("Only one reference")
+            .into_inner()
+    }
+
+    /// Create a new mutable local variable.
+    pub fn create_local_mut(&mut self, item: Item) -> ExpandElement {
+        let local = self.scope.borrow().allocator.create_local_mut(item);
+        self.scope.borrow_mut().add_local_mut(*local);
+        local
+    }
+
+    /// Create a new immutable local variable.
+    pub fn create_local(&mut self, item: Item) -> ExpandElement {
+        self.scope.borrow().allocator.create_local(item)
+    }
+
+    /// Create a new immutable local binding that must never be a reused variable, regardless of
+    /// allocator
+    pub fn create_local_restricted(&mut self, item: Item) -> ExpandElement {
+        self.scope.borrow().allocator.create_local_restricted(item)
+    }
+
+    /// Create a new matrix element.
+    pub fn create_matrix(&mut self, matrix: ir::Matrix) -> ExpandElement {
+        let matrix = self.scope.borrow().allocator.create_matrix(matrix);
+        self.scope.borrow_mut().add_matrix(*matrix);
+        matrix
+    }
+
+    /// Create a new pipeline element.
+    pub fn create_pipeline(&mut self, item: Item, num_stages: u8) -> ExpandElement {
+        let pipeline = self
+            .scope
+            .borrow()
+            .allocator
+            .create_pipeline(item, num_stages);
+        self.scope.borrow_mut().add_pipeline(*pipeline);
+        pipeline
+    }
+
+    /// Create a new slice element.
+    pub fn create_slice(&mut self, item: Item) -> ExpandElement {
+        let slice = self.scope.borrow().allocator.create_slice(item);
+        self.scope.borrow_mut().add_slice(*slice);
+        slice
+    }
+
+    pub fn create_shared(&mut self, item: Item, size: u32) -> ExpandElement {
+        ExpandElement::Plain(self.root.borrow_mut().create_shared(item, size))
+    }
+
+    pub fn create_local_array(&mut self, item: Item, size: u32) -> ExpandElement {
+        let local_array: ExpandElement =
+            self.root.borrow().allocator.create_local_array(item, size);
+        self.root.borrow_mut().add_local_array(*local_array);
+        local_array
+    }
+
+    pub fn create_const_array(&mut self, item: Item, data: Vec<Variable>) -> ExpandElement {
+        ExpandElement::Plain(self.root.borrow_mut().create_const_array(item, data))
+    }
+
+    /// Obtain the index-th input
+    pub fn input(&mut self, id: Id, item: Item) -> ExpandElement {
+        ExpandElement::Plain(crate::ir::Variable::new(
+            VariableKind::GlobalInputArray(id),
+            item,
+        ))
+    }
+
+    /// Obtain the index-th output
+    pub fn output(&mut self, id: Id, item: Item) -> ExpandElement {
+        let var = crate::ir::Variable::new(VariableKind::GlobalOutputArray(id), item);
+        self.scope.borrow_mut().write_global_custom(var);
+        ExpandElement::Plain(var)
+    }
+
+    /// Obtain the index-th scalar
+    pub fn scalar(&self, id: Id, elem: Elem) -> ExpandElement {
+        ExpandElement::Plain(crate::ir::Variable::new(
+            VariableKind::GlobalScalar(id),
+            Item::new(elem),
+        ))
+    }
+}
diff --git a/crates/cubecl-core/src/frontend/pipeline.rs b/crates/cubecl-core/src/frontend/pipeline.rs
index f46aa37a5..847fa3248 100644
--- a/crates/cubecl-core/src/frontend/pipeline.rs
+++ b/crates/cubecl-core/src/frontend/pipeline.rs
@@ -76,14 +76,33 @@ use crate::{
     unexpanded,
 };
 
-use super::{CubePrimitive, ExpandElementTyped, Line, Slice, SliceMut};
+use super::{
+    CubePrimitive, CubeType, ExpandElementTyped, Init, IntoRuntime, Line, Slice, SliceMut,
+};
 
 /// A mechanism for managing a sequence of `memcpy_async`
 /// For now, it only works at the Cube scope
+#[derive(Clone, Copy)]
 pub struct Pipeline<C: CubePrimitive> {
     _c: PhantomData<C>,
 }
 
+impl<C: CubePrimitive> IntoRuntime for Pipeline<C> {
+    fn __expand_runtime_method(self, _scope: &mut Scope) -> Self::ExpandType {
+        panic!("Doesn't exist at runtime")
+    }
+}
+
+impl<C: CubePrimitive> CubeType for Pipeline<C> {
+    type ExpandType = PipelineExpand<C>;
+}
+
+impl<C: CubePrimitive> Init for PipelineExpand<C> {
+    fn init(self, _scope: &mut Scope) -> Self {
+        self
+    }
+}
+
 #[derive(Clone)]
 /// Expand type of [Pipeline]
 pub struct PipelineExpand<C: CubePrimitive> {
@@ -99,7 +118,7 @@ impl<C: CubePrimitive> Default for Pipeline<C> {
 
 impl<C: CubePrimitive> Pipeline<C> {
     /// Create a pipeline instance
-    pub fn new(num_steps: u32) -> Self {
+    pub fn new(_num_stages: u32) -> Self {
         Self { _c: PhantomData }
     }
 
@@ -133,9 +152,9 @@ impl<C: CubePrimitive> Pipeline<C> {
         unexpanded!()
     }
 
-    pub fn __expand_new(scope: &mut Scope, num_steps: u32) -> PipelineExpand<C> {
+    pub fn __expand_new(scope: &mut Scope, num_stages: u32) -> PipelineExpand<C> {
         let elem = C::as_elem(scope);
-        let variable = scope.create_pipeline(Item::new(elem), num_steps as u8);
+        let variable = scope.create_pipeline(Item::new(elem), num_stages as u8);
         PipelineExpand {
             elem: variable,
             _c: PhantomData,
diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index 209c5d0b6..013c78063 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -3,19 +3,52 @@ use cubecl::prelude::*;
 use pipeline::Pipeline;
 
 #[cube]
-fn tile_computation<F: Float>(
+fn tile_computation_1<F: Float>(
     lhs: Slice<Line<F>>,
     rhs: Slice<Line<F>>,
     mut out: SliceMut<Line<F>>,
 ) {
-    if UNIT_POS_X == 0 {
-        for i in 0..2 {
-            out[i] = lhs[i];
-        }
-        for i in 0..2 {
-            out[i + 2] = rhs[i];
-        }
+    for i in 0..2 {
+        out[i] = Line::cast_from(10u32) * lhs[i];
+    }
+    for i in 0..2 {
+        out[i] += rhs[i];
+    }
+}
+
+#[cube]
+fn tile_computation_2<F: Float>(
+    lhs: &SharedMemory<Line<F>>,
+    rhs: &SharedMemory<Line<F>>,
+    out: &mut Tensor<Line<F>>,
+    start: u32,
+    end: u32,
+) {
+    for i in start..end {
+        out[i] = Line::cast_from(10u32) * lhs[i];
+    }
+    for i in start..end {
+        out[i] += rhs[i];
+    }
+}
+
+#[cube]
+fn tile_computation_3<F: Float>(
+    lhs: &SharedMemory<Line<F>>,
+    rhs: &SharedMemory<Line<F>>,
+    out: &mut Tensor<Line<F>>,
+    start: u32,
+    end: u32,
+    pipeline: Pipeline<F>,
+) {
+    pipeline.consumer_wait();
+    for i in start..end {
+        out[i] = Line::cast_from(10u32) * lhs[i];
+    }
+    for i in start..end {
+        out[i] += rhs[i];
     }
+    pipeline.consumer_release();
 }
 
 #[cube]
@@ -31,33 +64,47 @@ fn computation<F: Float>(
     rhs: &Tensor<Line<F>>,
     output: &mut Tensor<Line<F>>,
 ) {
-    let mut lhs_smem = SharedMemory::<F>::new_lined(2u32, 1u32);
-    let mut rhs_smem = SharedMemory::<F>::new_lined(2u32, 1u32);
+    let mut lhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
+    let mut rhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
 
-    let pipeline = Pipeline::new(1u32);
+    let pipeline = Pipeline::new(2u32);
 
-    pipeline.producer_acquire();
+    let start = UNIT_POS_X * 2u32;
+    let end = start + 2u32;
 
-    let start = 0u32;
-    let end = 2u32;
-    if UNIT_POS_X == 0 {
-        pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end));
-    } else {
-        // pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
-    }
+    pipeline.producer_acquire();
+    pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end));
+    // memcpy_sync(lhs.slice(start, end), lhs_smem.slice_mut(start, end));
+    pipeline.producer_commit();
 
+    pipeline.producer_acquire();
+    pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
+    // memcpy_sync(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
     pipeline.producer_commit();
 
-    sync_units();
+    // tile_computation_0: inline
+    // pipeline.consumer_wait();
+    // for i in start..end {
+    //     output[i] = Line::cast_from(10u32) * lhs[i];
+    // }
+    // for i in start..end {
+    //     output[i] += rhs[i];
+    // }
+    // pipeline.consumer_release();
 
-    pipeline.consumer_wait();
-    // Perform matmul on SMEM
-    tile_computation(
-        lhs_smem.slice(start, end),
-        rhs_smem.slice(start, end),
-        output.slice_mut(start, end),
-    );
-    pipeline.consumer_release();
+    // pipeline.consumer_wait();
+    // tile_computation_1(
+    //     lhs_smem.slice(start, end),
+    //     rhs_smem.slice(start, end),
+    //     output.slice_mut(start, end),
+    // );
+    // pipeline.consumer_release();
+
+    // pipeline.consumer_wait();
+    // tile_computation_2(&lhs_smem, &rhs_smem, output, start, end);
+    // pipeline.consumer_release();
+
+    tile_computation_3(&lhs_smem, &rhs_smem, output, start, end, pipeline);
 }
 
 pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
@@ -68,7 +115,7 @@ pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
     //     return;
     // }
 
-    let lhs = client.create(as_bytes![F: 10., 11., 12., 13., 14., 15.]);
+    let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]);
     let rhs = client.create(as_bytes![F: 10., 11., 12., 13.]);
     let output = client.empty(4 * core::mem::size_of::<F>());
 
@@ -85,14 +132,7 @@ pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
 
     let actual = client.read_one(output.binding());
     let actual = F::from_bytes(&actual);
-    let expected = [
-        F::new(10.0),
-        F::new(11.0),
-        F::new(12.0),
-        F::new(13.0),
-        F::new(14.0),
-        F::new(15.0),
-    ];
+    let expected = [F::new(110.0), F::new(121.0), F::new(132.0), F::new(143.0)];
 
     assert_eq!(actual, expected);
 }
diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs
index 8746efb15..e583ddae8 100644
--- a/crates/cubecl-cpp/src/shared/base.rs
+++ b/crates/cubecl-cpp/src/shared/base.rs
@@ -1046,7 +1046,7 @@ impl<D: Dialect> CppCompiler<D> {
             gpu::VariableKind::Pipeline {
                 id,
                 item,
-                num_steps,
+                num_stages,
             } => {
                 self.pipeline = true;
                 let pipeline = Variable::Pipeline {
@@ -1056,7 +1056,7 @@ impl<D: Dialect> CppCompiler<D> {
                 if !self.pipelines.iter().any(|s| s.pipeline_id() == id) {
                     self.pipelines.push(PipelineOps::Init {
                         pipeline,
-                        num_steps,
+                        num_stages,
                     });
                 }
                 pipeline
diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs
index 54bc2db8d..3f7e81afa 100644
--- a/crates/cubecl-cpp/src/shared/pipeline.rs
+++ b/crates/cubecl-cpp/src/shared/pipeline.rs
@@ -6,7 +6,7 @@ use super::{Component, Dialect, Variable};
 pub enum PipelineOps<D: Dialect> {
     Init {
         pipeline: Variable<D>,
-        num_steps: u8,
+        num_stages: u8,
     },
     MemCopyAsync {
         pipeline: Variable<D>,
@@ -56,12 +56,14 @@ cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {sour
             }
             PipelineOps::Init {
                 pipeline,
-                num_steps,
+                num_stages,
             } => {
+                // cuda::thread_scope::thread_scope_block -> sync only within thread_block, no gmem sync
+                // cooperative_groups::this_thread_block() -> each thread in threadblock has its pipeline instance
                 write!(
                     f,
                     "
-__shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_steps}> {pipeline}_state;
+__shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
 auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state);
                 "
                 )
diff --git a/crates/cubecl-ir/src/allocator.rs b/crates/cubecl-ir/src/allocator.rs
index 6d6173579..6a4115d5b 100644
--- a/crates/cubecl-ir/src/allocator.rs
+++ b/crates/cubecl-ir/src/allocator.rs
@@ -96,13 +96,13 @@ impl Allocator {
         ExpandElement::Plain(variable)
     }
 
-    pub fn create_pipeline(&self, item: Item, num_steps: u8) -> ExpandElement {
+    pub fn create_pipeline(&self, item: Item, num_stages: u8) -> ExpandElement {
         let id = self.new_local_index();
         let variable = Variable::new(
             VariableKind::Pipeline {
                 id,
                 item,
-                num_steps,
+                num_stages,
             },
             item,
         );
diff --git a/crates/cubecl-ir/src/scope.rs b/crates/cubecl-ir/src/scope.rs
index 876c2df34..a3f0e4540 100644
--- a/crates/cubecl-ir/src/scope.rs
+++ b/crates/cubecl-ir/src/scope.rs
@@ -110,8 +110,8 @@ impl Scope {
     }
 
     /// Create a new pipeline element.
-    pub fn create_pipeline(&mut self, item: Item, num_steps: u8) -> ExpandElement {
-        let pipeline = self.allocator.create_pipeline(item, num_steps);
+    pub fn create_pipeline(&mut self, item: Item, num_stages: u8) -> ExpandElement {
+        let pipeline = self.allocator.create_pipeline(item, num_stages);
         self.add_pipeline(*pipeline);
         pipeline
     }
diff --git a/crates/cubecl-ir/src/variable.rs b/crates/cubecl-ir/src/variable.rs
index 530faa26e..c1c3e850a 100644
--- a/crates/cubecl-ir/src/variable.rs
+++ b/crates/cubecl-ir/src/variable.rs
@@ -59,7 +59,7 @@ pub enum VariableKind {
     Matrix { id: Id, mat: Matrix },
     Slice { id: Id },
     Builtin(Builtin),
-    Pipeline { id: Id, item: Item, num_steps: u8 },
+    Pipeline { id: Id, item: Item, num_stages: u8 },
 }
 
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]

From e1a4b5e24974fab720a9a1ae67db31ed18d6251d Mon Sep 17 00:00:00 2001
From: louisfd <louisfd@gmail.com>
Date: Tue, 4 Feb 2025 11:11:15 -0500
Subject: [PATCH 05/14] wip

---
 .../src/runtime_tests/memcpy_async.rs         | 32 +++++++++++--------
 crates/cubecl-cpp/src/shared/pipeline.rs      |  6 ++--
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index 013c78063..5518b50a6 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -61,11 +61,11 @@ fn memcpy_sync<F: Float>(source: Slice<Line<F>>, mut destination: SliceMut<Line<
 #[cube(launch)]
 fn computation<F: Float>(
     lhs: &Tensor<Line<F>>,
-    rhs: &Tensor<Line<F>>,
+    // rhs: &Tensor<Line<F>>,
     output: &mut Tensor<Line<F>>,
 ) {
     let mut lhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
-    let mut rhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
+    // let mut rhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
 
     let pipeline = Pipeline::new(2u32);
 
@@ -74,21 +74,24 @@ fn computation<F: Float>(
 
     pipeline.producer_acquire();
     pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end));
-    // memcpy_sync(lhs.slice(start, end), lhs_smem.slice_mut(start, end));
     pipeline.producer_commit();
 
-    pipeline.producer_acquire();
-    pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
-    // memcpy_sync(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
-    pipeline.producer_commit();
+    pipeline.consumer_wait();
+    for i in start..end {
+        output[i] = lhs_smem[i];
+    }
+    pipeline.consumer_release();
+
+    // pipeline.producer_acquire();
+    // pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
+    // // memcpy_sync(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
+    // pipeline.producer_commit();
 
     // tile_computation_0: inline
+
     // pipeline.consumer_wait();
     // for i in start..end {
-    //     output[i] = Line::cast_from(10u32) * lhs[i];
-    // }
-    // for i in start..end {
-    //     output[i] += rhs[i];
+    //     output[i] += rhs_smem[i];
     // }
     // pipeline.consumer_release();
 
@@ -104,7 +107,7 @@ fn computation<F: Float>(
     // tile_computation_2(&lhs_smem, &rhs_smem, output, start, end);
     // pipeline.consumer_release();
 
-    tile_computation_3(&lhs_smem, &rhs_smem, output, start, end, pipeline);
+    // tile_computation_3(&lhs_smem, &rhs_smem, output, start, end, pipeline);
 }
 
 pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
@@ -123,9 +126,9 @@ pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
         computation::launch::<F, R>(
             &client,
             CubeCount::Static(1, 1, 1),
-            CubeDim::new(2, 1, 1),
+            CubeDim::new(1, 1, 1),
             TensorArg::from_raw_parts::<F>(&lhs, &[4, 1], &[4, 4], 1),
-            TensorArg::from_raw_parts::<F>(&rhs, &[4, 1], &[4, 4], 1),
+            // TensorArg::from_raw_parts::<F>(&rhs, &[4, 1], &[4, 4], 1),
             TensorArg::from_raw_parts::<F>(&output, &[4, 1], &[4, 4], 1),
         )
     };
@@ -135,6 +138,7 @@ pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
     let expected = [F::new(110.0), F::new(121.0), F::new(132.0), F::new(143.0)];
 
     assert_eq!(actual, expected);
+    assert!(false);
 }
 
 #[allow(missing_docs)]
diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs
index 3f7e81afa..921efb224 100644
--- a/crates/cubecl-cpp/src/shared/pipeline.rs
+++ b/crates/cubecl-cpp/src/shared/pipeline.rs
@@ -51,7 +51,7 @@ impl<D: Dialect> Display for PipelineOps<D> {
                 let item = source.item();
                 let size = item.elem().size() * item.vectorization;
                 write!(f, "
-cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline});
+cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline});
                 ")
             }
             PipelineOps::Init {
@@ -63,8 +63,8 @@ cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {sour
                 write!(
                     f,
                     "
-__shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
-auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state);
+__shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_thread, {num_stages}> {pipeline}_state;
+auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state);
                 "
                 )
             }

From a12fbc692147f9725034debeb9c5ee05f8ea10e1 Mon Sep 17 00:00:00 2001
From: louisfd <louisfd94@gmail.com>
Date: Tue, 4 Feb 2025 11:13:51 -0500
Subject: [PATCH 06/14] cleanup example

---
 .../src/runtime_tests/memcpy_async.rs         | 95 +------------------
 1 file changed, 3 insertions(+), 92 deletions(-)

diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index 5518b50a6..911d6612d 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -1,56 +1,7 @@
-use crate::{self as cubecl, as_bytes, runtime_tests::memcpy_async, Feature};
+use crate::{self as cubecl, as_bytes};
 use cubecl::prelude::*;
 use pipeline::Pipeline;
 
-#[cube]
-fn tile_computation_1<F: Float>(
-    lhs: Slice<Line<F>>,
-    rhs: Slice<Line<F>>,
-    mut out: SliceMut<Line<F>>,
-) {
-    for i in 0..2 {
-        out[i] = Line::cast_from(10u32) * lhs[i];
-    }
-    for i in 0..2 {
-        out[i] += rhs[i];
-    }
-}
-
-#[cube]
-fn tile_computation_2<F: Float>(
-    lhs: &SharedMemory<Line<F>>,
-    rhs: &SharedMemory<Line<F>>,
-    out: &mut Tensor<Line<F>>,
-    start: u32,
-    end: u32,
-) {
-    for i in start..end {
-        out[i] = Line::cast_from(10u32) * lhs[i];
-    }
-    for i in start..end {
-        out[i] += rhs[i];
-    }
-}
-
-#[cube]
-fn tile_computation_3<F: Float>(
-    lhs: &SharedMemory<Line<F>>,
-    rhs: &SharedMemory<Line<F>>,
-    out: &mut Tensor<Line<F>>,
-    start: u32,
-    end: u32,
-    pipeline: Pipeline<F>,
-) {
-    pipeline.consumer_wait();
-    for i in start..end {
-        out[i] = Line::cast_from(10u32) * lhs[i];
-    }
-    for i in start..end {
-        out[i] += rhs[i];
-    }
-    pipeline.consumer_release();
-}
-
 #[cube]
 fn memcpy_sync<F: Float>(source: Slice<Line<F>>, mut destination: SliceMut<Line<F>>) {
     for i in 0..source.len() {
@@ -59,13 +10,8 @@ fn memcpy_sync<F: Float>(source: Slice<Line<F>>, mut destination: SliceMut<Line<
 }
 
 #[cube(launch)]
-fn computation<F: Float>(
-    lhs: &Tensor<Line<F>>,
-    // rhs: &Tensor<Line<F>>,
-    output: &mut Tensor<Line<F>>,
-) {
+fn computation<F: Float>(lhs: &Tensor<Line<F>>, output: &mut Tensor<Line<F>>) {
     let mut lhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
-    // let mut rhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
 
     let pipeline = Pipeline::new(2u32);
 
@@ -81,45 +27,12 @@ fn computation<F: Float>(
         output[i] = lhs_smem[i];
     }
     pipeline.consumer_release();
-
-    // pipeline.producer_acquire();
-    // pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
-    // // memcpy_sync(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
-    // pipeline.producer_commit();
-
-    // tile_computation_0: inline
-
-    // pipeline.consumer_wait();
-    // for i in start..end {
-    //     output[i] += rhs_smem[i];
-    // }
-    // pipeline.consumer_release();
-
-    // pipeline.consumer_wait();
-    // tile_computation_1(
-    //     lhs_smem.slice(start, end),
-    //     rhs_smem.slice(start, end),
-    //     output.slice_mut(start, end),
-    // );
-    // pipeline.consumer_release();
-
-    // pipeline.consumer_wait();
-    // tile_computation_2(&lhs_smem, &rhs_smem, output, start, end);
-    // pipeline.consumer_release();
-
-    // tile_computation_3(&lhs_smem, &rhs_smem, output, start, end, pipeline);
 }
 
 pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
     client: ComputeClient<R::Server, R::Channel>,
 ) {
-    // if !client.properties().feature_enabled(Feature::Pipeline) {
-    //     // We can't execute the test, skip.
-    //     return;
-    // }
-
     let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]);
-    let rhs = client.create(as_bytes![F: 10., 11., 12., 13.]);
     let output = client.empty(4 * core::mem::size_of::<F>());
 
     unsafe {
@@ -128,17 +41,15 @@ pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
             CubeCount::Static(1, 1, 1),
             CubeDim::new(1, 1, 1),
             TensorArg::from_raw_parts::<F>(&lhs, &[4, 1], &[4, 4], 1),
-            // TensorArg::from_raw_parts::<F>(&rhs, &[4, 1], &[4, 4], 1),
             TensorArg::from_raw_parts::<F>(&output, &[4, 1], &[4, 4], 1),
         )
     };
 
     let actual = client.read_one(output.binding());
     let actual = F::from_bytes(&actual);
-    let expected = [F::new(110.0), F::new(121.0), F::new(132.0), F::new(143.0)];
+    let expected = [F::new(10.0), F::new(11.0), F::new(12.0), F::new(13.0)];
 
     assert_eq!(actual, expected);
-    assert!(false);
 }
 
 #[allow(missing_docs)]

From c6de6142127231cf6752080c238fb9ef93df9786 Mon Sep 17 00:00:00 2001
From: louisfd <louisfd94@gmail.com>
Date: Tue, 4 Feb 2025 11:44:39 -0500
Subject: [PATCH 07/14] pipeline group enum

---
 crates/cubecl-core/src/frontend/pipeline.rs   | 24 +++++++--
 .../src/runtime_tests/memcpy_async.rs         |  4 +-
 .../cubecl-core/src/runtime_tests/pipeline.rs |  4 +-
 crates/cubecl-cpp/src/shared/base.rs          | 25 ++++++---
 crates/cubecl-cpp/src/shared/pipeline.rs      | 52 ++++++++++++++-----
 crates/cubecl-ir/src/allocator.rs             |  5 +-
 crates/cubecl-ir/src/scope.rs                 | 11 +++-
 crates/cubecl-ir/src/variable.rs              | 44 ++++++++++++----
 8 files changed, 128 insertions(+), 41 deletions(-)

diff --git a/crates/cubecl-core/src/frontend/pipeline.rs b/crates/cubecl-core/src/frontend/pipeline.rs
index 847fa3248..f30f4cdb8 100644
--- a/crates/cubecl-core/src/frontend/pipeline.rs
+++ b/crates/cubecl-core/src/frontend/pipeline.rs
@@ -112,13 +112,18 @@ pub struct PipelineExpand<C: CubePrimitive> {
 
 impl<C: CubePrimitive> Default for Pipeline<C> {
     fn default() -> Self {
-        Self::new(1)
+        Self::new(1, PipelineGroup::Unit)
     }
 }
 
+pub enum PipelineGroup {
+    Unit,
+    Cube,
+}
+
 impl<C: CubePrimitive> Pipeline<C> {
     /// Create a pipeline instance
-    pub fn new(_num_stages: u32) -> Self {
+    pub fn new(_num_stages: u32, _group: PipelineGroup) -> Self {
         Self { _c: PhantomData }
     }
 
@@ -152,9 +157,20 @@ impl<C: CubePrimitive> Pipeline<C> {
         unexpanded!()
     }
 
-    pub fn __expand_new(scope: &mut Scope, num_stages: u32) -> PipelineExpand<C> {
+    pub fn __expand_new(
+        scope: &mut Scope,
+        num_stages: u32,
+        pipeline_group: PipelineGroup,
+    ) -> PipelineExpand<C> {
         let elem = C::as_elem(scope);
-        let variable = scope.create_pipeline(Item::new(elem), num_stages as u8);
+        let variable = scope.create_pipeline(
+            Item::new(elem),
+            num_stages as u8,
+            match pipeline_group {
+                PipelineGroup::Unit => 0,
+                PipelineGroup::Cube => 1,
+            },
+        );
         PipelineExpand {
             elem: variable,
             _c: PhantomData,
diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index 911d6612d..658283a4c 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -1,6 +1,6 @@
 use crate::{self as cubecl, as_bytes};
 use cubecl::prelude::*;
-use pipeline::Pipeline;
+use pipeline::{Pipeline, PipelineGroup};
 
 #[cube]
 fn memcpy_sync<F: Float>(source: Slice<Line<F>>, mut destination: SliceMut<Line<F>>) {
@@ -13,7 +13,7 @@ fn memcpy_sync<F: Float>(source: Slice<Line<F>>, mut destination: SliceMut<Line<
 fn computation<F: Float>(lhs: &Tensor<Line<F>>, output: &mut Tensor<Line<F>>) {
     let mut lhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
 
-    let pipeline = Pipeline::new(2u32);
+    let pipeline = Pipeline::new(2u32, PipelineGroup::Unit);
 
     let start = UNIT_POS_X * 2u32;
     let end = start + 2u32;
diff --git a/crates/cubecl-core/src/runtime_tests/pipeline.rs b/crates/cubecl-core/src/runtime_tests/pipeline.rs
index a33ee9be1..315a9ec10 100644
--- a/crates/cubecl-core/src/runtime_tests/pipeline.rs
+++ b/crates/cubecl-core/src/runtime_tests/pipeline.rs
@@ -12,7 +12,7 @@ fn pipelined_sum<F: Float>(
     let smem_size = 2 * batch_len;
     let num_batches = input.len() / batch_len;
     let mut shared_memory = SharedMemory::<F>::new_lined(smem_size, input.line_size());
-    let pipeline = Pipeline::new(2u32);
+    let pipeline = Pipeline::new(2u32, pipeline::PipelineGroup::Cube);
 
     let mut sum = Line::<F>::empty(input.line_size()).fill(F::new(0.));
 
@@ -63,7 +63,7 @@ fn pipelined_sum<F: Float>(
 
 #[cube(launch)]
 pub fn async_copy_test<F: Float>(input: &Array<Line<F>>, output: &mut Array<Line<F>>) {
-    let pipeline = pipeline::Pipeline::<F>::new(2u32);
+    let pipeline = pipeline::Pipeline::<F>::new(2u32, pipeline::PipelineGroup::Cube);
     let mut smem = SharedMemory::<F>::new_lined(1u32, 1u32);
 
     if UNIT_POS == 0 {
diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs
index e583ddae8..03f57d870 100644
--- a/crates/cubecl-cpp/src/shared/base.rs
+++ b/crates/cubecl-cpp/src/shared/base.rs
@@ -357,13 +357,22 @@ impl<D: Dialect> CppCompiler<D> {
                     source,
                     destination,
                 } => {
-                    instructions.push(Instruction::Pipeline(
-                        super::pipeline::PipelineOps::MemCopyAsync {
-                            pipeline: self.compile_variable(pipeline),
-                            source: self.compile_variable(source),
-                            destination: self.compile_variable(destination),
-                        },
-                    ));
+                    if let gpu::VariableKind::Pipeline {
+                        id: _,
+                        item: _,
+                        num_stages: _,
+                        pipeline_group,
+                    } = pipeline.kind
+                    {
+                        instructions.push(Instruction::Pipeline(
+                            super::pipeline::PipelineOps::MemCopyAsync {
+                                pipeline: self.compile_variable(pipeline),
+                                source: self.compile_variable(source),
+                                destination: self.compile_variable(destination),
+                                pipeline_group,
+                            },
+                        ));
+                    }
                 }
                 gpu::PipelineOps::ProducerAcquire { pipeline } => instructions.push(
                     Instruction::Pipeline(super::pipeline::PipelineOps::ProducerAcquire {
@@ -1047,6 +1056,7 @@ impl<D: Dialect> CppCompiler<D> {
                 id,
                 item,
                 num_stages,
+                pipeline_group,
             } => {
                 self.pipeline = true;
                 let pipeline = Variable::Pipeline {
@@ -1057,6 +1067,7 @@ impl<D: Dialect> CppCompiler<D> {
                     self.pipelines.push(PipelineOps::Init {
                         pipeline,
                         num_stages,
+                        pipeline_group,
                     });
                 }
                 pipeline
diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs
index 921efb224..63fe3166d 100644
--- a/crates/cubecl-cpp/src/shared/pipeline.rs
+++ b/crates/cubecl-cpp/src/shared/pipeline.rs
@@ -7,11 +7,13 @@ pub enum PipelineOps<D: Dialect> {
     Init {
         pipeline: Variable<D>,
         num_stages: u8,
+        pipeline_group: u8,
     },
     MemCopyAsync {
         pipeline: Variable<D>,
         source: Variable<D>,
         destination: Variable<D>,
+        pipeline_group: u8,
     },
     ProducerAcquire {
         pipeline: Variable<D>,
@@ -47,27 +49,49 @@ impl<D: Dialect> Display for PipelineOps<D> {
                 pipeline,
                 source,
                 destination,
+                pipeline_group,
             } => {
                 let item = source.item();
                 let size = item.elem().size() * item.vectorization;
-                write!(f, "
-cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline});
-                ")
+                match pipeline_group {
+                    0 => {
+                        write!(f, "
+                        cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline});
+                                        ")
+                    }
+                    1 => {
+                        write!(f, "
+                        cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline});
+                                        ")
+                    }
+                    _ => unreachable!(),
+                }
             }
             PipelineOps::Init {
                 pipeline,
                 num_stages,
-            } => {
-                // cuda::thread_scope::thread_scope_block -> sync only within thread_block, no gmem sync
-                // cooperative_groups::this_thread_block() -> each thread in threadblock has its pipeline instance
-                write!(
-                    f,
-                    "
-__shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_thread, {num_stages}> {pipeline}_state;
-auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state);
-                "
-                )
-            }
+                pipeline_group,
+            } => match pipeline_group {
+                0 => {
+                    write!(
+                            f,
+                            "
+        __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_thread, {num_stages}> {pipeline}_state;
+        auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state);
+                        "
+                        )
+                }
+                1 => {
+                    write!(
+                            f,
+                            "
+        __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
+        auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state);
+                        "
+                        )
+                }
+                _ => unreachable!(),
+            },
             PipelineOps::ProducerAcquire { pipeline } => {
                 write!(
                     f,
diff --git a/crates/cubecl-ir/src/allocator.rs b/crates/cubecl-ir/src/allocator.rs
index 6a4115d5b..fcbe86503 100644
--- a/crates/cubecl-ir/src/allocator.rs
+++ b/crates/cubecl-ir/src/allocator.rs
@@ -4,6 +4,8 @@ use core::cell::RefCell;
 use hashbrown::HashMap;
 use portable_atomic::{AtomicU32, Ordering};
 
+use crate::pipeline;
+
 use super::{Item, Matrix, Variable, VariableKind};
 
 /// An allocator for local variables of a kernel.
@@ -96,13 +98,14 @@ impl Allocator {
         ExpandElement::Plain(variable)
     }
 
-    pub fn create_pipeline(&self, item: Item, num_stages: u8) -> ExpandElement {
+    pub fn create_pipeline(&self, item: Item, num_stages: u8, pipeline_group: u8) -> ExpandElement {
         let id = self.new_local_index();
         let variable = Variable::new(
             VariableKind::Pipeline {
                 id,
                 item,
                 num_stages,
+                pipeline_group,
             },
             item,
         );
diff --git a/crates/cubecl-ir/src/scope.rs b/crates/cubecl-ir/src/scope.rs
index a3f0e4540..751d5a37e 100644
--- a/crates/cubecl-ir/src/scope.rs
+++ b/crates/cubecl-ir/src/scope.rs
@@ -110,8 +110,15 @@ impl Scope {
     }
 
     /// Create a new pipeline element.
-    pub fn create_pipeline(&mut self, item: Item, num_stages: u8) -> ExpandElement {
-        let pipeline = self.allocator.create_pipeline(item, num_stages);
+    pub fn create_pipeline(
+        &mut self,
+        item: Item,
+        num_stages: u8,
+        pipeline_group: u8,
+    ) -> ExpandElement {
+        let pipeline = self
+            .allocator
+            .create_pipeline(item, num_stages, pipeline_group);
         self.add_pipeline(*pipeline);
         pipeline
     }
diff --git a/crates/cubecl-ir/src/variable.rs b/crates/cubecl-ir/src/variable.rs
index c1c3e850a..d44c9e9a1 100644
--- a/crates/cubecl-ir/src/variable.rs
+++ b/crates/cubecl-ir/src/variable.rs
@@ -49,17 +49,43 @@ pub enum VariableKind {
     GlobalInputArray(Id),
     GlobalOutputArray(Id),
     GlobalScalar(Id),
-    LocalArray { id: Id, length: u32 },
-    LocalMut { id: Id },
-    LocalConst { id: Id },
-    Versioned { id: Id, version: u16 },
+    LocalArray {
+        id: Id,
+        length: u32,
+    },
+    LocalMut {
+        id: Id,
+    },
+    LocalConst {
+        id: Id,
+    },
+    Versioned {
+        id: Id,
+        version: u16,
+    },
     ConstantScalar(ConstantScalarValue),
-    ConstantArray { id: Id, length: u32 },
-    SharedMemory { id: Id, length: u32 },
-    Matrix { id: Id, mat: Matrix },
-    Slice { id: Id },
+    ConstantArray {
+        id: Id,
+        length: u32,
+    },
+    SharedMemory {
+        id: Id,
+        length: u32,
+    },
+    Matrix {
+        id: Id,
+        mat: Matrix,
+    },
+    Slice {
+        id: Id,
+    },
     Builtin(Builtin),
-    Pipeline { id: Id, item: Item, num_stages: u8 },
+    Pipeline {
+        id: Id,
+        item: Item,
+        num_stages: u8,
+        pipeline_group: u8,
+    },
 }
 
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]

From 2b6b65f74e77100a44db9cd88b947e5a6131719f Mon Sep 17 00:00:00 2001
From: louisfd <louisfd@gmail.com>
Date: Tue, 4 Feb 2025 12:05:40 -0500
Subject: [PATCH 08/14] works

---
 .../src/runtime_tests/memcpy_async.rs         | 82 ++++++++++++++++---
 crates/cubecl-cpp/src/shared/pipeline.rs      |  2 +-
 2 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index 658283a4c..919cc341e 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -2,44 +2,67 @@ use crate::{self as cubecl, as_bytes};
 use cubecl::prelude::*;
 use pipeline::{Pipeline, PipelineGroup};
 
-#[cube]
-fn memcpy_sync<F: Float>(source: Slice<Line<F>>, mut destination: SliceMut<Line<F>>) {
-    for i in 0..source.len() {
-        destination[i] = source[i];
+// TODO delete
+// #[cube]
+// fn memcpy_sync<F: Float>(source: Slice<Line<F>>, mut destination: SliceMut<Line<F>>) {
+//     for i in 0..source.len() {
+//         destination[i] = source[i];
+//     }
+// }
+
+#[cube(launch)]
+fn one_load<F: Float>(lhs: &Tensor<Line<F>>, output: &mut Tensor<Line<F>>) {
+    let mut lhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
+
+    let pipeline = Pipeline::new(1u32, PipelineGroup::Unit);
+
+    let start = UNIT_POS_X * 2u32;
+    let end = start + 2u32;
+
+    pipeline.producer_acquire();
+    pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end));
+    pipeline.producer_commit();
+
+    pipeline.consumer_wait();
+    for i in start..end {
+        output[i] = lhs_smem[i];
     }
+    pipeline.consumer_release();
 }
 
 #[cube(launch)]
-fn computation<F: Float>(lhs: &Tensor<Line<F>>, output: &mut Tensor<Line<F>>) {
+fn two_loads<F: Float>(lhs: &Tensor<Line<F>>, rhs: &Tensor<Line<F>>, output: &mut Tensor<Line<F>>) {
     let mut lhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
+    let mut rhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
 
-    let pipeline = Pipeline::new(2u32, PipelineGroup::Unit);
+    let pipeline = Pipeline::new(1u32, PipelineGroup::Unit);
 
     let start = UNIT_POS_X * 2u32;
     let end = start + 2u32;
 
     pipeline.producer_acquire();
     pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end));
+    pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
     pipeline.producer_commit();
 
     pipeline.consumer_wait();
     for i in start..end {
-        output[i] = lhs_smem[i];
+        output[i] = lhs_smem[i] + rhs_smem[i];
     }
     pipeline.consumer_release();
 }
 
-pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
+pub fn test_memcpy_one_load<R: Runtime, F: Float + CubeElement>(
     client: ComputeClient<R::Server, R::Channel>,
 ) {
     let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]);
     let output = client.empty(4 * core::mem::size_of::<F>());
 
     unsafe {
-        computation::launch::<F, R>(
+        one_load::launch::<F, R>(
             &client,
             CubeCount::Static(1, 1, 1),
-            CubeDim::new(1, 1, 1),
+            CubeDim::new(2, 1, 1),
             TensorArg::from_raw_parts::<F>(&lhs, &[4, 1], &[4, 4], 1),
             TensorArg::from_raw_parts::<F>(&output, &[4, 1], &[4, 4], 1),
         )
@@ -52,6 +75,31 @@ pub fn test_memcpy<R: Runtime, F: Float + CubeElement>(
     assert_eq!(actual, expected);
 }
 
+pub fn test_memcpy_two_loads<R: Runtime, F: Float + CubeElement>(
+    client: ComputeClient<R::Server, R::Channel>,
+) {
+    let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]);
+    let rhs = client.create(as_bytes![F: 20., 21., 22., 23.]);
+    let output = client.empty(4 * core::mem::size_of::<F>());
+
+    unsafe {
+        two_loads::launch::<F, R>(
+            &client,
+            CubeCount::Static(1, 1, 1),
+            CubeDim::new(2, 1, 1),
+            TensorArg::from_raw_parts::<F>(&lhs, &[4, 1], &[4, 4], 1),
+            TensorArg::from_raw_parts::<F>(&rhs, &[4, 1], &[4, 4], 1),
+            TensorArg::from_raw_parts::<F>(&output, &[4, 1], &[4, 4], 1),
+        )
+    };
+
+    let actual = client.read_one(output.binding());
+    let actual = F::from_bytes(&actual);
+    let expected = [F::new(30.0), F::new(32.0), F::new(34.0), F::new(36.0)];
+
+    assert_eq!(actual, expected);
+}
+
 #[allow(missing_docs)]
 #[macro_export]
 macro_rules! testgen_memcpy_async {
@@ -59,9 +107,19 @@ macro_rules! testgen_memcpy_async {
         use super::*;
 
         #[test]
-        fn test_memcpy_async() {
+        fn test_memcpy_async_one_load() {
+            let client = TestRuntime::client(&Default::default());
+            cubecl_core::runtime_tests::memcpy_async::test_memcpy_one_load::<TestRuntime, FloatType>(
+                client,
+            );
+        }
+
+        #[test]
+        fn test_memcpy_async_two_loads() {
             let client = TestRuntime::client(&Default::default());
-            cubecl_core::runtime_tests::memcpy_async::test_memcpy::<TestRuntime, FloatType>(client);
+            cubecl_core::runtime_tests::memcpy_async::test_memcpy_two_loads::<TestRuntime, FloatType>(
+                client,
+            );
         }
     };
 }
diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs
index 63fe3166d..000d53e8c 100644
--- a/crates/cubecl-cpp/src/shared/pipeline.rs
+++ b/crates/cubecl-cpp/src/shared/pipeline.rs
@@ -76,7 +76,7 @@ impl<D: Dialect> Display for PipelineOps<D> {
                     write!(
                             f,
                             "
-        __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_thread, {num_stages}> {pipeline}_state;
+        cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
         auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state);
                         "
                         )

From 4bef2d0fdc79e8d82e37e925b68ade5a8344359a Mon Sep 17 00:00:00 2001
From: louisfd <louisfd@gmail.com>
Date: Wed, 5 Feb 2025 09:12:57 -0500
Subject: [PATCH 09/14] tests

---
 .../src/runtime_tests/memcpy_async.rs         | 162 ++++++++++++++----
 crates/cubecl-cpp/src/shared/pipeline.rs      |  18 +-
 crates/cubecl-ir/src/allocator.rs             |   2 -
 3 files changed, 142 insertions(+), 40 deletions(-)

diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index 919cc341e..0b656ccd1 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -2,14 +2,6 @@ use crate::{self as cubecl, as_bytes};
 use cubecl::prelude::*;
 use pipeline::{Pipeline, PipelineGroup};
 
-// TODO delete
-// #[cube]
-// fn memcpy_sync<F: Float>(source: Slice<Line<F>>, mut destination: SliceMut<Line<F>>) {
-//     for i in 0..source.len() {
-//         destination[i] = source[i];
-//     }
-// }
-
 #[cube(launch)]
 fn one_load<F: Float>(lhs: &Tensor<Line<F>>, output: &mut Tensor<Line<F>>) {
     let mut lhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
@@ -31,14 +23,19 @@ fn one_load<F: Float>(lhs: &Tensor<Line<F>>, output: &mut Tensor<Line<F>>) {
 }
 
 #[cube(launch)]
-fn two_loads<F: Float>(lhs: &Tensor<Line<F>>, rhs: &Tensor<Line<F>>, output: &mut Tensor<Line<F>>) {
-    let mut lhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
-    let mut rhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
+fn two_loads<F: Float>(
+    lhs: &Tensor<Line<F>>,
+    rhs: &Tensor<Line<F>>,
+    output: &mut Tensor<Line<F>>,
+    #[comptime] num_data: u32, // should be even
+) {
+    let mut lhs_smem = SharedMemory::<F>::new_lined(num_data, 1u32);
+    let mut rhs_smem = SharedMemory::<F>::new_lined(num_data, 1u32);
 
     let pipeline = Pipeline::new(1u32, PipelineGroup::Unit);
 
-    let start = UNIT_POS_X * 2u32;
-    let end = start + 2u32;
+    let start = UNIT_POS_X * num_data / 2;
+    let end = start + num_data / 2;
 
     pipeline.producer_acquire();
     pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end));
@@ -46,10 +43,57 @@ fn two_loads<F: Float>(lhs: &Tensor<Line<F>>, rhs: &Tensor<Line<F>>, output: &mu
     pipeline.producer_commit();
 
     pipeline.consumer_wait();
+    let mut dot = Line::cast_from(0u32);
     for i in start..end {
-        output[i] = lhs_smem[i] + rhs_smem[i];
+        dot += lhs_smem[i] * rhs_smem[i];
     }
     pipeline.consumer_release();
+
+    output[UNIT_POS_X] = dot;
+}
+
+#[cube(launch)]
+fn two_independant_loads<F: Float>(
+    lhs: &Tensor<Line<F>>,
+    rhs: &Tensor<Line<F>>,
+    output: &mut Tensor<Line<F>>,
+    #[comptime] num_data: u32,
+) {
+    let mut lhs_smem = SharedMemory::<F>::new_lined(num_data, 1u32);
+    let mut rhs_smem = SharedMemory::<F>::new_lined(num_data, 1u32);
+
+    let pipeline = Pipeline::new(2u32, PipelineGroup::Unit);
+
+    let start = UNIT_POS_X * num_data / 2;
+    let end = start + num_data / 2;
+
+    for i in start..end {
+        lhs_smem[i] = Line::cast_from(0u32);
+        rhs_smem[i] = Line::cast_from(0u32);
+        output[i] = Line::cast_from(0u32);
+    }
+
+    pipeline.producer_acquire();
+    pipeline.memcpy_async(lhs.slice(start, end), lhs_smem.slice_mut(start, end));
+    pipeline.producer_commit();
+
+    pipeline.producer_acquire();
+    pipeline.memcpy_async(rhs.slice(start, end), rhs_smem.slice_mut(start, end));
+    pipeline.producer_commit();
+
+    let mut dot = Line::cast_from(0u32);
+
+    pipeline.consumer_wait();
+    pipeline.consumer_wait();
+    pipeline.consumer_wait();
+    for i in start..end {
+        dot += lhs_smem[i] * rhs_smem[i];
+    }
+    pipeline.consumer_release();
+    pipeline.consumer_release();
+    pipeline.consumer_release();
+
+    output[UNIT_POS_X] = dot;
 }
 
 pub fn test_memcpy_one_load<R: Runtime, F: Float + CubeElement>(
@@ -76,30 +120,80 @@ pub fn test_memcpy_one_load<R: Runtime, F: Float + CubeElement>(
 }
 
 pub fn test_memcpy_two_loads<R: Runtime, F: Float + CubeElement>(
+    independant: bool,
     client: ComputeClient<R::Server, R::Channel>,
 ) {
-    let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]);
-    let rhs = client.create(as_bytes![F: 20., 21., 22., 23.]);
-    let output = client.empty(4 * core::mem::size_of::<F>());
-
-    unsafe {
-        two_loads::launch::<F, R>(
-            &client,
-            CubeCount::Static(1, 1, 1),
-            CubeDim::new(2, 1, 1),
-            TensorArg::from_raw_parts::<F>(&lhs, &[4, 1], &[4, 4], 1),
-            TensorArg::from_raw_parts::<F>(&rhs, &[4, 1], &[4, 4], 1),
-            TensorArg::from_raw_parts::<F>(&output, &[4, 1], &[4, 4], 1),
-        )
-    };
+    let num_data = 4;
+    let lhs_data = generate_random_data(num_data, 42);
+    let rhs_data = generate_random_data(num_data, 43);
+
+    let lhs = client.create(F::as_bytes(&lhs_data));
+    let rhs = client.create(F::as_bytes(&rhs_data));
+    let output = client.empty(2 * core::mem::size_of::<F>());
+
+    if independant {
+        unsafe {
+            two_independant_loads::launch::<F, R>(
+                &client,
+                CubeCount::Static(1, 1, 1),
+                CubeDim::new(2, 1, 1),
+                TensorArg::from_raw_parts::<F>(&lhs, &[1], &[num_data], 1),
+                TensorArg::from_raw_parts::<F>(&rhs, &[1], &[num_data], 1),
+                TensorArg::from_raw_parts::<F>(&output, &[1], &[2], 1),
+                num_data as u32,
+            )
+        };
+    } else {
+        unsafe {
+            two_loads::launch::<F, R>(
+                &client,
+                CubeCount::Static(1, 1, 1),
+                CubeDim::new(2, 1, 1),
+                TensorArg::from_raw_parts::<F>(&lhs, &[1], &[num_data], 1),
+                TensorArg::from_raw_parts::<F>(&rhs, &[1], &[num_data], 1),
+                TensorArg::from_raw_parts::<F>(&output, &[1], &[2], 1),
+                num_data as u32,
+            )
+        };
+    }
 
     let actual = client.read_one(output.binding());
     let actual = F::from_bytes(&actual);
-    let expected = [F::new(30.0), F::new(32.0), F::new(34.0), F::new(36.0)];
+
+    let middle = num_data / 2;
+    let expected = [
+        dot(&lhs_data[..middle], &rhs_data[..middle]),
+        dot(&lhs_data[middle..], &rhs_data[middle..]),
+    ];
 
     assert_eq!(actual, expected);
 }
 
+fn dot<F: Float>(vec1: &[F], vec2: &[F]) -> F {
+    let mut sum = F::from_int(0);
+    for i in 0..vec1.len() {
+        sum += vec1[i] * vec2[i];
+    }
+    sum
+}
+
+// TODO tmp
+pub(crate) fn generate_random_data<F: Float + CubeElement>(
+    num_elements: usize,
+    mut seed: u64,
+) -> Vec<F> {
+    fn lcg(seed: &mut u64) -> f32 {
+        const A: u64 = 1664525;
+        const C: u64 = 1013904223;
+        const M: f64 = 2u64.pow(32) as f64;
+
+        *seed = (A.wrapping_mul(*seed).wrapping_add(C)) % (1u64 << 32);
+        (*seed as f64 / M * 2.0 - 1.0) as f32
+    }
+
+    (0..num_elements).map(|_| F::new(lcg(&mut seed))).collect()
+}
+
 #[allow(missing_docs)]
 #[macro_export]
 macro_rules! testgen_memcpy_async {
@@ -118,6 +212,16 @@ macro_rules! testgen_memcpy_async {
         fn test_memcpy_async_two_loads() {
             let client = TestRuntime::client(&Default::default());
             cubecl_core::runtime_tests::memcpy_async::test_memcpy_two_loads::<TestRuntime, FloatType>(
+                false,
+                client,
+            );
+        }
+
+        #[test]
+        fn test_memcpy_async_two_independant_loads() {
+            let client = TestRuntime::client(&Default::default());
+            cubecl_core::runtime_tests::memcpy_async::test_memcpy_two_loads::<TestRuntime, FloatType>(
+                true,
                 client,
             );
         }
diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs
index 000d53e8c..bdf2b926a 100644
--- a/crates/cubecl-cpp/src/shared/pipeline.rs
+++ b/crates/cubecl-cpp/src/shared/pipeline.rs
@@ -56,12 +56,12 @@ impl<D: Dialect> Display for PipelineOps<D> {
                 match pipeline_group {
                     0 => {
                         write!(f, "
-                        cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline});
+cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline});
                                         ")
                     }
                     1 => {
                         write!(f, "
-                        cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline});
+cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline});
                                         ")
                     }
                     _ => unreachable!(),
@@ -74,19 +74,19 @@ impl<D: Dialect> Display for PipelineOps<D> {
             } => match pipeline_group {
                 0 => {
                     write!(
-                            f,
-                            "
-        cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
-        auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state);
+                        f,
                         "
-                        )
+cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
+auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state);
+                        "
+                    )
                 }
                 1 => {
                     write!(
                             f,
                             "
-        __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
-        auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state);
+__shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
+auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state);
                         "
                         )
                 }
diff --git a/crates/cubecl-ir/src/allocator.rs b/crates/cubecl-ir/src/allocator.rs
index fcbe86503..83675f417 100644
--- a/crates/cubecl-ir/src/allocator.rs
+++ b/crates/cubecl-ir/src/allocator.rs
@@ -4,8 +4,6 @@ use core::cell::RefCell;
 use hashbrown::HashMap;
 use portable_atomic::{AtomicU32, Ordering};
 
-use crate::pipeline;
-
 use super::{Item, Matrix, Variable, VariableKind};
 
 /// An allocator for local variables of a kernel.

From be16046d0153fdd6137ed642c42d943a93cf3aaf Mon Sep 17 00:00:00 2001
From: louisfd <louisfd94@gmail.com>
Date: Wed, 5 Feb 2025 09:17:56 -0500
Subject: [PATCH 10/14] cleaning

---
 crates/cubecl-core/src/frontend/context.rs    | 159 ------------------
 .../src/runtime_tests/memcpy_async.rs         |  21 +--
 2 files changed, 2 insertions(+), 178 deletions(-)
 delete mode 100644 crates/cubecl-core/src/frontend/context.rs

diff --git a/crates/cubecl-core/src/frontend/context.rs b/crates/cubecl-core/src/frontend/context.rs
deleted file mode 100644
index 284e14712..000000000
--- a/crates/cubecl-core/src/frontend/context.rs
+++ /dev/null
@@ -1,159 +0,0 @@
-use crate::ir::Id;
-use crate::ir::{self, Elem, Instruction, Item, Scope, Variable, VariableKind};
-use alloc::rc::Rc;
-use core::cell::RefCell;
-use cubecl_ir::ExpandElement;
-use cubecl_runtime::debug::DebugLogger;
-use std::any::TypeId;
-use std::collections::HashMap;
-
-pub struct CubeContext {
-    pub root: Rc<RefCell<Scope>>,
-    pub scope: Rc<RefCell<Scope>>,
-    pub debug_enabled: bool,
-    pub typemap: Rc<RefCell<HashMap<TypeId, Elem>>>,
-}
-
-impl Default for CubeContext {
-    fn default() -> Self {
-        Self::root()
-    }
-}
-
-impl CubeContext {
-    /// Create a new cube context, with a root scope
-    /// A root scope is at the root of a compute shader
-    /// Therefore there is one cube context per shader
-    /// The allocator will define the strategy for creating local intermediates and mutable variables
-    pub fn root() -> CubeContext {
-        let root = Rc::new(RefCell::new(Scope::root()));
-        let typemap = Rc::new(RefCell::new(HashMap::new()));
-        let scope = root.clone();
-
-        Self {
-            scope,
-            root,
-            debug_enabled: DebugLogger::default().is_activated(),
-            typemap,
-        }
-    }
-
-    pub fn register<O: Into<Instruction>>(&mut self, op: O) {
-        self.scope.borrow_mut().register(op)
-    }
-
-    /// Resolve the element type of the given generic type.
-    pub fn resolve_elem<T: 'static>(&self) -> Option<Elem> {
-        let map = self.typemap.borrow();
-        let result = map.get(&TypeId::of::<T>());
-
-        result.cloned()
-    }
-
-    /// Register the element type for the given generic type.
-    pub fn register_elem<T: 'static>(&mut self, elem: Elem) {
-        let mut map = self.typemap.borrow_mut();
-
-        map.insert(TypeId::of::<T>(), elem);
-    }
-
-    pub fn child(&mut self) -> CubeContext {
-        let scope = self.scope.borrow_mut().child();
-
-        Self {
-            scope: Rc::new(RefCell::new(scope)),
-            root: self.root.clone(),
-            debug_enabled: self.debug_enabled,
-            typemap: self.typemap.clone(),
-        }
-    }
-
-    pub fn into_scope(self) -> Scope {
-        core::mem::drop(self.root);
-
-        Rc::into_inner(self.scope)
-            .expect("Only one reference")
-            .into_inner()
-    }
-
-    /// Create a new mutable local variable.
-    pub fn create_local_mut(&mut self, item: Item) -> ExpandElement {
-        let local = self.scope.borrow().allocator.create_local_mut(item);
-        self.scope.borrow_mut().add_local_mut(*local);
-        local
-    }
-
-    /// Create a new immutable local variable.
-    pub fn create_local(&mut self, item: Item) -> ExpandElement {
-        self.scope.borrow().allocator.create_local(item)
-    }
-
-    /// Create a new immutable local binding that must never be a reused variable, regardless of
-    /// allocator
-    pub fn create_local_restricted(&mut self, item: Item) -> ExpandElement {
-        self.scope.borrow().allocator.create_local_restricted(item)
-    }
-
-    /// Create a new matrix element.
-    pub fn create_matrix(&mut self, matrix: ir::Matrix) -> ExpandElement {
-        let matrix = self.scope.borrow().allocator.create_matrix(matrix);
-        self.scope.borrow_mut().add_matrix(*matrix);
-        matrix
-    }
-
-    /// Create a new pipeline element.
-    pub fn create_pipeline(&mut self, item: Item, num_stages: u8) -> ExpandElement {
-        let pipeline = self
-            .scope
-            .borrow()
-            .allocator
-            .create_pipeline(item, num_stages);
-        self.scope.borrow_mut().add_pipeline(*pipeline);
-        pipeline
-    }
-
-    /// Create a new slice element.
-    pub fn create_slice(&mut self, item: Item) -> ExpandElement {
-        let slice = self.scope.borrow().allocator.create_slice(item);
-        self.scope.borrow_mut().add_slice(*slice);
-        slice
-    }
-
-    pub fn create_shared(&mut self, item: Item, size: u32) -> ExpandElement {
-        ExpandElement::Plain(self.root.borrow_mut().create_shared(item, size))
-    }
-
-    pub fn create_local_array(&mut self, item: Item, size: u32) -> ExpandElement {
-        let local_array: ExpandElement =
-            self.root.borrow().allocator.create_local_array(item, size);
-        self.root.borrow_mut().add_local_array(*local_array);
-        local_array
-    }
-
-    pub fn create_const_array(&mut self, item: Item, data: Vec<Variable>) -> ExpandElement {
-        ExpandElement::Plain(self.root.borrow_mut().create_const_array(item, data))
-    }
-
-    /// Obtain the index-th input
-    pub fn input(&mut self, id: Id, item: Item) -> ExpandElement {
-        ExpandElement::Plain(crate::ir::Variable::new(
-            VariableKind::GlobalInputArray(id),
-            item,
-        ))
-    }
-
-    /// Obtain the index-th output
-    pub fn output(&mut self, id: Id, item: Item) -> ExpandElement {
-        let var = crate::ir::Variable::new(VariableKind::GlobalOutputArray(id), item);
-        self.scope.borrow_mut().write_global_custom(var);
-        ExpandElement::Plain(var)
-    }
-
-    /// Obtain the index-th scalar
-    pub fn scalar(&self, id: Id, elem: Elem) -> ExpandElement {
-        ExpandElement::Plain(crate::ir::Variable::new(
-            VariableKind::GlobalScalar(id),
-            Item::new(elem),
-        ))
-    }
-}
diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index 0b656ccd1..3e01770d2 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -124,8 +124,8 @@ pub fn test_memcpy_two_loads<R: Runtime, F: Float + CubeElement>(
     client: ComputeClient<R::Server, R::Channel>,
 ) {
     let num_data = 4;
-    let lhs_data = generate_random_data(num_data, 42);
-    let rhs_data = generate_random_data(num_data, 43);
+    let lhs_data: Vec<F> = (0..num_data).map(|i| F::new(i as f32)).collect();
+    let rhs_data: Vec<F> = (0..num_data).map(|i| F::new(i as f32)).collect();
 
     let lhs = client.create(F::as_bytes(&lhs_data));
     let rhs = client.create(F::as_bytes(&rhs_data));
@@ -177,23 +177,6 @@ fn dot<F: Float>(vec1: &[F], vec2: &[F]) -> F {
     sum
 }
 
-// TODO tmp
-pub(crate) fn generate_random_data<F: Float + CubeElement>(
-    num_elements: usize,
-    mut seed: u64,
-) -> Vec<F> {
-    fn lcg(seed: &mut u64) -> f32 {
-        const A: u64 = 1664525;
-        const C: u64 = 1013904223;
-        const M: f64 = 2u64.pow(32) as f64;
-
-        *seed = (A.wrapping_mul(*seed).wrapping_add(C)) % (1u64 << 32);
-        (*seed as f64 / M * 2.0 - 1.0) as f32
-    }
-
-    (0..num_elements).map(|_| F::new(lcg(&mut seed))).collect()
-}
-
 #[allow(missing_docs)]
 #[macro_export]
 macro_rules! testgen_memcpy_async {

From db3cb51903b1446d1a47b8aedccbc7b6f0c99962 Mon Sep 17 00:00:00 2001
From: louisfd <louisfd@gmail.com>
Date: Wed, 5 Feb 2025 09:35:53 -0500
Subject: [PATCH 11/14] unit pipeline group

---
 crates/cubecl-core/src/runtime_tests/pipeline.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/cubecl-core/src/runtime_tests/pipeline.rs b/crates/cubecl-core/src/runtime_tests/pipeline.rs
index 315a9ec10..9e754c47e 100644
--- a/crates/cubecl-core/src/runtime_tests/pipeline.rs
+++ b/crates/cubecl-core/src/runtime_tests/pipeline.rs
@@ -12,7 +12,7 @@ fn pipelined_sum<F: Float>(
     let smem_size = 2 * batch_len;
     let num_batches = input.len() / batch_len;
     let mut shared_memory = SharedMemory::<F>::new_lined(smem_size, input.line_size());
-    let pipeline = Pipeline::new(2u32, pipeline::PipelineGroup::Cube);
+    let pipeline = Pipeline::new(2u32, pipeline::PipelineGroup::Unit);
 
     let mut sum = Line::<F>::empty(input.line_size()).fill(F::new(0.));
 
@@ -63,7 +63,7 @@ fn pipelined_sum<F: Float>(
 
 #[cube(launch)]
 pub fn async_copy_test<F: Float>(input: &Array<Line<F>>, output: &mut Array<Line<F>>) {
-    let pipeline = pipeline::Pipeline::<F>::new(2u32, pipeline::PipelineGroup::Cube);
+    let pipeline = pipeline::Pipeline::<F>::new(2u32, pipeline::PipelineGroup::Unit);
     let mut smem = SharedMemory::<F>::new_lined(1u32, 1u32);
 
     if UNIT_POS == 0 {

From da5ea7b049c9a4b02ce2b22830bd8c4de67baa79 Mon Sep 17 00:00:00 2001
From: louisfd <louisfd94@gmail.com>
Date: Wed, 5 Feb 2025 09:40:00 -0500
Subject: [PATCH 12/14] remove pipeline group

---
 crates/cubecl-core/src/frontend/pipeline.rs | 15 +------
 crates/cubecl-cpp/src/shared/base.rs        |  4 --
 crates/cubecl-cpp/src/shared/pipeline.rs    | 40 ++++---------------
 crates/cubecl-ir/src/allocator.rs           |  3 +-
 crates/cubecl-ir/src/scope.rs               | 11 +-----
 crates/cubecl-ir/src/variable.rs            | 44 +++++----------------
 6 files changed, 21 insertions(+), 96 deletions(-)

diff --git a/crates/cubecl-core/src/frontend/pipeline.rs b/crates/cubecl-core/src/frontend/pipeline.rs
index f30f4cdb8..7aff0d532 100644
--- a/crates/cubecl-core/src/frontend/pipeline.rs
+++ b/crates/cubecl-core/src/frontend/pipeline.rs
@@ -157,20 +157,9 @@ impl<C: CubePrimitive> Pipeline<C> {
         unexpanded!()
     }
 
-    pub fn __expand_new(
-        scope: &mut Scope,
-        num_stages: u32,
-        pipeline_group: PipelineGroup,
-    ) -> PipelineExpand<C> {
+    pub fn __expand_new(scope: &mut Scope, num_stages: u32) -> PipelineExpand<C> {
         let elem = C::as_elem(scope);
-        let variable = scope.create_pipeline(
-            Item::new(elem),
-            num_stages as u8,
-            match pipeline_group {
-                PipelineGroup::Unit => 0,
-                PipelineGroup::Cube => 1,
-            },
-        );
+        let variable = scope.create_pipeline(Item::new(elem), num_stages as u8);
         PipelineExpand {
             elem: variable,
             _c: PhantomData,
diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs
index 03f57d870..bab3bc642 100644
--- a/crates/cubecl-cpp/src/shared/base.rs
+++ b/crates/cubecl-cpp/src/shared/base.rs
@@ -361,7 +361,6 @@ impl<D: Dialect> CppCompiler<D> {
                         id: _,
                         item: _,
                         num_stages: _,
-                        pipeline_group,
                     } = pipeline.kind
                     {
                         instructions.push(Instruction::Pipeline(
@@ -369,7 +368,6 @@ impl<D: Dialect> CppCompiler<D> {
                                 pipeline: self.compile_variable(pipeline),
                                 source: self.compile_variable(source),
                                 destination: self.compile_variable(destination),
-                                pipeline_group,
                             },
                         ));
                     }
@@ -1056,7 +1054,6 @@ impl<D: Dialect> CppCompiler<D> {
                 id,
                 item,
                 num_stages,
-                pipeline_group,
             } => {
                 self.pipeline = true;
                 let pipeline = Variable::Pipeline {
@@ -1067,7 +1064,6 @@ impl<D: Dialect> CppCompiler<D> {
                     self.pipelines.push(PipelineOps::Init {
                         pipeline,
                         num_stages,
-                        pipeline_group,
                     });
                 }
                 pipeline
diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs
index bdf2b926a..9423dad40 100644
--- a/crates/cubecl-cpp/src/shared/pipeline.rs
+++ b/crates/cubecl-cpp/src/shared/pipeline.rs
@@ -7,13 +7,11 @@ pub enum PipelineOps<D: Dialect> {
     Init {
         pipeline: Variable<D>,
         num_stages: u8,
-        pipeline_group: u8,
     },
     MemCopyAsync {
         pipeline: Variable<D>,
         source: Variable<D>,
         destination: Variable<D>,
-        pipeline_group: u8,
     },
     ProducerAcquire {
         pipeline: Variable<D>,
@@ -49,49 +47,25 @@ impl<D: Dialect> Display for PipelineOps<D> {
                 pipeline,
                 source,
                 destination,
-                pipeline_group,
             } => {
                 let item = source.item();
                 let size = item.elem().size() * item.vectorization;
-                match pipeline_group {
-                    0 => {
-                        write!(f, "
+                write!(f, "
 cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline});
                                         ")
-                    }
-                    1 => {
-                        write!(f, "
-cuda::memcpy_async(cooperative_groups::this_thread_block(), {destination}, {source}, {source}_length * {size}, {pipeline});
-                                        ")
-                    }
-                    _ => unreachable!(),
-                }
             }
             PipelineOps::Init {
                 pipeline,
                 num_stages,
-                pipeline_group,
-            } => match pipeline_group {
-                0 => {
-                    write!(
-                        f,
-                        "
+            } => {
+                write!(
+                    f,
+                    "
 cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
 auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state);
                         "
-                    )
-                }
-                1 => {
-                    write!(
-                            f,
-                            "
-__shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
-auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread_block(), &{pipeline}_state);
-                        "
-                        )
-                }
-                _ => unreachable!(),
-            },
+                )
+            }
             PipelineOps::ProducerAcquire { pipeline } => {
                 write!(
                     f,
diff --git a/crates/cubecl-ir/src/allocator.rs b/crates/cubecl-ir/src/allocator.rs
index 83675f417..6a4115d5b 100644
--- a/crates/cubecl-ir/src/allocator.rs
+++ b/crates/cubecl-ir/src/allocator.rs
@@ -96,14 +96,13 @@ impl Allocator {
         ExpandElement::Plain(variable)
     }
 
-    pub fn create_pipeline(&self, item: Item, num_stages: u8, pipeline_group: u8) -> ExpandElement {
+    pub fn create_pipeline(&self, item: Item, num_stages: u8) -> ExpandElement {
         let id = self.new_local_index();
         let variable = Variable::new(
             VariableKind::Pipeline {
                 id,
                 item,
                 num_stages,
-                pipeline_group,
             },
             item,
         );
diff --git a/crates/cubecl-ir/src/scope.rs b/crates/cubecl-ir/src/scope.rs
index 751d5a37e..a3f0e4540 100644
--- a/crates/cubecl-ir/src/scope.rs
+++ b/crates/cubecl-ir/src/scope.rs
@@ -110,15 +110,8 @@ impl Scope {
     }
 
     /// Create a new pipeline element.
-    pub fn create_pipeline(
-        &mut self,
-        item: Item,
-        num_stages: u8,
-        pipeline_group: u8,
-    ) -> ExpandElement {
-        let pipeline = self
-            .allocator
-            .create_pipeline(item, num_stages, pipeline_group);
+    pub fn create_pipeline(&mut self, item: Item, num_stages: u8) -> ExpandElement {
+        let pipeline = self.allocator.create_pipeline(item, num_stages);
         self.add_pipeline(*pipeline);
         pipeline
     }
diff --git a/crates/cubecl-ir/src/variable.rs b/crates/cubecl-ir/src/variable.rs
index d44c9e9a1..c1c3e850a 100644
--- a/crates/cubecl-ir/src/variable.rs
+++ b/crates/cubecl-ir/src/variable.rs
@@ -49,43 +49,17 @@ pub enum VariableKind {
     GlobalInputArray(Id),
     GlobalOutputArray(Id),
     GlobalScalar(Id),
-    LocalArray {
-        id: Id,
-        length: u32,
-    },
-    LocalMut {
-        id: Id,
-    },
-    LocalConst {
-        id: Id,
-    },
-    Versioned {
-        id: Id,
-        version: u16,
-    },
+    LocalArray { id: Id, length: u32 },
+    LocalMut { id: Id },
+    LocalConst { id: Id },
+    Versioned { id: Id, version: u16 },
     ConstantScalar(ConstantScalarValue),
-    ConstantArray {
-        id: Id,
-        length: u32,
-    },
-    SharedMemory {
-        id: Id,
-        length: u32,
-    },
-    Matrix {
-        id: Id,
-        mat: Matrix,
-    },
-    Slice {
-        id: Id,
-    },
+    ConstantArray { id: Id, length: u32 },
+    SharedMemory { id: Id, length: u32 },
+    Matrix { id: Id, mat: Matrix },
+    Slice { id: Id },
     Builtin(Builtin),
-    Pipeline {
-        id: Id,
-        item: Item,
-        num_stages: u8,
-        pipeline_group: u8,
-    },
+    Pipeline { id: Id, item: Item, num_stages: u8 },
 }
 
 #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]

From cc88ab8d5fe0d732f773a1c009f79c2e63c6fbdc Mon Sep 17 00:00:00 2001
From: louisfd <louisfd94@gmail.com>
Date: Wed, 5 Feb 2025 09:44:44 -0500
Subject: [PATCH 13/14] finish removing pipeline group

---
 crates/cubecl-core/src/frontend/pipeline.rs   |  9 ++------
 .../src/runtime_tests/memcpy_async.rs         |  8 +++----
 .../cubecl-core/src/runtime_tests/pipeline.rs |  4 ++--
 crates/cubecl-cpp/src/shared/base.rs          | 21 +++++++------------
 crates/cubecl-cpp/src/shared/pipeline.rs      |  4 ++--
 5 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/crates/cubecl-core/src/frontend/pipeline.rs b/crates/cubecl-core/src/frontend/pipeline.rs
index 7aff0d532..847fa3248 100644
--- a/crates/cubecl-core/src/frontend/pipeline.rs
+++ b/crates/cubecl-core/src/frontend/pipeline.rs
@@ -112,18 +112,13 @@ pub struct PipelineExpand<C: CubePrimitive> {
 
 impl<C: CubePrimitive> Default for Pipeline<C> {
     fn default() -> Self {
-        Self::new(1, PipelineGroup::Unit)
+        Self::new(1)
     }
 }
 
-pub enum PipelineGroup {
-    Unit,
-    Cube,
-}
-
 impl<C: CubePrimitive> Pipeline<C> {
     /// Create a pipeline instance
-    pub fn new(_num_stages: u32, _group: PipelineGroup) -> Self {
+    pub fn new(_num_stages: u32) -> Self {
         Self { _c: PhantomData }
     }
 
diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index 3e01770d2..b389e4373 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -1,12 +1,12 @@
 use crate::{self as cubecl, as_bytes};
 use cubecl::prelude::*;
-use pipeline::{Pipeline, PipelineGroup};
+use pipeline::Pipeline;
 
 #[cube(launch)]
 fn one_load<F: Float>(lhs: &Tensor<Line<F>>, output: &mut Tensor<Line<F>>) {
     let mut lhs_smem = SharedMemory::<F>::new_lined(4u32, 1u32);
 
-    let pipeline = Pipeline::new(1u32, PipelineGroup::Unit);
+    let pipeline = Pipeline::new(1u32);
 
     let start = UNIT_POS_X * 2u32;
     let end = start + 2u32;
@@ -32,7 +32,7 @@ fn two_loads<F: Float>(
     let mut lhs_smem = SharedMemory::<F>::new_lined(num_data, 1u32);
     let mut rhs_smem = SharedMemory::<F>::new_lined(num_data, 1u32);
 
-    let pipeline = Pipeline::new(1u32, PipelineGroup::Unit);
+    let pipeline = Pipeline::new(1u32);
 
     let start = UNIT_POS_X * num_data / 2;
     let end = start + num_data / 2;
@@ -62,7 +62,7 @@ fn two_independant_loads<F: Float>(
     let mut lhs_smem = SharedMemory::<F>::new_lined(num_data, 1u32);
     let mut rhs_smem = SharedMemory::<F>::new_lined(num_data, 1u32);
 
-    let pipeline = Pipeline::new(2u32, PipelineGroup::Unit);
+    let pipeline = Pipeline::new(2u32);
 
     let start = UNIT_POS_X * num_data / 2;
     let end = start + num_data / 2;
diff --git a/crates/cubecl-core/src/runtime_tests/pipeline.rs b/crates/cubecl-core/src/runtime_tests/pipeline.rs
index 9e754c47e..a33ee9be1 100644
--- a/crates/cubecl-core/src/runtime_tests/pipeline.rs
+++ b/crates/cubecl-core/src/runtime_tests/pipeline.rs
@@ -12,7 +12,7 @@ fn pipelined_sum<F: Float>(
     let smem_size = 2 * batch_len;
     let num_batches = input.len() / batch_len;
     let mut shared_memory = SharedMemory::<F>::new_lined(smem_size, input.line_size());
-    let pipeline = Pipeline::new(2u32, pipeline::PipelineGroup::Unit);
+    let pipeline = Pipeline::new(2u32);
 
     let mut sum = Line::<F>::empty(input.line_size()).fill(F::new(0.));
 
@@ -63,7 +63,7 @@ fn pipelined_sum<F: Float>(
 
 #[cube(launch)]
 pub fn async_copy_test<F: Float>(input: &Array<Line<F>>, output: &mut Array<Line<F>>) {
-    let pipeline = pipeline::Pipeline::<F>::new(2u32, pipeline::PipelineGroup::Unit);
+    let pipeline = pipeline::Pipeline::<F>::new(2u32);
     let mut smem = SharedMemory::<F>::new_lined(1u32, 1u32);
 
     if UNIT_POS == 0 {
diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs
index bab3bc642..e583ddae8 100644
--- a/crates/cubecl-cpp/src/shared/base.rs
+++ b/crates/cubecl-cpp/src/shared/base.rs
@@ -357,20 +357,13 @@ impl<D: Dialect> CppCompiler<D> {
                     source,
                     destination,
                 } => {
-                    if let gpu::VariableKind::Pipeline {
-                        id: _,
-                        item: _,
-                        num_stages: _,
-                    } = pipeline.kind
-                    {
-                        instructions.push(Instruction::Pipeline(
-                            super::pipeline::PipelineOps::MemCopyAsync {
-                                pipeline: self.compile_variable(pipeline),
-                                source: self.compile_variable(source),
-                                destination: self.compile_variable(destination),
-                            },
-                        ));
-                    }
+                    instructions.push(Instruction::Pipeline(
+                        super::pipeline::PipelineOps::MemCopyAsync {
+                            pipeline: self.compile_variable(pipeline),
+                            source: self.compile_variable(source),
+                            destination: self.compile_variable(destination),
+                        },
+                    ));
                 }
                 gpu::PipelineOps::ProducerAcquire { pipeline } => instructions.push(
                     Instruction::Pipeline(super::pipeline::PipelineOps::ProducerAcquire {
diff --git a/crates/cubecl-cpp/src/shared/pipeline.rs b/crates/cubecl-cpp/src/shared/pipeline.rs
index 9423dad40..844039b69 100644
--- a/crates/cubecl-cpp/src/shared/pipeline.rs
+++ b/crates/cubecl-cpp/src/shared/pipeline.rs
@@ -52,7 +52,7 @@ impl<D: Dialect> Display for PipelineOps<D> {
                 let size = item.elem().size() * item.vectorization;
                 write!(f, "
 cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {source}_length * {size}, {pipeline});
-                                        ")
+                ")
             }
             PipelineOps::Init {
                 pipeline,
@@ -63,7 +63,7 @@ cuda::memcpy_async(cooperative_groups::this_thread(), {destination}, {source}, {
                     "
 cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, {num_stages}> {pipeline}_state;
 auto {pipeline} = cuda::make_pipeline(cooperative_groups::this_thread(), &{pipeline}_state);
-                        "
+                "
                 )
             }
             PipelineOps::ProducerAcquire { pipeline } => {

From 63f92f1e0d282891f3bb63c3a7744c8c1d1add79 Mon Sep 17 00:00:00 2001
From: louisfd <louisfd94@gmail.com>
Date: Wed, 5 Feb 2025 09:51:45 -0500
Subject: [PATCH 14/14] add guards

---
 crates/cubecl-core/src/runtime_tests/memcpy_async.rs | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
index b389e4373..9936176af 100644
--- a/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
+++ b/crates/cubecl-core/src/runtime_tests/memcpy_async.rs
@@ -1,4 +1,4 @@
-use crate::{self as cubecl, as_bytes};
+use crate::{self as cubecl, as_bytes, Feature};
 use cubecl::prelude::*;
 use pipeline::Pipeline;
 
@@ -99,6 +99,11 @@ fn two_independant_loads<F: Float>(
 pub fn test_memcpy_one_load<R: Runtime, F: Float + CubeElement>(
     client: ComputeClient<R::Server, R::Channel>,
 ) {
+    if !client.properties().feature_enabled(Feature::Pipeline) {
+        // We can't execute the test, skip.
+        return;
+    }
+
     let lhs = client.create(as_bytes![F: 10., 11., 12., 13.]);
     let output = client.empty(4 * core::mem::size_of::<F>());
 
@@ -123,6 +128,11 @@ pub fn test_memcpy_two_loads<R: Runtime, F: Float + CubeElement>(
     independant: bool,
     client: ComputeClient<R::Server, R::Channel>,
 ) {
+    if !client.properties().feature_enabled(Feature::Pipeline) {
+        // We can't execute the test, skip.
+        return;
+    }
+
     let num_data = 4;
     let lhs_data: Vec<F> = (0..num_data).map(|i| F::new(i as f32)).collect();
     let rhs_data: Vec<F> = (0..num_data).map(|i| F::new(i as f32)).collect();