[Feat] Add new burn-vision crate with one initial op #2753

Open · wants to merge 30 commits into main

Changes from 28 of 30 commits

Commits
e7a84b9
Update cubecl
wingertge Jan 25, 2025
267ec63
Update to scope merge
wingertge Jan 26, 2025
71584e0
Fix bitwise shift
wingertge Jan 26, 2025
5838364
Initial JIT implementation
wingertge Jan 28, 2025
d292f85
Merge branch 'main' into feat/burn-vision
wingertge Jan 28, 2025
9e65150
Move testgen to burn-jit
wingertge Jan 28, 2025
0484f51
Improve HA4/8 algo
wingertge Jan 28, 2025
f62a9ee
Terminate units past the predefined 32 plane size
wingertge Jan 28, 2025
8edac2b
move jit backend back into `burn-vision` and make tests work
wingertge Jan 30, 2025
05b40e3
Add initial CPU implementation without stats
wingertge Jan 30, 2025
7708993
Implement stats
wingertge Jan 31, 2025
aeea3a8
Implement all backends except fusion
wingertge Jan 31, 2025
a994ca7
Fix autodiff to use GPU when available
wingertge Jan 31, 2025
866307b
Fixes and cleanup
wingertge Jan 31, 2025
a8e3994
Add docs
wingertge Jan 31, 2025
021360b
Update cubecl
wingertge Jan 31, 2025
a1d727f
Merge branch 'main' into feat/burn-vision
wingertge Jan 31, 2025
01ff01b
Compact labels for JIT
wingertge Feb 1, 2025
d790113
Improve JIT backend implementation by adding label compaction
wingertge Feb 2, 2025
15c431c
Use GPU reduction for max label
wingertge Feb 2, 2025
e3ec085
Manually fuse presence and prefix sum
wingertge Feb 2, 2025
11c8f1f
Make prefix sum more generic over line size
wingertge Feb 2, 2025
ee5ad73
Merge branch 'main' into feat/burn-vision
wingertge Feb 3, 2025
e6126c8
Add vision tests to xtask
wingertge Feb 3, 2025
1bbf50a
Fix CPU and other review stuff
wingertge Feb 3, 2025
70b8b7e
Merge branch 'main' into feat/burn-vision
wingertge Feb 5, 2025
db92cd8
Merge branch 'main' into feat/burn-vision
wingertge Feb 6, 2025
4f174a8
Add publish job
laggui Feb 7, 2025
5a1ada3
Review fixes
wingertge Feb 8, 2025
5a2931f
Merge branch 'feat/burn-vision' of https://github.com/wingertge/burn …
wingertge Feb 8, 2025
19 changes: 19 additions & 0 deletions .github/workflows/publish.yml
@@ -6,6 +6,25 @@ on:
- "v*"

jobs:
publish-burn-vision:
uses: tracel-ai/github-actions/.github/workflows/publish-crate.yml@v1
with:
crate: burn-vision
needs:
- publish-burn-autodiff
- publish-burn-candle
- publish-burn-fusion
- publish-burn-jit
- publish-burn-ndarray
- publish-burn-tch
- publish-burn-tensor
- publish-burn-tensor-testgen
# dev dependencies
- publish-burn-wgpu
- publish-burn-cuda
secrets:
CRATES_IO_API_TOKEN: ${{ secrets.CRATES_IO_API_TOKEN }}

publish-burn-router:
uses: tracel-ai/github-actions/.github/workflows/publish-crate.yml@v1
with:
20 changes: 20 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

3 changes: 3 additions & 0 deletions crates/burn-candle/src/element.rs
@@ -4,8 +4,11 @@ use burn_tensor::Element;
use candle_core::{FloatDType, Tensor, WithDType};
use half::{bf16, f16};

/// Candle element
pub trait CandleElement: Element + WithDType {}
/// Candle float element
pub trait FloatCandleElement: CandleElement + FloatDType {}
/// Candle int element
pub trait IntCandleElement: CandleElement {}

impl CandleElement for f64 {}
1 change: 1 addition & 0 deletions crates/burn-candle/src/lib.rs
@@ -13,6 +13,7 @@ mod ops;
mod tensor;

pub use backend::*;
pub use element::*;
pub use tensor::*;

#[cfg(test)]
2 changes: 1 addition & 1 deletion crates/burn-cuda/Cargo.toml
@@ -12,8 +12,8 @@ repository = "https://github.com/tracel-ai/burn/tree/main/crates/burn-cuda"
version.workspace = true

[features]
default = ["fusion", "autotune", "burn-jit/default", "cubecl/default"]
autotune = ["burn-jit/autotune"]
default = ["fusion", "autotune", "burn-jit/default", "cubecl/default"]
doc = ["burn-jit/doc"]
fusion = ["burn-fusion", "burn-jit/fusion"]
std = ["burn-jit/std", "cubecl/std"]
2 changes: 1 addition & 1 deletion crates/burn-jit/src/kernel/index/mod.rs
@@ -11,7 +11,7 @@ pub(crate) use flip::*;
pub(crate) use repeat_dim::*;
pub(crate) use select::*;
pub(crate) use select_assign::*;
pub(crate) use slice::*;
pub use slice::*;
pub(crate) use slice_assign::*;

pub(crate) use gather::*;
3 changes: 2 additions & 1 deletion crates/burn-jit/src/kernel/index/slice.rs
@@ -3,7 +3,8 @@ use burn_tensor::Shape;
use cubecl::{calculate_cube_count_elemwise, prelude::*};
use std::ops::Range;

pub(crate) fn slice<R: JitRuntime, E: JitElement>(
/// Slice a jit tensor with a set of ranges
pub fn slice<R: JitRuntime, E: JitElement>(
tensor: JitTensor<R>,
indices: &[Range<usize>],
) -> JitTensor<R> {
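With `slice` now exported as `pub` (see the `index/mod.rs` hunk above and the `kernel/mod.rs` hunk below), kernels outside `burn-jit`, such as the new `burn-vision` crate, can call it directly. A minimal usage sketch, assuming the re-export path `burn_jit::kernel::slice` and the public `burn_jit::tensor::JitTensor` type; the `crop_roi` helper itself is hypothetical, not part of this PR:

```rust
// Hypothetical downstream usage; paths assume the re-exports added in this PR.
use std::ops::Range;

use burn_jit::{kernel::slice, tensor::JitTensor, JitElement, JitRuntime};

/// Crop a region of interest out of a 2D tensor laid out as [rows, cols].
fn crop_roi<R: JitRuntime, E: JitElement>(
    image: JitTensor<R>,
    rows: Range<usize>,
    cols: Range<usize>,
) -> JitTensor<R> {
    // `slice` copies the selected ranges into a new tensor; the ranges must
    // lie within the tensor's shape.
    slice::<R, E>(image, &[rows, cols])
}
```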
2 changes: 1 addition & 1 deletion crates/burn-jit/src/kernel/mod.rs
@@ -39,4 +39,4 @@ pub mod reduce;

pub(crate) use clamp::*;
pub(crate) use comparison::*;
pub(crate) use index::*;
pub use index::*;
3 changes: 2 additions & 1 deletion crates/burn-jit/src/lib.rs
@@ -7,7 +7,8 @@
extern crate derive_new;
extern crate alloc;

mod ops;
/// Utilities for implementing JIT kernels
pub mod ops;

/// Kernel module
pub mod kernel;
4 changes: 3 additions & 1 deletion crates/burn-jit/src/ops/base.rs
@@ -76,6 +76,7 @@ pub(crate) fn swap_dims<R: JitRuntime>(
tensor
}

/// Permute a tensor's dimensions
pub fn permute<R: JitRuntime>(mut tensor: JitTensor<R>, axes: &[usize]) -> JitTensor<R> {
// remap strides
tensor.strides = axes.iter().map(|i| tensor.strides[*i]).collect();
@@ -135,7 +136,8 @@ pub(crate) fn expand<R: JitRuntime>(tensor: JitTensor<R>, target_shape: Shape) -
}
}

pub(crate) fn reshape<R: JitRuntime>(tensor: JitTensor<R>, shape: Shape) -> JitTensor<R> {
/// Reshape a jit tensor to a new shape
pub fn reshape<R: JitRuntime>(tensor: JitTensor<R>, shape: Shape) -> JitTensor<R> {
// TODO: Not force standard layout all the time (improve performance).
let tensor = kernel::into_contiguous(tensor);

5 changes: 3 additions & 2 deletions crates/burn-jit/src/ops/mod.rs
@@ -7,6 +7,7 @@ mod qtensor;
mod transaction;

pub(crate) mod base;
pub(crate) use base::*;
pub use base::*;

pub(crate) mod numeric;
/// Numeric utility functions for jit backends
pub mod numeric;
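With `ops` now a public module and `base::*` re-exported (the two hunks above), helpers such as `permute` and `reshape` become callable from other crates. A short sketch under those assumptions; the `to_channels_first` helper is hypothetical, not part of this PR:

```rust
// Hypothetical downstream usage; paths assume the re-exports added in this PR.
use burn_jit::{ops::{permute, reshape}, tensor::JitTensor, JitRuntime};
use burn_tensor::Shape;

/// Reorder an NHWC batch to NCHW, then force the contiguous target shape.
fn to_channels_first<R: JitRuntime>(images: JitTensor<R>, nchw_shape: Shape) -> JitTensor<R> {
    // `permute` only remaps strides, so it is cheap; `reshape` currently copies
    // into a contiguous buffer, as noted by the TODO in the `reshape` hunk above.
    let channels_first = permute::<R>(images, &[0, 3, 1, 2]);
    reshape::<R>(channels_first, nchw_shape)
}
```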
24 changes: 24 additions & 0 deletions crates/burn-jit/src/ops/numeric.rs
@@ -9,6 +9,7 @@ use cubecl::client::ComputeClient;
use cubecl::tensor_vectorization_factor;
use cubecl::{calculate_cube_count_elemwise, prelude::*};

/// Create a tensor filled with `value`
pub fn full<R: JitRuntime, E: JitElement>(
shape: Shape,
device: &R::Device,
@@ -19,6 +20,7 @@ pub fn full<R: JitRuntime, E: JitElement>(
full_device::<R, E>(client, shape, device.clone(), value)
}

/// Create a tensor filled with `value`
pub fn full_device<R: JitRuntime, E: JitElement>(
client: ComputeClient<R::Server, R::Channel>,
shape: Shape,
@@ -56,12 +58,14 @@ pub fn full_device<R: JitRuntime, E: JitElement>(
empty
}

/// Create a tensor filled with zeros
pub fn zeros<R: JitRuntime, E: JitElement>(shape: Shape, device: &R::Device) -> JitTensor<R> {
let client = R::client(device);

zeros_device::<R, E>(client, device.clone(), shape)
}

/// Create a tensor filled with zeros
pub fn zeros_device<R: JitRuntime, E: JitElement>(
client: ComputeClient<R::Server, R::Channel>,
device: R::Device,
@@ -70,12 +74,14 @@ pub fn zeros_device<R: JitRuntime, E: JitElement>(
full_device::<R, E>(client, shape, device, 0.elem())
}

/// Create a tensor filled with ones
pub fn ones<R: JitRuntime, E: JitElement>(shape: Shape, device: &R::Device) -> JitTensor<R> {
let client = R::client(device);

ones_device::<R, E>(client, device.clone(), shape)
}

/// Create a tensor filled with ones
pub fn ones_device<R: JitRuntime, E: JitElement>(
client: ComputeClient<R::Server, R::Channel>,
device: R::Device,
@@ -84,6 +90,7 @@ pub fn ones_device<R: JitRuntime, E: JitElement>(
full_device::<R, E>(client, shape, device, 1.elem())
}

/// Create a tensor with uninitialized memory
pub fn empty_device<R: JitRuntime, E: JitElement>(
client: ComputeClient<R::Server, R::Channel>,
device: R::Device,
@@ -94,82 +101,99 @@ pub fn empty_device<R: JitRuntime, E: JitElement>(
JitTensor::new_contiguous(client, device, shape, buffer, E::dtype())
}

/// Add two tensors
pub fn add<R: JitRuntime, E: JitElement>(lhs: JitTensor<R>, rhs: JitTensor<R>) -> JitTensor<R> {
launch_binop::<R, E, AddOp>(lhs, rhs)
}

/// Add a tensor and a scalar
pub fn add_scalar<R: JitRuntime, E: JitElement>(lhs: JitTensor<R>, rhs: E) -> JitTensor<R> {
launch_scalar_binop::<R, E, AddOp>(lhs, rhs)
}

/// Subtract two tensors
pub fn sub<R: JitRuntime, E: JitElement>(lhs: JitTensor<R>, rhs: JitTensor<R>) -> JitTensor<R> {
launch_binop::<R, E, SubOp>(lhs, rhs)
}

/// Subtract a tensor and a scalar
pub fn sub_scalar<R: JitRuntime, E: JitElement>(lhs: JitTensor<R>, rhs: E) -> JitTensor<R> {
launch_scalar_binop::<R, E, SubOp>(lhs, rhs)
}

/// Multiply two tensors
pub fn mul<R: JitRuntime, E: JitElement>(lhs: JitTensor<R>, rhs: JitTensor<R>) -> JitTensor<R> {
launch_binop::<R, E, MulOp>(lhs, rhs)
}

/// Multiply a tensor and a scalar
pub fn mul_scalar<R: JitRuntime, E: JitElement>(lhs: JitTensor<R>, rhs: E) -> JitTensor<R> {
launch_scalar_binop::<R, E, MulOp>(lhs, rhs)
}

/// Divide two tensors
pub fn div<R: JitRuntime, E: JitElement>(lhs: JitTensor<R>, rhs: JitTensor<R>) -> JitTensor<R> {
launch_binop::<R, E, DivOp>(lhs, rhs)
}

/// Divide a tensor by a scalar
pub fn div_scalar<R: JitRuntime, E: JitElement>(lhs: JitTensor<R>, rhs: E) -> JitTensor<R> {
launch_scalar_binop::<R, E, DivOp>(lhs, rhs)
}

/// Calculate remainder of two tensors
pub fn remainder<R: JitRuntime, E: JitElement>(
lhs: JitTensor<R>,
rhs: JitTensor<R>,
) -> JitTensor<R> {
launch_binop::<R, E, RemainderOp>(lhs, rhs)
}

/// Calculate the remainder of a tensor with a scalar
pub fn remainder_scalar<R: JitRuntime, E: JitElement>(lhs: JitTensor<R>, rhs: E) -> JitTensor<R> {
launch_scalar_binop::<R, E, RemainderOp>(lhs, rhs)
}

/// Calculate the power of two tensors
pub fn pow<R: JitRuntime, E: FloatElement>(lhs: JitTensor<R>, rhs: JitTensor<R>) -> JitTensor<R> {
launch_binop::<R, E, PowOp<E>>(lhs, rhs)
}

/// Bitwise and two tensors
pub fn bitwise_and<R: JitRuntime, E: IntElement>(
lhs: JitTensor<R>,
rhs: JitTensor<R>,
) -> JitTensor<R> {
launch_binop_int::<R, E, BitwiseAndOp>(lhs, rhs)
}

/// Bitwise and with a scalar
pub fn bitwise_and_scalar<R: JitRuntime, E: IntElement>(lhs: JitTensor<R>, rhs: E) -> JitTensor<R> {
launch_scalar_binop_int::<R, E, BitwiseAndOp>(lhs, rhs)
}

/// Bitwise or two tensors
pub fn bitwise_or<R: JitRuntime, E: IntElement>(
lhs: JitTensor<R>,
rhs: JitTensor<R>,
) -> JitTensor<R> {
launch_binop_int::<R, E, BitwiseOrOp>(lhs, rhs)
}

/// Bitwise or with a scalar
pub fn bitwise_or_scalar<R: JitRuntime, E: IntElement>(lhs: JitTensor<R>, rhs: E) -> JitTensor<R> {
launch_scalar_binop_int::<R, E, BitwiseOrOp>(lhs, rhs)
}

/// Bitwise xor two tensors
pub fn bitwise_xor<R: JitRuntime, E: IntElement>(
lhs: JitTensor<R>,
rhs: JitTensor<R>,
) -> JitTensor<R> {
launch_binop_int::<R, E, BitwiseXorOp>(lhs, rhs)
}

/// Bitwise xor with a scalar
pub fn bitwise_xor_scalar<R: JitRuntime, E: IntElement>(lhs: JitTensor<R>, rhs: E) -> JitTensor<R> {
launch_scalar_binop_int::<R, E, BitwiseXorOp>(lhs, rhs)
}
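The element-wise helpers above were previously crate-private; with `pub mod numeric` they can serve as building blocks for kernels in other crates. A minimal sketch assuming the `burn_jit::ops::numeric` path introduced in this PR; `scale_and_shift` is a hypothetical helper:

```rust
// Hypothetical downstream usage; paths assume the re-exports added in this PR.
use burn_jit::{ops::numeric::{add_scalar, mul_scalar}, tensor::JitTensor, JitElement, JitRuntime};

/// Compute `x * scale + shift` element-wise with the now-public numeric kernels.
fn scale_and_shift<R: JitRuntime, E: JitElement>(
    x: JitTensor<R>,
    scale: E,
    shift: E,
) -> JitTensor<R> {
    // Each call launches one element-wise kernel and returns a new tensor.
    let scaled = mul_scalar::<R, E>(x, scale);
    add_scalar::<R, E>(scaled, shift)
}
```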
3 changes: 2 additions & 1 deletion crates/burn-jit/src/tensor/base.rs
@@ -23,7 +23,8 @@ pub struct JitTensor<R: JitRuntime> {
pub device: R::Device,
/// The strides of the tensor.
pub strides: Vec<usize>,
pub(crate) dtype: DType,
/// The datatype of the tensor.
pub dtype: DType,
}

impl<R: JitRuntime, E: JitElement> From<JitTensor<R>> for TensorHandle<R, E> {
9 changes: 9 additions & 0 deletions crates/burn-ndarray/src/element.rs
@@ -16,6 +16,7 @@
{
}

/// An int element for ndarray backend.
pub trait IntNdArrayElement: NdArrayElement + Signed {}

/// A general element for ndarray backend.
@@ -34,13 +35,21 @@ pub trait NdArrayElement:

/// A element for ndarray backend that supports exp ops.
pub trait ExpElement {
/// Exponent
fn exp_elem(self) -> Self;
/// Log
fn log_elem(self) -> Self;
/// Log1p
fn log1p_elem(self) -> Self;
/// Powf
fn powf_elem(self, value: f32) -> Self;
/// Powi
fn powi_elem(self, value: i32) -> Self;
/// Sqrt
fn sqrt_elem(self) -> Self;
/// Abs
fn abs_elem(self) -> Self;
/// Abs for int
fn int_abs_elem(self) -> Self;
}

2 changes: 1 addition & 1 deletion crates/burn-ndarray/src/lib.rs
@@ -21,7 +21,7 @@ mod sharing;
mod tensor;

pub use backend::*;
pub use element::FloatNdArrayElement;
pub use element::*;
pub(crate) use sharing::*;
pub use tensor::*;

10 changes: 5 additions & 5 deletions crates/burn-ndarray/src/ops/conv.rs
@@ -11,7 +11,7 @@ use ndarray::{
};

use crate::{
element::{FloatNdArrayElement, IntNdArrayElement, QuantElement},
element::FloatNdArrayElement,
ops::padding::{apply_padding_4d, apply_padding_5d},
sharing::UnsafeSharedRef,
tensor::NdArrayTensor,
@@ -98,7 +98,7 @@ fn conv3d_mad_inner<E: FloatNdArrayElement>(
}
}

pub(crate) fn conv2d<E: FloatNdArrayElement, I: IntNdArrayElement, Q: QuantElement>(
pub(crate) fn conv2d<E: FloatNdArrayElement>(
x: NdArrayTensor<E>,
weight: NdArrayTensor<E>,
bias: Option<NdArrayTensor<E>>,
@@ -126,7 +126,7 @@ pub(crate) fn conv2d<E: FloatNdArrayElement, I: IntNdArrayElement, Q: QuantEleme
in_width,
);

let x = apply_padding_4d::<E, I, Q>(x, options.padding, 0i32.elem()).array;
let x = apply_padding_4d::<E>(x, options.padding, 0i32.elem()).array;

// Convert inputs from dynamic indexes to static to improve perf.
let x = x.into_dimensionality::<ndarray::Ix4>().unwrap();
@@ -310,7 +310,7 @@ pub(crate) fn conv_transpose2d<E: FloatNdArrayElement>(
NdArrayTensor::new(output.into_dyn().into_shared())
}

pub(crate) fn conv3d<E: FloatNdArrayElement, I: IntNdArrayElement, Q: QuantElement>(
pub(crate) fn conv3d<E: FloatNdArrayElement>(
x: NdArrayTensor<E>,
weight: NdArrayTensor<E>,
bias: Option<NdArrayTensor<E>>,
@@ -345,7 +345,7 @@ pub(crate) fn conv3d<E: FloatNdArrayElement, I: IntNdArrayElement, Q: QuantEleme
in_width,
);

let x = apply_padding_5d::<E, I, Q>(x, options.padding, 0i32.elem()).array;
let x = apply_padding_5d::<E>(x, options.padding, 0i32.elem()).array;

// Convert inputs from dynamic indexes to static to improve perf.
let x = x.into_dimensionality::<ndarray::Ix5>().unwrap();