Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix TensorStorage memory deallocation #145

Merged
merged 8 commits into from
Sep 28, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions crates/kornia-core/src/serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ use crate::{
use serde::ser::SerializeStruct;
use serde::Deserialize;

impl<T, const N: usize, A: TensorAllocator> serde::Serialize for Tensor<T, N, A>
impl<T, const N: usize, A> serde::Serialize for Tensor<T, N, A>
where
T: serde::Serialize + SafeTensorType,
A: TensorAllocator + 'static,
{
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
Expand All @@ -23,7 +24,7 @@ where
}
}

impl<'de, T, const N: usize, A: TensorAllocator + Default> serde::Deserialize<'de>
impl<'de, T, const N: usize, A: TensorAllocator + Default + 'static> serde::Deserialize<'de>
for Tensor<T, N, A>
where
T: serde::Deserialize<'de> + SafeTensorType,
Expand Down
116 changes: 110 additions & 6 deletions crates/kornia-core/src/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,30 @@ impl SafeTensorType for i64 {}
impl SafeTensorType for f32 {}
impl SafeTensorType for f64 {}

/// Represents the owner of custom Arrow Buffer memory allocations.
///
/// This struct is used to facilitate the automatic deallocation of the memory it owns,
/// using the `Drop` trait.
pub struct TensorCustomAllocationOwner<A: TensorAllocator> {
/// The allocator used to allocate the tensor storage.
alloc: A,
/// The layout used for the allocation.
layout: Layout,
/// The pointer to the allocated memory
ptr: *const u8,
emilmgeorge marked this conversation as resolved.
Show resolved Hide resolved
}

// SAFETY: TensorCustomAllocationOwner is never modified from multiple threads.
// NOTE(review): these impls assert Send/Sync/RefUnwindSafe unconditionally, regardless of
// whether `A: Send + Sync` holds — confirm every TensorAllocator implementation is actually
// safe to share/move across threads before relying on this.
impl<A: TensorAllocator> std::panic::RefUnwindSafe for TensorCustomAllocationOwner<A> {}
unsafe impl<A: TensorAllocator> Sync for TensorCustomAllocationOwner<A> {}
unsafe impl<A: TensorAllocator> Send for TensorCustomAllocationOwner<A> {}

/// Deallocates the owned memory through the original allocator when the owner is dropped.
///
/// The owner is stored in an `Arc` handed to `Buffer::from_custom_allocation`, so this runs
/// once the last buffer referencing the allocation goes away, releasing the memory with the
/// same allocator and layout that produced it.
impl<A: TensorAllocator> Drop for TensorCustomAllocationOwner<A> {
    fn drop(&mut self) {
        // `ptr` was allocated as `*mut u8` by `alloc` with exactly this `layout`;
        // the cast only restores its original mutability.
        self.alloc.dealloc(self.ptr as *mut u8, self.layout);
    }
}

/// Represents a contiguous memory region that can be shared with other buffers and across thread boundaries.
///
/// This struct provides methods to create, access, and manage tensor storage using a custom allocator.
Expand All @@ -35,9 +59,10 @@ where
alloc: A,
}

impl<T, A: TensorAllocator> TensorStorage<T, A>
impl<T, A> TensorStorage<T, A>
where
T: SafeTensorType + Clone,
A: TensorAllocator + 'static,
{
/// Creates a new tensor storage with the given length and allocator.
///
Expand All @@ -51,16 +76,21 @@ where
/// A new tensor storage if successful, otherwise an error.
pub fn new(len: usize, alloc: A) -> Result<Self, TensorAllocatorError> {
// allocate memory for tensor storage
let ptr =
alloc.alloc(Layout::array::<T>(len).map_err(TensorAllocatorError::LayoutError)?)?;
let layout = Layout::array::<T>(len).map_err(TensorAllocatorError::LayoutError)?;
let ptr = alloc.alloc(layout)?;
let owner = TensorCustomAllocationOwner {
alloc: alloc.clone(),
emilmgeorge marked this conversation as resolved.
Show resolved Hide resolved
layout,
ptr,
};

// create the buffer
let buffer = unsafe {
// SAFETY: `ptr` is non-null and properly aligned, and `len` is the correct size.
Buffer::from_custom_allocation(
NonNull::new_unchecked(ptr),
len * std::mem::size_of::<T>(),
Arc::new(Vec::<T>::with_capacity(len)),
Arc::new(owner),
)
};

Expand Down Expand Up @@ -223,7 +253,7 @@ where
let buffer = Buffer::from_custom_allocation(
NonNull::new_unchecked(ptr as *mut u8),
len * std::mem::size_of::<T>(),
Arc::new(Vec::<T>::with_capacity(len)),
Arc::new(()),
);

// create tensor storage
Expand All @@ -238,7 +268,7 @@ where
impl<T, A> Clone for TensorStorage<T, A>
where
T: SafeTensorType + Clone,
A: TensorAllocator + Clone,
A: TensorAllocator + Clone + 'static,
{
fn clone(&self) -> Self {
let mut new_storage = Self::new(self.len(), self.alloc.clone())
Expand All @@ -253,6 +283,8 @@ mod tests {
use super::*;
use crate::allocator::CpuAllocator;
use std::alloc::Layout;
use std::cell::RefCell;
use std::rc::Rc;

#[test]
fn test_tensor_storage() -> Result<(), TensorAllocatorError> {
Expand Down Expand Up @@ -365,4 +397,76 @@ mod tests {
assert_eq!(result_vec.capacity(), original_vec_capacity);
assert!(std::ptr::eq(result_vec.as_ptr(), original_vec_ptr));
}

#[test]
fn test_tensor_storage_allocator() {
// A test TensorAllocator that keeps a count of the bytes that are allocated but not yet
// deallocated via the allocator.
#[derive(Clone)]
struct TestAllocator {
bytes_allocated: Rc<RefCell<i32>>,
}
impl TensorAllocator for TestAllocator {
fn alloc(&self, layout: Layout) -> Result<*mut u8, TensorAllocatorError> {
*self.bytes_allocated.borrow_mut() += layout.size() as i32;
CpuAllocator.alloc(layout)
}
fn dealloc(&self, ptr: *mut u8, layout: Layout) {
*self.bytes_allocated.borrow_mut() -= layout.size() as i32;
CpuAllocator.dealloc(ptr, layout)
}
}

let allocator = TestAllocator {
bytes_allocated: Rc::new(RefCell::new(0)),
};
let len = 1024;

// TensorStorage::new()
// Deallocation should happen when `storage` goes out of scope.
{
let _storage = TensorStorage::<u8, _>::new(len, allocator.clone()).unwrap();
emilmgeorge marked this conversation as resolved.
Show resolved Hide resolved
assert_eq!(*allocator.bytes_allocated.borrow(), len as i32);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No pointer test here ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case the storage is not created from another buffer so there is no original pointer to compare to. Are there any other checks I should have added?

}
assert_eq!(*allocator.bytes_allocated.borrow(), 0);

// TensorStorage::new() -> TensorStorage::into_vec()
// TensorStorage::into_vec() consumes the storage and creates a copy (in this case).
edgarriba marked this conversation as resolved.
Show resolved Hide resolved
// This should cause deallocation of the original memory.
{
let storage = TensorStorage::<u8, _>::new(len, allocator.clone()).unwrap();
assert_eq!(*allocator.bytes_allocated.borrow(), len as i32);

let _vec = storage.into_vec();
assert_eq!(*allocator.bytes_allocated.borrow(), 0);
}
assert_eq!(*allocator.bytes_allocated.borrow(), 0);

// TensorStorage::from_vec() -> TensorStorage::into_vec()
// TensorStorage::from_vec() currently does not use the custom allocator, so the
// bytes_allocated value should not change.
{
let vec = Vec::<u8>::with_capacity(len);
let storage = TensorStorage::<u8, _>::from_vec(vec, allocator.clone());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think we need to resolve well here when we create a storage from a Vec how's the allocator involved

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

worth to check this apache/arrow-rs#6362

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that would be good. This is my related understanding:

  • Std Vector and Buffer::from_vec do not support custom allocators until allocator_api comes to stable rust. So, for our future custom allocators like CudaAllocator, from_vec will have to involve copying and use Buffer::from_custom_allocation.
  • For CpuAllocator, we can have zero-copy using Buffer::from_vec (as it is currently).
    But to be fully safe, I think we should also change CpuAllocator to use std::alloc::{alloc,dealloc} (Global allocator) instead of std::alloc::System.{alloc,dealloc}. This is because vector uses the Global allocator. This is usually the same as std::alloc::System but the user can change it using the global_allocator attribute. By changing CpuAllocator to use std::alloc::{alloc,dealloc}, it always matches the allocator used by the vector (even when user changes it).

I'm not sure how to switch the implementation of TensorStorage::from_vec to one of the above based on whether A is CpuAllocator or CudaAllocator though. (maybe different functions? Ideas welcome!)

I haven't done any of the above in this PR though. Please let me know your thoughts and I can change accordingly.

Copy link
Member

@edgarriba edgarriba Sep 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But to be fully safe, I think we should also change CpuAllocator to use std::alloc::{alloc,dealloc} (Global allocator) instead of std::alloc::System.{alloc,dealloc}

please, do 👍

I'm not sure how to switch the implementation of TensorStorage::from_vec to one of the above based on whether A is CpuAllocator or CudaAllocator though. (maybe different functions? Ideas welcome!)

maybe the behaviour for cuda should be that when a cuda storage is created via vec, the data is consumed, cuda allocated and copied to device, and deallocate the original cpu vector ? Haven't faced yet the full use case. Probably we should use the kornia::dnn module to try this workflows and prototype from there. Found a similar c++ implementation maybe to have a reference: https://gist.github.com/CommitThis/1666517de32893e5dc4c441269f1029a

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

one more request in this direction, is the ability to easily create Image views. As Image is tuple struct out of Tensor: https://github.com/kornia/kornia-rs/blob/main/crates/kornia-image/src/image.rs#L59

In some workflows i have different types of images out of Tensor which i need to convert to Image::new (`as_slice().to_vec() everytime which involves copies) in order to use any kornia function. Unless we decide e.g to adapt the whole api to accept Tensor, and Image in the end it's just a trait in order to give some semantics and define specific types of images with formats e.g Rgb8U, Mono8U.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't quite understand the image view part. But for converting Tensor3 to Image without copy, we could implement the TryFrom trait or a function from_tensor that uses the passed tensor if channel dimension matches.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Image in the end is a Tensor3 — would be a bit overkill to have a from_tensor method. I think ideally for this case we might want to have a method that somehow transfers the ownership of the storage, shape and strides ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to make myself clear, this is what I had in mind:

impl<T, const C: usize> TryFrom<Tensor<T, 3, CpuAllocator>> for Image<T, C>
where
    T: SafeTensorType,
{
    type Error = ImageError;

    fn try_from(value: Tensor<T, 3, CpuAllocator>) -> Result<Self, Self::Error> {
        if value.shape[2] == C {
            Ok(Self(value))
        } else {
            Err(ImageError::InvalidTensorShape)
        }
    }
}

Used like:

let image: Image<_, 3> = tensor.try_into().unwrap(); 
// OR
let image = Image::<_, 3>::try_from(tensor).unwrap();

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I see, sounds good! I’ll try myself in a separated PR. I have also some potential improvements for the image struct, like adding a third ImageColorSpace in order to define more specific types like
type ImageRgb8 = Image<u8, 3, ColorSpace::Rgb> which color will have associated values too

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding this PR, I just want to note that it only fixes the deallocation issues noted in the first comment. It does not currently include the changes related to into_vec for non-CpuAllocators as discussed above in this review thread. I had started it, but it is not ready yet. I can send a separate PR when it's ready (hope that's ok).

assert_eq!(*allocator.bytes_allocated.borrow(), 0);

let _vec = storage.into_vec();
assert_eq!(*allocator.bytes_allocated.borrow(), 0);
}
assert_eq!(*allocator.bytes_allocated.borrow(), 0);

// TensorStorage::from_ptr()
// TensorStorage::from_ptr() does not take ownership of buffer. So the memory should not be
// deallocated when the TensorStorage goes out of scope.
// In this case, the memory will be deallocated when the vector goes out of scope.
{
let mut vec = Vec::<u8>::with_capacity(len);
{
let _storage =
unsafe { TensorStorage::<u8, _>::from_ptr(vec.as_mut_ptr(), len, &allocator) };
emilmgeorge marked this conversation as resolved.
Show resolved Hide resolved
assert_eq!(*allocator.bytes_allocated.borrow(), 0);
}
assert_eq!(*allocator.bytes_allocated.borrow(), 0);
}
}
}
4 changes: 2 additions & 2 deletions crates/kornia-core/src/tensor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ where
impl<T, const N: usize, A> Tensor<T, N, A>
where
T: SafeTensorType,
A: TensorAllocator,
A: TensorAllocator + 'static,
{
/// Create a new `Tensor` with uninitialized data.
///
Expand Down Expand Up @@ -875,7 +875,7 @@ where
impl<T, const N: usize, A> Clone for Tensor<T, N, A>
where
T: SafeTensorType + Clone,
A: TensorAllocator + Clone,
A: TensorAllocator + Clone + 'static,
{
fn clone(&self) -> Self {
Self {
Expand Down
2 changes: 1 addition & 1 deletion crates/kornia-core/src/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ pub struct TensorView<'a, T: SafeTensorType, const N: usize, A: TensorAllocator>
pub strides: [usize; N],
}

impl<'a, T: SafeTensorType, const N: usize, A: TensorAllocator> TensorView<'a, T, N, A> {
impl<'a, T: SafeTensorType, const N: usize, A: TensorAllocator + 'static> TensorView<'a, T, N, A> {
/// Returns the data slice of the tensor.
#[inline]
pub fn as_slice(&self) -> &[T] {
Expand Down
Loading