From df69dfd92b805bf3a4cc1c8af493f6e859dc3d48 Mon Sep 17 00:00:00 2001
From: Rushi-cad
Date: Tue, 26 Nov 2024 19:02:00 -0800
Subject: [PATCH] Integrate temporary memory allocation in place of malloc

---
 backends/cadence/aot/functions_hifi.yaml      | 13 ++++++---
 backends/cadence/hifi/kernels/kernels.cpp     |  5 ++++
 backends/cadence/hifi/kernels/kernels.h       |  8 +++++-
 .../cadence/hifi/operators/CMakeLists.txt     |  1 +
 .../cadence/hifi/operators/op_maximum.cpp     |  2 ++
 backends/cadence/hifi/operators/op_mean.cpp   |  4 ++-
 backends/cadence/hifi/operators/op_pow.cpp    | 14 +++++++---
 backends/cadence/hifi/operators/op_where.cpp  |  6 ++--
 .../executor_runner/executor_runner.cpp       | 28 ++++++++++++++++++-
 9 files changed, 68 insertions(+), 13 deletions(-)

diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index 0f3e582884..ac981d321a 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -61,11 +61,16 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::full_out
+
+- op: gt.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::gt_scalar_out
 
 - op: maximum.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::maximum_out
+      kernel_name: cadence::impl::HiFi::maximum_out
 
 - op: mean.out
   kernels:
@@ -90,17 +95,17 @@
 - op: pow.Scalar_out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::pow_Scalar_out
+      kernel_name: cadence::impl::HiFi::pow_Scalar_out
 
 - op: pow.Tensor_Scalar_out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::pow_Tensor_Scalar_out
+      kernel_name: cadence::impl::HiFi::pow_Tensor_Scalar_out
 
 - op: pow.Tensor_Tensor_out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::pow_Tensor_Tensor_out
+      kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out
 
 - op: rsqrt.out
   kernels:
diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp
index 10e5fb176e..0934afde29 100644
--- a/backends/cadence/hifi/kernels/kernels.cpp
+++ b/backends/cadence/hifi/kernels/kernels.cpp
@@ -20,6 +20,11 @@
 memcpy(void* dst, const void* src, size_t num_bytes) {
   MEMCPY_8b(dst, src, num_bytes);
 }
 
+void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) {
+  Result<void*> temp_mem_res = ctx.allocate_temp(size);
+  return temp_mem_res.ok() ? temp_mem_res.get() : nullptr;
+}
+
 // Quantize a fp32 value to an int8_t/uint8_t value
 template <typename T>
 __attribute__((always_inline)) T
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h
index 9a4689c17c..e72284b99d 100644
--- a/backends/cadence/hifi/kernels/kernels.h
+++ b/backends/cadence/hifi/kernels/kernels.h
@@ -14,8 +14,12 @@
 /* For NNLIB APIs */
 #include "xa_nnlib_kernels_api.h"
 
-/* Potential NNLIB function/APIs */
+#include <executorch/runtime/kernel/kernel_runtime_context.h>
+
+using executorch::runtime::KernelRuntimeContext;
+using executorch::runtime::Result;
 
+/* Potential NNLIB function/APIs */
 extern "C" WORD32 xa_nn_broadcast_32_32(
     WORD32* __restrict__ p_out,
     const int* const out_shape,
@@ -149,6 +153,8 @@
 namespace impl {
 namespace HiFi {
 namespace kernels {
+void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size);
+
 void memcpy(void* dst, const void* src, size_t num_bytes);
 
 WORD32 matmul_asym8uxasym8u_asym8u(
diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt
index c01dad5ce8..c66e55f0f4 100644
--- a/backends/cadence/hifi/operators/CMakeLists.txt
+++ b/backends/cadence/hifi/operators/CMakeLists.txt
@@ -38,6 +38,7 @@ set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gt.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp
index 97578765cf..f85d3470e9 100644
--- a/backends/cadence/hifi/operators/op_maximum.cpp
+++ b/backends/cadence/hifi/operators/op_maximum.cpp
@@ -23,6 +23,7 @@
 using torch::executor::apply_binary_elementwise_fn;
 using torch::executor::Error;
 using torch::executor::resize_to_broadcast_target_size;
 
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -170,3 +171,4 @@
 } // namespace native
 } // namespace HiFi
 } // namespace impl
+} // namespace cadence
diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp
index 478e10da71..cdc844ec5c 100644
--- a/backends/cadence/hifi/operators/op_mean.cpp
+++ b/backends/cadence/hifi/operators/op_mean.cpp
@@ -125,7 +125,9 @@
   int scratch_size = xa_nn_reduce_getsize_nhwc(
       -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1);
 
-  void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size);
+  void* __restrict__ p_scratch_in =
+      (void* __restrict__)kernels::allocate_temp_memory(
+          ctx, scratch_size * sizeof(int));
 
   xa_nn_reduce_mean_4D_f32_f32(
       p_out,
diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp
index 04533b290b..74c24afbc0 100644
--- a/backends/cadence/hifi/operators/op_pow.cpp
+++ b/backends/cadence/hifi/operators/op_pow.cpp
@@ -26,6 +26,7 @@
 using executorch::runtime::promoteTypes;
 using torch::executor::Error;
 using torch::executor::resize_to_broadcast_target_size;
 
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -119,9 +120,11 @@
   if (optimized) {
     if (broadcast) {
       WORD32* __restrict__ ptr1 =
-          (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
+          (WORD32* __restrict__)kernels::allocate_temp_memory(
+              ctx, num_elm * sizeof(int));
       WORD32* __restrict__ ptr2 =
-          (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
+          (WORD32* __restrict__)kernels::allocate_temp_memory(
+              ctx, num_elm * sizeof(int));
 
       WORD32* __restrict__ pin1 =
           (WORD32* __restrict__)a.const_data_ptr();
@@ -154,7 +157,8 @@
         free(ptr2);
       } else if (a_is_broadcasted && (!b_is_broadcasted)) {
         FLOAT32* __restrict__ ptr1 =
-            (FLOAT32* __restrict__)malloc((num_elm + 2) * sizeof(WORD32));
+            (FLOAT32* __restrict__)kernels::allocate_temp_memory(
+                ctx, num_elm * sizeof(int));
 
         FLOAT32* __restrict__ pin1 =
             (FLOAT32* __restrict__)a.const_data_ptr();
@@ -181,7 +185,8 @@
         free(ptr1);
       } else if (b_is_broadcasted && (!a_is_broadcasted)) {
         WORD32* __restrict__ ptr1 =
-            (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
+            (WORD32* __restrict__)kernels::allocate_temp_memory(
+                ctx, num_elm * sizeof(int));
 
         WORD32* __restrict__ pin1 =
             (WORD32* __restrict__)b.const_data_ptr();
@@ -349,3 +354,4 @@
 } // namespace native
 } // namespace HiFi
 } // namespace impl
+} // namespace cadence
diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp
index 06bd0bc3c9..c4ad8177cf 100644
--- a/backends/cadence/hifi/operators/op_where.cpp
+++ b/backends/cadence/hifi/operators/op_where.cpp
@@ -109,8 +109,10 @@
   if (con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] ||
       con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) {
-    void* p_scratch =
-        malloc(out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]);
+    void* p_scratch = (void*)kernels::allocate_temp_memory(
+        ctx,
+        (out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]) *
+            sizeof(int));
     const unsigned char* p_brd_cond = (const unsigned char*)p_scratch;
     xa_nn_broadcast_8_8(
         (WORD8* __restrict__)p_brd_cond,
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index 93c150c0b9..a2476f9165 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -30,8 +30,16 @@
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/runtime.h>
 
+#if __XTENSA__
+#include <sys/times.h>
+#include <xtensa/sim.h>
+#include 
+#endif
+
 static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB
 
+static uint8_t temp_allocator_pool[1024U * 1024U];
+
 DEFINE_string(
     model_path,
     "model.pte",
@@ -120,6 +128,10 @@ int main(int argc, char** argv) {
   MemoryAllocator method_allocator{
       MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};
 
+  // Temporary memory required by kernels
+  MemoryAllocator temp_allocator{
+      MemoryAllocator(sizeof(temp_allocator_pool), temp_allocator_pool)};
+
   // The memory-planned buffers will back the mutable tensors used by the
   // method. The sizes of these buffers were determined ahead of time during the
   // memory-planning passes.
@@ -144,7 +156,7 @@
 
   // Assemble all of the allocators into the MemoryManager that the Executor
   // will use.
-  MemoryManager memory_manager(&method_allocator, &planned_memory);
+  MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator);
 
   //
   // Load the method from the program, using the provided allocators. Running
@@ -170,8 +182,22 @@
       (uint32_t)inputs.error());
   ET_LOG(Info, "Inputs prepared.");
 
+#if __XTENSA__
+  struct tms start, stop;
+  xt_iss_client_command("all", "disable");
+  xt_iss_client_command("all", "enable");
+  times(&start);
+#endif
+
   // Run the model.
   Error status = method->execute();
+
+#if __XTENSA__
+  times(&stop);
+  xt_iss_client_command("all", "disable");
+  ET_LOG(Info, "Execute cycles = %ld", (stop.tms_utime - start.tms_utime));
+#endif
+
   ET_CHECK_MSG(
       status == Error::Ok,
       "Execution of method %s failed with status 0x%" PRIx32,
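
Usage note: below is a minimal sketch of how an operator is expected to call the
new helper. The function example_scratch_user and its parameters are
hypothetical, and the namespace qualification follows the kernels.h declaration
in this patch; only allocate_temp_memory(), KernelRuntimeContext, and
ctx.allocate_temp() come from the patch itself. One behavioral difference from
the old malloc() path: the returned scratch is owned by the method's temp
allocator (wired into MemoryManager in executor_runner.cpp above), so it must
not be passed to free(); the pool is reclaimed after Method::execute() returns.

// Hypothetical example (not part of the patch): obtaining kernel scratch
// memory from the temp allocator instead of malloc/free.
#include <executorch/backends/cadence/hifi/kernels/kernels.h>

void example_scratch_user(KernelRuntimeContext& ctx, size_t num_elm) {
  // Request num_elm 32-bit words of scratch from the method's temp allocator.
  WORD32* scratch = (WORD32*)impl::HiFi::kernels::allocate_temp_memory(
      ctx, num_elm * sizeof(WORD32));
  if (scratch == nullptr) {
    // allocate_temp() failed: the temp pool is exhausted or no temp allocator
    // was passed to MemoryManager. A real kernel should fall back to a
    // non-scratch path or report an error here.
    return;
  }
  // ... hand scratch to an NNLIB kernel as its scratch buffer ...
  // Note: no free(scratch); the temp allocator reclaims it after execute().
}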