Integrated temporary mem alloc functionality in place of malloc
Rushi-cad committed Nov 27, 2024
1 parent 3165566 commit df69dfd
Showing 9 changed files with 68 additions and 13 deletions.
13 changes: 9 additions & 4 deletions backends/cadence/aot/functions_hifi.yaml
@@ -61,11 +61,16 @@
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::full_out

- op: gt.Scalar_out
kernels:
- arg_meta: null
kernel_name: torch::executor::gt_scalar_out

- op: maximum.out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::maximum_out
kernel_name: cadence::impl::HiFi::maximum_out

- op: mean.out
kernels:
@@ -90,17 +95,17 @@
- op: pow.Scalar_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::pow_Scalar_out
kernel_name: cadence::impl::HiFi::pow_Scalar_out

- op: pow.Tensor_Scalar_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::pow_Tensor_Scalar_out
kernel_name: cadence::impl::HiFi::pow_Tensor_Scalar_out

- op: pow.Tensor_Tensor_out
kernels:
- arg_meta: null
kernel_name: impl::HiFi::pow_Tensor_Tensor_out
kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out

- op: rsqrt.out
kernels:
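Note: the kernel_name entries in this file gain the cadence:: prefix so that they match the new enclosing namespace cadence { ... } block that this commit adds around the HiFi operator implementations (see op_maximum.cpp and op_pow.cpp below).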
5 changes: 5 additions & 0 deletions backends/cadence/hifi/kernels/kernels.cpp
@@ -20,6 +20,11 @@ memcpy(void* dst, const void* src, size_t num_bytes) {
MEMCPY_8b(dst, src, num_bytes);
}

void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) {
Result<void*> temp_mem_res = ctx.allocate_temp(size);
return temp_mem_res.ok() ? temp_mem_res.get() : nullptr;
}

// Quantize a fp32 value to an int8_t/uint8_t value
template <typename T>
__attribute__((always_inline)) T
8 changes: 7 additions & 1 deletion backends/cadence/hifi/kernels/kernels.h
@@ -14,8 +14,12 @@
/* For NNLIB APIs */
#include "xa_nnlib_kernels_api.h"

/* Potential NNLIB function/APIs */
#include <executorch/runtime/kernel/kernel_includes.h>

using executorch::runtime::KernelRuntimeContext;
using executorch::runtime::Result;

/* Potential NNLIB function/APIs */
extern "C" WORD32 xa_nn_broadcast_32_32(
WORD32* __restrict__ p_out,
const int* const out_shape,
@@ -149,6 +153,8 @@ namespace impl {
namespace HiFi {
namespace kernels {

void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size);

void memcpy(void* dst, const void* src, size_t num_bytes);

WORD32 matmul_asym8uxasym8u_asym8u(
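For context, a minimal sketch (not part of this diff) of how an operator can use the new helper in place of malloc, following the pattern used by op_mean.cpp and op_pow.cpp below. The operator name, tensor arguments, and scratch sizing are illustrative only, and the body is assumed to sit inside the usual cadence::impl::HiFi::native namespaces so that kernels:: resolves as it does in the operator files:

// Hypothetical operator body showing the intended call pattern.
Tensor& example_op_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
  // Illustrative sizing; a real kernel would use the matching NNLIB getsize helper.
  size_t scratch_bytes = in.numel() * sizeof(float);

  // Ask the runtime for kernel-scoped scratch memory instead of the heap.
  void* p_scratch = kernels::allocate_temp_memory(ctx, scratch_bytes);
  if (p_scratch == nullptr) {
    // allocate_temp failed (e.g. no temp allocator was configured).
    return out;
  }

  // ... compute into p_scratch, then write the result into `out` ...

  // No free(): the memory is owned by the runtime's temp allocator.
  return out;
}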
1 change: 1 addition & 0 deletions backends/cadence/hifi/operators/CMakeLists.txt
@@ -38,6 +38,7 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gt.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
2 changes: 2 additions & 0 deletions backends/cadence/hifi/operators/op_maximum.cpp
@@ -23,6 +23,7 @@ using torch::executor::apply_binary_elementwise_fn;
using torch::executor::Error;
using torch::executor::resize_to_broadcast_target_size;

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {
@@ -170,3 +171,4 @@ Tensor& maximum_out(
} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
4 changes: 3 additions & 1 deletion backends/cadence/hifi/operators/op_mean.cpp
@@ -125,7 +125,9 @@ Tensor& mean_dim_out(
int scratch_size = xa_nn_reduce_getsize_nhwc(
-3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1);

void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size);
void* __restrict__ p_scratch_in =
(void* __restrict__)kernels::allocate_temp_memory(
ctx, scratch_size * sizeof(int));

xa_nn_reduce_mean_4D_f32_f32(
p_out,
14 changes: 10 additions & 4 deletions backends/cadence/hifi/operators/op_pow.cpp
@@ -26,6 +26,7 @@ using executorch::runtime::promoteTypes;
using torch::executor::Error;
using torch::executor::resize_to_broadcast_target_size;

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {
@@ -119,9 +120,11 @@ Tensor& pow_Tensor_Tensor_out(
if (optimized) {
if (broadcast) {
WORD32* __restrict__ ptr1 =
(WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
(WORD32* __restrict__)kernels::allocate_temp_memory(
ctx, num_elm * sizeof(int));
WORD32* __restrict__ ptr2 =
(WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
(WORD32* __restrict__)kernels::allocate_temp_memory(
ctx, num_elm * sizeof(int));

WORD32* __restrict__ pin1 =
(WORD32* __restrict__)a.const_data_ptr<float>();
@@ -154,7 +157,8 @@ Tensor& pow_Tensor_Tensor_out(
free(ptr2);
} else if (a_is_broadcasted && (!b_is_broadcasted)) {
FLOAT32* __restrict__ ptr1 =
(FLOAT32* __restrict__)malloc((num_elm + 2) * sizeof(WORD32));
(FLOAT32* __restrict__)kernels::allocate_temp_memory(
ctx, num_elm * sizeof(int));

FLOAT32* __restrict__ pin1 =
(FLOAT32* __restrict__)a.const_data_ptr<float>();
@@ -181,7 +185,8 @@
free(ptr1);
} else if (b_is_broadcasted && (!a_is_broadcasted)) {
WORD32* __restrict__ ptr1 =
(WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
(WORD32* __restrict__)kernels::allocate_temp_memory(
ctx, num_elm * sizeof(int));

WORD32* __restrict__ pin1 =
(WORD32* __restrict__)b.const_data_ptr<float>();
@@ -349,3 +354,4 @@ Tensor& pow_Scalar_out(
} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
6 changes: 4 additions & 2 deletions backends/cadence/hifi/operators/op_where.cpp
@@ -109,8 +109,10 @@ Tensor& where_out(

if (con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] ||
con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) {
void* p_scratch =
malloc(out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]);
void* p_scratch = (void*)kernels::allocate_temp_memory(
ctx,
(out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]) *
sizeof(int));
const unsigned char* p_brd_cond = (const unsigned char*)p_scratch;
xa_nn_broadcast_8_8(
(WORD8* __restrict__)p_brd_cond,
28 changes: 27 additions & 1 deletion examples/portable/executor_runner/executor_runner.cpp
@@ -30,8 +30,16 @@
#include <executorch/runtime/platform/log.h>
#include <executorch/runtime/platform/runtime.h>

#if __XTENSA__
#include <stdio.h>
#include <sys/times.h>
#include <xtensa/sim.h>
#endif

static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB

static uint8_t temp_allocator_pool[1024U * 1024U];

DEFINE_string(
model_path,
"model.pte",
@@ -120,6 +128,10 @@ int main(int argc, char** argv) {
MemoryAllocator method_allocator{
MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};

// Temporary memory required by kernels
MemoryAllocator temp_allocator{
MemoryAllocator(sizeof(temp_allocator_pool), temp_allocator_pool)};

// The memory-planned buffers will back the mutable tensors used by the
// method. The sizes of these buffers were determined ahead of time during the
// memory-planning passes.
@@ -144,7 +156,7 @@

// Assemble all of the allocators into the MemoryManager that the Executor
// will use.
MemoryManager memory_manager(&method_allocator, &planned_memory);
MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator);

//
// Load the method from the program, using the provided allocators. Running
@@ -170,8 +182,22 @@
(uint32_t)inputs.error());
ET_LOG(Info, "Inputs prepared.");

#if __XTENSA__
struct tms start, stop;
xt_iss_client_command("all", "disable");
xt_iss_client_command("all", "enable");
times(&start);
#endif

// Run the model.
Error status = method->execute();

#if __XTENSA__
times(&stop);
xt_iss_client_command("all", "disable");
ET_LOG(Info, "Execute cycles = %ld", (stop.tms_utime - start.tms_utime));
#endif

ET_CHECK_MSG(
status == Error::Ok,
"Execution of method %s failed with status 0x%" PRIx32,
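A note on the runner changes: the third argument to MemoryManager is the allocator used for kernel temporary allocations (KernelRuntimeContext::allocate_temp), so passing &temp_allocator is what gives the HiFi kernels' new allocate_temp_memory calls a pool, the 1 MB temp_allocator_pool above, to draw from. The #if __XTENSA__ blocks bracket method->execute() with times() and ISS client commands so that a cycle count for the run can be logged on Xtensa targets.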
