From dabb14ee61b48bc527dbee292790d59d1ee633aa Mon Sep 17 00:00:00 2001
From: Zingo Andersen <zingo.andersen@arm.com>
Date: Mon, 14 Oct 2024 16:48:35 +0200
Subject: [PATCH] Arm backend: Add devtools support to example

New flags on run.sh
--etdump
  Build in etdump and profiling; the etdump is base64 encoded and put in
  the log
--debug_build
  Build debug instead of release
--extra_build_flags
  Extra flags to pass to cmake; this makes it possible, for example, to
  override the allocator pool size or other build time cmake flags.

The devtools build has been updated so FLATCC_EXECUTABLE can be used
to point out the executable.

Signed-off-by: Zingo Andersen <zingo.andersen@arm.com>
Change-Id: Ic0fb1e48ee633c5fe91473bdc2db9e894b2fc4fa
---
 CMakeLists.txt                                |  15 +-
 backends/arm/runtime/ArmBackendEthosU.cpp     |  70 ++++++-
 devtools/CMakeLists.txt                       |   8 +-
 examples/arm/CMakeLists.txt                   |   5 +
 examples/arm/aot_arm_compiler.py              |   4 +-
 .../patches/0001-Move-rodata-to-the-DDR.patch |   3 +-
 examples/arm/executor_runner/CMakeLists.txt   |  45 ++++-
 .../executor_runner/arm_executor_runner.cpp   | 116 +++++++++--
 .../arm/executor_runner/arm_perf_monitor.cpp  |   8 +-
 examples/arm/run.sh                           | 190 +++++++++++++-----
 10 files changed, 379 insertions(+), 85 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d79b49e6e..ac8950bc1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -673,10 +673,17 @@ if(EXECUTORCH_BUILD_XNNPACK)
 endif()
 
 if(EXECUTORCH_BUILD_DEVTOOLS)
-  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
-      ON
-      CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
-  )
+  if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
+    set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
+        ON
+        CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
+    )
+  else()
+    set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
+        OFF
+        CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
+    )
+  endif()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
 endif()
 
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
index a14c42140e..2cc716391b 100644
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -15,6 +15,65 @@
 
 #include <ethosu_driver.h>
 
+#if defined(ET_EVENT_TRACER_ENABLED)
+#include <executorch/runtime/core/event_tracer.h>
+#include <executorch/runtime/core/event_tracer_hooks.h>
+using executorch::runtime::EventTracer;
+using executorch::runtime::EventTracerEntry;
+
+/**
+ * RAII helper that opens a profiling event on construction and closes it on
+ * destruction. A null tracer (no tracer attached to the method) turns the
+ * scope into a no-op instead of dereferencing a null pointer.
+ */
+class EventTraceScope {
+ public:
+  EventTraceScope(EventTracer* event_tracer_, const char* name)
+      : event_tracer(event_tracer_) {
+    if (event_tracer != nullptr) {
+      event_tracer_entry_scope = event_tracer->start_profiling(name);
+    }
+  }
+  ~EventTraceScope() {
+    if (event_tracer != nullptr) {
+      event_tracer->end_profiling(event_tracer_entry_scope);
+    }
+  }
+
+  // Copying would end the same profiling event twice.
+  EventTraceScope(const EventTraceScope&) = delete;
+  EventTraceScope& operator=(const EventTraceScope&) = delete;
+
+ private:
+  EventTracer* event_tracer;
+  EventTracerEntry event_tracer_entry_scope{};
+};
+
+// Profile the rest of the enclosing C++ scope under NAME.
+#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \
+  EventTraceScope event_tracer_scope(EVENTTRACER, NAME)
+// Begin an event, storing its entry in SCOPE; no-op for a null tracer.
+#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) \
+  do {                                                  \
+    if ((EVENTTRACER) != nullptr) {                     \
+      (SCOPE) = (EVENTTRACER)->start_profiling(NAME);   \
+    }                                                   \
+  } while (0)
+// End the event stored in SCOPE; no-op for a null tracer.
+#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \
+  do {                                          \
+    if ((EVENTTRACER) != nullptr) {             \
+      (EVENTTRACER)->end_profiling(SCOPE);      \
+    }                                           \
+  } while (0)
+
+#else
+// Event tracing compiled out: the profiling macros expand to nothing.
+#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME)
+#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME)
+#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE)
+#endif
+
 #include <executorch/backends/arm/runtime/VelaBinStream.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
@@ -109,20 +142,38 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
       BackendExecutionContext& context,
       DelegateHandle* input_handle,
       EValue** args) const override {
+#if defined(ET_EVENT_TRACER_ENABLED)
+    EventTracer* event_tracer = context.event_tracer();
+    EventTracerEntry event_tracer_local_scope;
+#endif
+
+    EXECUTORCH_PROF_SCOPE(event_tracer, "ArmBackend::execute()");
+    ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;
+
     ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle;
     VelaHandles handles;
 
-    ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;
     // Command stream - we know at this point it's aligned
+    EXECUTORCH_PROF_START(
+        event_tracer,
+        event_tracer_local_scope,
+        "+ArmBackend::execute()processed_data");
     char* data = (char*)execution_handle->processed->data();
+    EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
+
     ET_LOG(Debug, "ArmBackend::execute %p", data);
 
+    EXECUTORCH_PROF_START(
+        event_tracer,
+        event_tracer_local_scope,
+        "+ArmBackend::execute()vela_bin_read()");
     // Read key sections from the vela_bin_stream
     if (vela_bin_read(data, &handles, execution_handle->processed->size()) ==
         false) {
       ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout");
       return Error::InvalidProgram;
     }
+    EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
 
     ET_LOG(
         Debug,
@@ -186,6 +237,9 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
 
       // Select a compatible copy routine
       if (both_char and permuted_input_shape) {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer,
+            "+ArmBackend::execute()handles.input.permute_CHW_to_HWC()");
         // permuted byte copy CHW to HWC
         permute_CHW_to_HWC(
             tensor_in.mutable_data_ptr<char>(),
@@ -194,6 +248,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
             tensor_in.size(2),
             tensor_in.size(3));
       } else if (both_char or both_int) {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer, "+ArmBackend::execute()handles.input.memcpy()");
         // Sizes match and elt size matches so memcpy
         memcpy(
             scratch_addr,
@@ -234,7 +290,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
         (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data};
     size_t bases_size[2] = {
         handles.weight_data_size, handles.scratch_data_size};
-    int result = ethosu_invoke_v3(
+    int result = 0;
+    EXECUTORCH_PROF_START(
+        event_tracer, event_tracer_local_scope, "+ArmBackend::execute()NPU");
+    result = ethosu_invoke_v3(
         driver.get(),
         (void*)handles.cmd_data,
         handles.cmd_data_size,
@@ -242,6 +301,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
         bases_size,
         2, /* fixed array of pointers to binary interface*/
         nullptr);
+    EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
 
     if (result != 0) {
       ET_LOG(
@@ -277,6 +337,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
           &permuted_output_shape));
       if (tensor_out.scalar_type() == ScalarType::Char and
           permuted_output_shape) {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer,
+            "+ArmBackend::execute()handles.output.permute_HWC_to_CHW()");
+
         char* output_address = (char*)output_addr;
         permute_HWC_to_CHW(
             output_address,
@@ -285,6 +349,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
             tensor_out.size(2),
             tensor_out.size(3));
       } else {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer, "+ArmBackend::execute()handles.output.move()");
         for (int j = 0; j < tensor_out.numel(); j++) {
           if (tensor_out.scalar_type() == ScalarType::Char) {
             char* output_address = (char*)output_addr;
diff --git a/devtools/CMakeLists.txt b/devtools/CMakeLists.txt
index df4bacb802..58043067a5 100644
--- a/devtools/CMakeLists.txt
+++ b/devtools/CMakeLists.txt
@@ -13,12 +13,14 @@ cmake_minimum_required(VERSION 3.19)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)
+
 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
 endif()
 
 if(NOT FLATCC_EXECUTABLE)
-  set(FLATCC_EXECUTABLE flatcc)
+  set(FLATCC_EXECUTABLE ${_flatcc_source_dir}/bin/flatcc)
 endif()
 
 # Source root directory for executorch.
@@ -66,7 +68,7 @@ set(FLATCC_DEBUG_CLANG_SANITIZE
     OFF
     CACHE BOOL ""
 )
-set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)
+
 add_subdirectory(${_flatcc_source_dir} ${CMAKE_BINARY_DIR}/third-party/flatcc)
 
 # Fix for "relocation R_X86_64_32 against `.rodata' can not be used when making
@@ -163,7 +165,7 @@ add_custom_command(
     # Note that the flatcc project actually writes its outputs into the source
     # tree instead of under the binary directory, and there's no way to change
     # that behavior.
-    ${_flatcc_source_dir}/bin/flatcc -cwr -o
+    ${FLATCC_EXECUTABLE} -cwr -o
     ${_program_schema__include_dir}/executorch/devtools/etdump
     ${_etdump_schema__srcs}
   COMMAND rm -rf ${_etdump_schema_cleanup_paths}
diff --git a/examples/arm/CMakeLists.txt b/examples/arm/CMakeLists.txt
index e0a8186b46..0c754beaaa 100644
--- a/examples/arm/CMakeLists.txt
+++ b/examples/arm/CMakeLists.txt
@@ -57,3 +57,8 @@ generate_bindings_for_kernels(
 gen_operators_lib(
   LIB_NAME "arm_portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch
 )
+
+if(EXECUTORCH_ENABLE_EVENT_TRACER)
+  target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED)
+  target_compile_options(portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED)
+endif()
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 7fc2cf0564..6713c9b6f8 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -273,7 +273,7 @@ def get_compile_spec(
                 target,
                 system_config="Ethos_U55_High_End_Embedded",
                 memory_mode="Shared_Sram",
-                extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate",
+                extra_flags="--debug-force-regor --output-format=raw --verbose-operators",
             )
             .set_permute_memory_format(True)
             .set_quantize_io(True)
@@ -286,7 +286,7 @@ def get_compile_spec(
                 target,
                 system_config="Ethos_U85_SYS_DRAM_Mid",
                 memory_mode="Shared_Sram",
-                extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate",
+                extra_flags="--output-format=raw --verbose-operators",
             )
             .set_permute_memory_format(True)
             .set_quantize_io(True)
diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch
index 4467185ae7..9b47aa4e3a 100644
--- a/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch
+++ b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch
@@ -20,11 +20,12 @@ index b458fc6..8d4bc73 100644
  
      KEEP(*(.eh_frame*))
    } > ITCM :rom_exec
-@@ -280,7 +280,7 @@ SECTIONS
+@@ -280,7 +280,8 @@ SECTIONS
  #endif
      * (expected_output_data_sec)
      * (sec_command_stream, sec_weight_data, sec_input_data)
 -
++    *(.got*)
 +    *(.rodata*)
      * (ethosu_core_in_queue)
      * (ethosu_core_out_queue)
diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt
index 7da3462924..ea180f4d23 100644
--- a/examples/arm/executor_runner/CMakeLists.txt
+++ b/examples/arm/executor_runner/CMakeLists.txt
@@ -10,7 +10,6 @@ option(SEMIHOSTING "Enable semihosting" OFF)
 option(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" OFF)
 option(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE to specify temp alloction pool size" OFF)
 
-
 if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING})
   message(
     FATAL_ERROR
@@ -220,10 +219,8 @@ target_sources(
 # Include the target's bare-metal linker script
 ethosu_eval_link_options(arm_executor_runner)
 
-# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for
-# bin size as we link in a number of other symbols
-target_link_libraries(
-  arm_executor_runner
+set(arm_executor_runner_link)
+list(APPEND arm_executor_runner_link
   extension_runner_util
   ethosu_target_init
   executorch
@@ -237,6 +234,44 @@ target_link_libraries(
   -Xlinker -Map=arm_executor_runner.map
 )
 
+if(EXECUTORCH_ENABLE_EVENT_TRACER)
+  target_compile_options(arm_executor_runner PUBLIC -DET_EVENT_TRACER_ENABLED)
+
+  add_library(etdump STATIC IMPORTED)
+  set_property(
+      TARGET etdump
+      PROPERTY IMPORTED_LOCATION
+           "${ET_BUILD_DIR_PATH}/lib/libetdump.a"
+  )
+
+  if(CMAKE_BUILD_TYPE MATCHES "Debug")
+    set(FLATCCRT_LIB flatccrt_d)
+  else()
+    set(FLATCCRT_LIB flatccrt)
+  endif()
+
+  add_library(${FLATCCRT_LIB} STATIC IMPORTED)
+  set_property(
+      TARGET ${FLATCCRT_LIB}
+      PROPERTY IMPORTED_LOCATION
+           "${ET_BUILD_DIR_PATH}/lib/lib${FLATCCRT_LIB}.a"
+  )
+
+  list(APPEND arm_executor_runner_link
+    etdump
+    ${FLATCCRT_LIB}
+  )
+endif()
+
+# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for
+# bin size as we link in a number of other symbols
+target_link_libraries(
+  arm_executor_runner
+  ${arm_executor_runner_link}
+)
+
+target_link_options( arm_executor_runner PUBLIC LINKER:-Map=arm_executor_runner.map )
+
 # ET headers and generated headers includes
 target_include_directories(
   arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR}
diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp
index 5b9945be7a..e9c758a60b 100644
--- a/examples/arm/executor_runner/arm_executor_runner.cpp
+++ b/examples/arm/executor_runner/arm_executor_runner.cpp
@@ -21,8 +21,14 @@
 #include <executorch/runtime/platform/runtime.h>
 
 #include "arm_perf_monitor.h"
+#if defined(ET_EVENT_TRACER_ENABLED)
+#include <executorch/devtools/etdump/etdump_flatcc.h>
+#if !defined(SEMIHOSTING)
+#include <executorch/third-party/flatcc/include/flatcc/portable/pbase64.h>
+#endif
+#endif
 
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
 
 /**
  * The input_file_allocation_pool should be large enough to fit the various
@@ -75,7 +81,10 @@ using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::Tag;
 using executorch::runtime::TensorInfo;
-
+#if defined(ET_EVENT_TRACER_ENABLED)
+using executorch::etdump::ETDumpGen;
+using executorch::etdump::ETDumpResult;
+#endif
 /**
  * The method_allocation_pool should be large enough to fit the setup, input
  * used and other data used like the planned memory pool (e.g. memory-planned
@@ -84,8 +93,8 @@ using executorch::runtime::TensorInfo;
  * large models if you run on HW this should be lowered to fit into your
  * availible memory.
  */
-#ifndef ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE
-#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (20 * 1024 * 1024)
+#if !defined(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE)
+#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (60 * 1024 * 1024)
 #endif
 const size_t method_allocation_pool_size =
     ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE;
@@ -99,7 +108,7 @@ unsigned char __attribute__((
  * Currently a MemoryAllocator is used but a PlatformMemoryAllocator is probably
  * a better fit
  */
-#ifndef ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE
+#if !defined(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE)
 #define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (1 * 1024 * 1024)
 #endif
 const size_t temp_allocation_pool_size =
@@ -108,16 +117,40 @@ unsigned char __attribute__((
     section("input_data_sec"),
     aligned(16))) temp_allocation_pool[temp_allocation_pool_size];
 
-void et_pal_init(void) {}
+void et_pal_init(void) {
+  // Enable ARM PMU Clock
+  ARM_PMU_Enable();
+  DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable
+  ARM_PMU_CYCCNT_Reset();
+  ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk);
+}
+
+/**
+ * Implementation of the et_pal_<funcs>()
+ *
+ * These functions are hardware adaptation functions for things like
+ * time/logging/memory allocation that could call your RTOS or need to
+ * be implemented in some other way.
+ */
 
 ET_NORETURN void et_pal_abort(void) {
-#ifndef SEMIHOSTING
+#if !defined(SEMIHOSTING)
   __builtin_trap();
 #else
   _exit(-1);
 #endif
 }
 
+et_timestamp_t et_pal_current_ticks(void) {
+  return ARM_PMU_Get_CCNTR();
+}
+
+et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) {
+  // Since we don't know the CPU freq for your target and just count cycles in
+  // the FVP for et_pal_current_ticks() we return a conversion ratio of 1
+  return {1, 1};
+}
+
 /**
  * Emit a log message via platform output (serial port, console, etc).
  */
@@ -133,6 +166,18 @@ void et_pal_emit_log_message(
       stderr, "%c [executorch:%s:%zu] %s\n", level, filename, line, message);
 }
 
+/**
+ * Dynamic memory allocators intended to be used by temp_allocator
+ * to implement malloc()/free() type of allocations.
+ * Currently not used.
+ */
+
+void* et_pal_allocate(ET_UNUSED size_t size) {
+  return nullptr;
+}
+
+void et_pal_free(ET_UNUSED void* ptr) {}
+
 namespace {
 
 // Setup our own allocator that can show some extra stuff like used and free
@@ -181,7 +226,7 @@ Result<BufferCleanup> prepare_input_tensors(
   size_t num_inputs = method_meta.num_inputs();
   size_t num_allocated = 0;
 
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
   ET_CHECK_OR_RETURN_ERROR(
       input_buffers.size() > 0 && num_inputs == input_buffers.size(),
       InvalidArgument,
@@ -267,7 +312,7 @@ Result<BufferCleanup> prepare_input_tensors(
   return BufferCleanup({inputs, num_allocated});
 }
 
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
 
 std::pair<char*, size_t> read_binary_file(
     const char* filename,
@@ -304,7 +349,7 @@ std::pair<char*, size_t> read_binary_file(
 } // namespace
 
 int main(int argc, const char* argv[]) {
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
   ET_LOG(Info, "Running executor with parameter:");
   if (argc < 7) {
     ET_LOG(Fatal, "Not right number of parameters!");
@@ -327,7 +372,7 @@ int main(int argc, const char* argv[]) {
   std::vector<std::pair<char*, size_t>> input_buffers;
   size_t pte_size = sizeof(model_pte);
 
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
   const char* output_basename = nullptr;
   ArmMemoryAllocator input_file_allocator(
       input_file_allocation_pool_size, input_file_allocation_pool);
@@ -432,7 +477,16 @@ int main(int argc, const char* argv[]) {
 
   size_t method_loaded_membase = method_allocator.used_size();
 
-  Result<Method> method = program->load_method(method_name, &memory_manager);
+  executorch::runtime::EventTracer* event_tracer_ptr = nullptr;
+
+#if defined(ET_EVENT_TRACER_ENABLED)
+  ETDumpGen etdump_gen; // executorch::etdump::ETDumpGen, see using-declaration
+  event_tracer_ptr = &etdump_gen;
+#endif
+  // event_tracer_ptr stays nullptr when event tracing is compiled out.
+  Result<Method> method =
+      program->load_method(method_name, &memory_manager, event_tracer_ptr);
+
   if (!method.ok()) {
     ET_LOG(
         Info,
@@ -468,7 +522,7 @@ int main(int argc, const char* argv[]) {
   size_t executor_memsize = method_allocator.used_size() - executor_membase;
 
   ET_LOG(Info, "model_pte_loaded_size:     %lu bytes.", pte_size);
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
   if (input_file_allocator.size() > 0) {
     ET_LOG(
         Info,
@@ -520,7 +574,7 @@ int main(int argc, const char* argv[]) {
   ET_CHECK(status == Error::Ok);
   for (int i = 0; i < outputs.size(); ++i) {
     Tensor t = outputs[i].toTensor();
-#ifndef SEMIHOSTING
+#if !defined(SEMIHOSTING)
     // The output might be collected and parsed so printf() is used instead
     // of ET_LOG() here
     for (int j = 0; j < outputs[i].toTensor().numel(); ++j) {
@@ -538,6 +592,39 @@ int main(int argc, const char* argv[]) {
             outputs[i].toTensor().const_data_ptr<float>()[j]);
       }
     }
+#if defined(ET_EVENT_TRACER_ENABLED)
+    // NOTE(review): this sits inside the per-output loop, so the etdump is
+    // emitted once per output tensor - confirm that is intended.
+    ETDumpResult result = etdump_gen.get_etdump_data();
+    if (result.buf != nullptr && result.size > 0) {
+      // On a device with no file system we can't just write it out
+      // to the file-system so we base64 encode it and dump it on the log.
+      int mode = 0;
+      size_t len = result.size;
+      size_t encoded_len = base64_encoded_size(result.size, mode);
+      uint8_t* encoded_buf = reinterpret_cast<uint8_t*>(
+          method_allocator.allocate(encoded_len + 1));
+      if (encoded_buf == nullptr) {
+        // Allocation failed; skip the dump instead of writing through nullptr.
+        ET_LOG(
+            Error,
+            "Could not allocate %zu bytes to base64 encode the etdump",
+            encoded_len + 1);
+      } else {
+        int ret = base64_encode(
+            encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode);
+        if (ret != 0) {
+          ET_LOG(Error, "base64 encoding of the etdump failed: %d", ret);
+        } else {
+          encoded_buf[encoded_len] = 0x00; // Ensure null termination
+          ET_LOG(Info, "Writing etdump.bin [base64]");
+          printf(
+              "#---\nbase64 -i -d <<<\"\\\n%s\\\n\" >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin  --source_time_scale cycles --target_time_scale cycles\n#---\n",
+              encoded_buf);
+        }
+      }
+    }
+#endif
 #else
     char out_filename[255];
     snprintf(out_filename, 255, "%s-%d.bin", output_basename, i);
@@ -549,11 +622,29 @@ int main(int argc, const char* argv[]) {
         outputs[i].toTensor().nbytes(),
         out_file);
     fclose(out_file);
+#if defined(ET_EVENT_TRACER_ENABLED)
+    ETDumpResult result = etdump_gen.get_etdump_data();
+    if (result.buf != nullptr && result.size > 0) {
+      // On a device with a file system we can just write it out
+      // to the file-system.
+      const char* etdump_filename = "etdump.bin";
+      ET_LOG(Info, "Writing etdump to file: %s", etdump_filename);
+      // The etdump is binary data, so open the file in binary mode.
+      FILE* f = fopen(etdump_filename, "wb");
+      if (f == nullptr) {
+        ET_LOG(Error, "Could not open %s for writing", etdump_filename);
+      } else {
+        fwrite((uint8_t*)result.buf, 1, result.size, f);
+        fclose(f);
+      }
+      free(result.buf);
+    }
+#endif
 #endif
   }
 out:
   ET_LOG(Info, "Program complete, exiting.");
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
   _exit(0);
 #endif
   ET_LOG(Info, "\04");
diff --git a/examples/arm/executor_runner/arm_perf_monitor.cpp b/examples/arm/executor_runner/arm_perf_monitor.cpp
index b75e510d9d..38868e8a1d 100644
--- a/examples/arm/executor_runner/arm_perf_monitor.cpp
+++ b/examples/arm/executor_runner/arm_perf_monitor.cpp
@@ -15,6 +15,7 @@
 #include <pmu_ethosu.h>
 
 static uint32_t ethosu_inference_count = 0;
+static uint64_t ethosu_ArmCycleCountStart = 0;
 static uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0;
 static uint64_t ethosu_ArmBackendExecuteCycleCount = 0;
 static uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0;
@@ -119,17 +120,14 @@ void StartMeasurements() {
   for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) {
     ethosu_pmuEventCounts[i] = 0;
   }
-  ARM_PMU_Enable();
-  DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable
-  ARM_PMU_CYCCNT_Reset();
-  ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk);
+  ethosu_ArmCycleCountStart = ARM_PMU_Get_CCNTR();
 }
 
 void StopMeasurements() {
   ARM_PMU_CNTR_Disable(
       PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk |
       PMU_CNTENCLR_CNT1_ENABLE_Msk);
-  uint32_t cycle_count = ARM_PMU_Get_CCNTR();
+  uint32_t cycle_count = ARM_PMU_Get_CCNTR() - ethosu_ArmCycleCountStart;
 
   // Number of comand streams handled by the NPU
   ET_LOG(Info, "NPU Inferences : %d", ethosu_inference_count);
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index cbc96c4b11..daab39ffc6 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -20,13 +20,16 @@ script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 root_dir=${script_dir}/ethos-u-scratch
 
 model_name=""
-reorder_inputs=""
 aot_arm_compiler_flags="--delegate --quantize"
+portable_kernels="aten::_softmax.out"
 target="ethos-u55-128"
 output_folder_set=false
 output_folder="."
+build_with_etdump=false
+build_type="Release"
+extra_build_flags=""
 build_only=false
-portable_kernels="aten::_softmax.out"
+reorder_inputs=""
 
 help() {
     echo "Usage: $(basename $0) [options]"
@@ -36,6 +39,9 @@ help() {
     echo "  --portable_kernels=<OPS>               Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}"
     echo "  --target=<TARGET>                      Target to build and run for Default: ${target}"
     echo "  --output=<FOLDER>                      Output folder Default: ${output_folder}"
+    echo "  --etdump                               Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
+    echo "  --debug_build                          Build with debug flag, default is Release"
+    echo "  --extra_build_flags=<FLAGS>            Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none "
     echo "  --build_only                           Only build, don't run FVP"
     echo "  --scratch-dir=<FOLDER>                 Path to your Ethos-U scrach dir if you not using default"
     echo "  --reorder_inputs=<FLAGS>               Reorder the inputs. This can be required when inputs > 1."
@@ -50,6 +56,9 @@ for arg in "$@"; do
       --portable_kernels=*) portable_kernels="${arg#*=}";;
       --target=*) target="${arg#*=}";;
       --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;;
+      --etdump) build_with_etdump=true ;;
+      --debug_build) build_type="Debug" ;;
+      --extra_build_flags=*) extra_build_flags="${arg#*=}";;
       --build_only) build_only=true ;;
       --scratch-dir=*) root_dir="${arg#*=}";;
       --reorder_inputs=*) reorder_inputs="${arg#*=}";;
@@ -94,6 +103,7 @@ if ! [[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.[a-zA-Z0-9_]*out)*$ ]];
 fi
 
 # Generate a pte file
+# The stdout of this function is the pte filename, so any other echo must be avoided or redirected to stderr (e.g. >&2)
 function generate_pte_file() {
     [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting model and model_compiler_flags flag, got, $*"; exit 1; }
     local model=${1}
@@ -115,33 +125,14 @@ function generate_pte_file() {
     # We are using the aot_lib from build_quantization_aot_lib below
     SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT})
 
-    python3 -m examples.arm.aot_arm_compiler --model_name="${model}" --target=${target} ${model_compiler_flags}  --reorder_inputs=${reorder_inputs} --output ${output_folder} --so_library="$SO_LIB" 1>&2
+    # Keep the command in an array so arguments with spaces stay intact; only model_compiler_flags is split on purpose.
+    local ARM_AOT_CMD=(python3 -m examples.arm.aot_arm_compiler --model_name="${model}" --target="${target}" ${model_compiler_flags} --reorder_inputs="${reorder_inputs}" --output "${output_folder}" --so_library="${SO_LIB}")
+    echo "CALL ${ARM_AOT_CMD[*]}" >&2
+    "${ARM_AOT_CMD[@]}" 1>&2
     [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; }
     echo "${pte_file}"
 }
 
-# Build .so library to register quant ops with AoT flow
-function build_quantization_aot_lib()
-{
-    SITE_PACKAGES="$(python3 -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-    CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
-
-    cd $et_root_dir
-    mkdir -p cmake-out-aot-lib
-    cmake \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DEXECUTORCH_BUILD_XNNPACK=OFF \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \
-        -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-        -DPYTHON_EXECUTABLE=python3 \
-        -Bcmake-out-aot-lib \
-        "${et_root_dir}"
-
-    cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib
-}
-
-
 # build ExecuTorch Libraries
 function build_executorch() {
     set -x
@@ -151,32 +142,86 @@ function build_executorch() {
     mkdir -p "${et_build_dir}"
 
     cd "${et_root_dir}"
+
+    build_with_etdump_flags=""
+    if [ "$build_with_etdump" = true ] ; then
+        ( set +x ;
+            echo "--------------------------------------------------------------------------------" ;
+            echo "Build ExecuTorch Libraries host flatcc bin ${build_type} into ${et_root_dir} - cmake-out-host-tools/bin/flatcc" ;
+            echo "--------------------------------------------------------------------------------" )
+
+
+        # Build host flatcc bin
+        mkdir -p cmake-out-host-tools
+        cmake                                                 \
+            -DCMAKE_INSTALL_PREFIX=${et_build_dir}            \
+            -DCMAKE_BUILD_TYPE=${build_type}                  \
+            -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF            \
+            -DEXECUTORCH_ENABLE_LOGGING=ON                    \
+            -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON               \
+            -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON           \
+            -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON       \
+            -DEXECUTORCH_BUILD_DEVTOOLS=ON                    \
+            -DEXECUTORCH_ENABLE_EVENT_TRACER=ON               \
+            -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=ON      \
+            -DFLATCC_ALLOW_WERROR=OFF                         \
+            -DFLATC_EXECUTABLE="$(which flatc)"               \
+            ${extra_build_flags}                              \
+            -Bcmake-out-host-tools                            \
+            "${et_root_dir}"
+
+        mkdir -p cmake-out-host-tools/bin
+        cp third-party/flatcc/bin/flatcc cmake-out-host-tools/bin
+
+        build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON                    \
+                                 -DEXECUTORCH_ENABLE_EVENT_TRACER=ON               \
+                                 -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF     \
+                                 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF      \
+                                 -DFLATCC_ALLOW_WERROR=OFF                         \
+                                 -DFLATCC_EXECUTABLE=${et_root_dir}/cmake-out-host-tools/bin/flatcc "
+    fi
+
+    ( set +x ;
+        echo "--------------------------------------------------------------------------------" ;
+        echo "Build ExecuTorch Libraries target libs with --target install ${build_type} into '${et_root_dir}' - '${et_build_dir}'" ;
+        echo "--------------------------------------------------------------------------------" )
+
+    # Build
     cmake                                                 \
         -DCMAKE_INSTALL_PREFIX=${et_build_dir}            \
+        -DCMAKE_BUILD_TYPE=${build_type}                  \
+        -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}"       \
         -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF            \
-        -DCMAKE_BUILD_TYPE=Release                        \
-        -DEXECUTORCH_ENABLE_LOGGING=ON                    \
         -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON               \
         -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON           \
         -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON       \
+        -DEXECUTORCH_ENABLE_LOGGING=ON                    \
+        ${build_with_etdump_flags}                        \
         -DFLATC_EXECUTABLE="$(which flatc)"               \
-        -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}"       \
+        ${extra_build_flags}                              \
         -B${et_build_dir}                                 \
         "${et_root_dir}"
 
     echo "[${FUNCNAME[0]}] Configured CMAKE"
 
-    cmake --build ${et_build_dir} --parallel --target install --config Release
+    cmake --build ${et_build_dir} --parallel --target install --config ${build_type} --
+
+    ( set +x ;
+        echo "--------------------------------------------------------------------------------" ;
+        echo "Build ExecuTorch Libraries ${build_type} into '${et_root_dir}/examples/arm' - '${et_build_dir}/examples/arm'" ;
+        echo "--------------------------------------------------------------------------------" )
 
     cmake                                                 \
         -DCMAKE_INSTALL_PREFIX=${et_build_dir}            \
-        -DCMAKE_BUILD_TYPE=Release                        \
+        -DCMAKE_BUILD_TYPE=${build_type}                  \
+        -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}"       \
         -DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels}  \
         -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON               \
-        -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}"       \
-        -B"${et_build_dir}"/examples/arm                  \
-        "${et_root_dir}"/examples/arm
-    cmake --build ${et_build_dir}/examples/arm --parallel --
+        ${extra_build_flags}                              \
+        -B"${et_build_dir}/examples/arm"                  \
+        "${et_root_dir}/examples/arm"
+
+    cmake --build "${et_build_dir}/examples/arm" --parallel --config ${build_type} --
 
     set +x
 
@@ -185,6 +230,40 @@ function build_executorch() {
     find . -name "*.a" -exec ls -al {} \;
 }
 
+# Build the .so library that registers quant ops with the AoT flow; configures
+# ${et_root_dir} with CMake into cmake-out-aot-lib and builds quantized_ops_aot_lib.
+function build_quantization_aot_lib()
+{
+    # distutils was removed in Python 3.12 (PEP 632); sysconfig is its replacement.
+    SITE_PACKAGES="$(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')"
+    CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
+    cd "${et_root_dir}"
+    mkdir -p cmake-out-aot-lib
+    echo "--------------------------------------------------------------------------------"
+    echo "Build .so library to register quant ops with AoT flow ${build_type} into '${et_root_dir}' - 'cmake-out-aot-lib'"
+    echo "--------------------------------------------------------------------------------"
+
+    build_with_etdump_flags=""
+    if [ "$build_with_etdump" = true ] ; then
+        build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON        \
+                                 -DEXECUTORCH_ENABLE_EVENT_TRACER=ON "
+    fi
+    # The *_flags variables stay unquoted on purpose so they split into separate args.
+    cmake \
+        -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH"    \
+        -DCMAKE_BUILD_TYPE=${build_type}            \
+        -DEXECUTORCH_BUILD_XNNPACK=OFF              \
+        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON     \
+        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \
+        ${build_with_etdump_flags}                  \
+        -DPYTHON_EXECUTABLE="$(which python3)"      \
+        ${extra_build_flags}                        \
+        -Bcmake-out-aot-lib                         \
+        "${et_root_dir}"
+
+    cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib
+}
+
 # build Arm Baremetal executor_runner
 function build_executorch_runner() {
     echo "[${FUNCNAME[0]}] Generating ExecuTorch libraries"
@@ -192,22 +271,37 @@ function build_executorch_runner() {
     local pte=${1}
     if [[ ${target} == *"ethos-u55"*  ]]; then
         local target_cpu=cortex-m55
-	local target_board=corstone-300
+      local target_board=corstone-300
     else
         local target_cpu=cortex-m85
-	local target_board=corstone-320
+      local target_board=corstone-320
     fi
+    echo "--------------------------------------------------------------------------------"
+    echo "Build Arm Baremetal executor_runner for ${target} - '${executor_runner_path}/cmake-out'"
+    echo "--------------------------------------------------------------------------------"
+
     cd ${script_dir}/executor_runner
-    cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake}     \
-	  -DTARGET_CPU=${target_cpu}                    \
-	  -DTARGET_BOARD=${target_board}                \
-	  -DETHOSU_TARGET_NPU_CONFIG=${target}          \
-	  -B ${executor_runner_path}/cmake-out          \
-	  -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir}     \
-	  -DET_DIR_PATH:PATH=${et_root_dir}             \
-	  -DET_BUILD_DIR_PATH:PATH=${et_build_dir}      \
-	  -DET_PTE_FILE_PATH:PATH="${pte}"              \
-	  -DPYTHON_EXECUTABLE=$(which python3)
+
+    build_with_etdump_flags=""
+    if [ "$build_with_etdump" = true ] ; then
+        build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON "
+    fi
+
+    cmake \
+      -DCMAKE_BUILD_TYPE=${build_type}            \
+      -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake}   \
+      -DTARGET_CPU=${target_cpu}                  \
+      -DTARGET_BOARD=${target_board}              \
+      -DET_DIR_PATH:PATH=${et_root_dir}           \
+      -DET_BUILD_DIR_PATH:PATH=${et_build_dir}    \
+      -DET_PTE_FILE_PATH:PATH="${pte}"            \
+      -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir}   \
+      -DETHOSU_TARGET_NPU_CONFIG=${target}        \
+      ${build_with_etdump_flags}                  \
+      -DPYTHON_EXECUTABLE=$(which python3)        \
+      ${extra_build_flags}                        \
+      -B ${executor_runner_path}/cmake-out
+
     echo "[${FUNCNAME[0]}] Configured CMAKE"
 
     cmake --build ${executor_runner_path}/cmake-out --parallel -- arm_executor_runner
@@ -235,7 +329,7 @@ function run_fvp() {
             -C mps3_board.uart0.out_file='-'                    \
             -C mps3_board.uart0.shutdown_on_eot=1               \
             -a "${elf}"                                         \
-            --timelimit 120 || true # seconds
+            --timelimit 220 || true # seconds
         echo "[${FUNCNAME[0]}] Simulation complete, $?"
     elif [[ ${target} == *"ethos-u85"*  ]]; then
         echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}"
@@ -247,7 +341,7 @@ function run_fvp() {
             -C mps4_board.uart0.out_file='-'                    \
             -C mps4_board.uart0.shutdown_on_eot=1               \
             -a "${elf}"                                         \
-            --timelimit 120 || true # seconds
+            --timelimit 220 || true # seconds
         echo "[${FUNCNAME[0]}] Simulation complete, $?"
     else
         echo "Running ${elf} for ${target} is not supported"