diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d79b49e6e..ac8950bc1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -673,10 +673,17 @@ if(EXECUTORCH_BUILD_XNNPACK)
 endif()
 
 if(EXECUTORCH_BUILD_DEVTOOLS)
-  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
-      ON
-      CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
-  )
+  # Devtools forces the data loader extension on, except for Arm bare-metal
+  # builds where it is forced off.
+  if(EXECUTORCH_BUILD_ARM_BAREMETAL)
+    set(_et_devtools_data_loader OFF)
+  else()
+    set(_et_devtools_data_loader ON)
+  endif()
+  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
+      ${_et_devtools_data_loader}
+      CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
+  )
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
 endif()
 
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
index a14c42140e..2cc716391b 100644
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -15,6 +15,55 @@
 
 #include <ethosu_driver.h>
 
+#if defined(ET_EVENT_TRACER_ENABLED)
+#include <executorch/runtime/core/event_tracer.h>
+#include <executorch/runtime/core/event_tracer_hooks.h>
+using executorch::runtime::EventTracer;
+using executorch::runtime::EventTracerEntry;
+
+// RAII helper: starts a profiling event on construction and ends it on
+// destruction. A null tracer is tolerated and turns the scope into a no-op.
+class EventTraceScope {
+ public:
+  EventTraceScope(EventTracer* event_tracer_, const char* name)
+      : event_tracer(event_tracer_), event_tracer_entry_scope() {
+    if (event_tracer != nullptr) {
+      event_tracer_entry_scope = event_tracer->start_profiling(name);
+    }
+  }
+  ~EventTraceScope() {
+    if (event_tracer != nullptr) {
+      event_tracer->end_profiling(event_tracer_entry_scope);
+    }
+  }
+
+ private:
+  EventTracer* event_tracer;
+  EventTracerEntry event_tracer_entry_scope;
+};
+// Profiling macros. Each one does nothing when EVENTTRACER is null, so the
+// backend can run with tracing compiled in but no tracer attached.
+#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \
+  EventTraceScope event_tracer_scope = EventTraceScope(EVENTTRACER, NAME)
+#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) \
+  do {                                                  \
+    if ((EVENTTRACER) != nullptr) {                     \
+      SCOPE = (EVENTTRACER)->start_profiling(NAME);     \
+    }                                                   \
+  } while (0)
+#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \
+  do {                                          \
+    if ((EVENTTRACER) != nullptr) {             \
+      (EVENTTRACER)->end_profiling(SCOPE);      \
+    }                                           \
+  } while (0)
+
+#else
+#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME)
+#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME)
+#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE)
+#endif
+
 #include <executorch/backends/arm/runtime/VelaBinStream.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
@@ -109,20 +142,38 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
       BackendExecutionContext& context,
       DelegateHandle* input_handle,
       EValue** args) const override {
+#if defined(ET_EVENT_TRACER_ENABLED)
+    EventTracer* event_tracer = context.event_tracer();
+    EventTracerEntry event_tracer_local_scope;
+#endif
+
+    EXECUTORCH_PROF_SCOPE(event_tracer, "ArmBackend::execute()");
+    ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;
+
     ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle;
     VelaHandles handles;
 
-    ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;
     // Command stream - we know at this point it's aligned
+    EXECUTORCH_PROF_START(
+        event_tracer,
+        event_tracer_local_scope,
+        "+ArmBackend::execute()processed_data");
     char* data = (char*)execution_handle->processed->data();
+    EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
+
     ET_LOG(Debug, "ArmBackend::execute %p", data);
 
+    EXECUTORCH_PROF_START(
+        event_tracer,
+        event_tracer_local_scope,
+        "+ArmBackend::execute()vela_bin_read()");
     // Read key sections from the vela_bin_stream
     if (vela_bin_read(data, &handles, execution_handle->processed->size()) ==
         false) {
       ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout");
       return Error::InvalidProgram;
     }
+    EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
 
     ET_LOG(
         Debug,
@@ -186,6 +237,9 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
 
       // Select a compatible copy routine
       if (both_char and permuted_input_shape) {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer,
+            "+ArmBackend::execute()handles.input.permute_CHW_to_HWC()");
         // permuted byte copy CHW to HWC
         permute_CHW_to_HWC(
             tensor_in.mutable_data_ptr<char>(),
@@ -194,6 +248,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
             tensor_in.size(2),
             tensor_in.size(3));
       } else if (both_char or both_int) {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer, "+ArmBackend::execute()handles.input.memcpy()");
         // Sizes match and elt size matches so memcpy
         memcpy(
             scratch_addr,
@@ -234,7 +290,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
         (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data};
     size_t bases_size[2] = {
         handles.weight_data_size, handles.scratch_data_size};
-    int result = ethosu_invoke_v3(
+    int result = 0;
+    EXECUTORCH_PROF_START(
+        event_tracer, event_tracer_local_scope, "+ArmBackend::execute()NPU");
+    result = ethosu_invoke_v3(
         driver.get(),
         (void*)handles.cmd_data,
         handles.cmd_data_size,
@@ -242,6 +301,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
         bases_size,
         2, /* fixed array of pointers to binary interface*/
         nullptr);
+    EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
 
     if (result != 0) {
       ET_LOG(
@@ -277,6 +337,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
           &permuted_output_shape));
       if (tensor_out.scalar_type() == ScalarType::Char and
           permuted_output_shape) {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer,
+            "+ArmBackend::execute()handles.output.permute_HWC_to_CHW()");
+
         char* output_address = (char*)output_addr;
         permute_HWC_to_CHW(
             output_address,
@@ -285,6 +349,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
             tensor_out.size(2),
             tensor_out.size(3));
       } else {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer, "+ArmBackend::execute()handles.output.move()");
         for (int j = 0; j < tensor_out.numel(); j++) {
           if (tensor_out.scalar_type() == ScalarType::Char) {
             char* output_address = (char*)output_addr;
diff --git a/devtools/CMakeLists.txt b/devtools/CMakeLists.txt
index df4bacb802..58043067a5 100644
--- a/devtools/CMakeLists.txt
+++ b/devtools/CMakeLists.txt
@@ -13,12 +13,14 @@ cmake_minimum_required(VERSION 3.19)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)
+
 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
 endif()
 
 if(NOT FLATCC_EXECUTABLE)
-  set(FLATCC_EXECUTABLE flatcc)
+  set(FLATCC_EXECUTABLE ${_flatcc_source_dir}/bin/flatcc)
 endif()
 
 # Source root directory for executorch.
@@ -66,7 +68,7 @@ set(FLATCC_DEBUG_CLANG_SANITIZE
     OFF
     CACHE BOOL ""
 )
-set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)
+
 add_subdirectory(${_flatcc_source_dir} ${CMAKE_BINARY_DIR}/third-party/flatcc)
 
 # Fix for "relocation R_X86_64_32 against `.rodata' can not be used when making
@@ -163,7 +165,7 @@ add_custom_command(
     # Note that the flatcc project actually writes its outputs into the source
     # tree instead of under the binary directory, and there's no way to change
     # that behavior.
-    ${_flatcc_source_dir}/bin/flatcc -cwr -o
+    ${FLATCC_EXECUTABLE} -cwr -o
     ${_program_schema__include_dir}/executorch/devtools/etdump
     ${_etdump_schema__srcs}
   COMMAND rm -rf ${_etdump_schema_cleanup_paths}
diff --git a/examples/arm/CMakeLists.txt b/examples/arm/CMakeLists.txt
index e0a8186b46..0c754beaaa 100644
--- a/examples/arm/CMakeLists.txt
+++ b/examples/arm/CMakeLists.txt
@@ -57,3 +57,8 @@ generate_bindings_for_kernels(
 gen_operators_lib(
   LIB_NAME "arm_portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch
 )
+
+if(EXECUTORCH_ENABLE_EVENT_TRACER)
+  target_compile_definitions(executorch INTERFACE ET_EVENT_TRACER_ENABLED)
+  target_compile_definitions(portable_ops_lib INTERFACE ET_EVENT_TRACER_ENABLED)
+endif()
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 7fc2cf0564..6713c9b6f8 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -273,7 +273,7 @@ def get_compile_spec(
                 target,
                 system_config="Ethos_U55_High_End_Embedded",
                 memory_mode="Shared_Sram",
-                extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate",
+                extra_flags="--debug-force-regor --output-format=raw --verbose-operators",
             )
             .set_permute_memory_format(True)
             .set_quantize_io(True)
@@ -286,7 +286,7 @@ def get_compile_spec(
                 target,
                 system_config="Ethos_U85_SYS_DRAM_Mid",
                 memory_mode="Shared_Sram",
-                extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate",
+                extra_flags="--output-format=raw --verbose-operators",
             )
             .set_permute_memory_format(True)
             .set_quantize_io(True)
diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch
index 4467185ae7..9b47aa4e3a 100644
--- a/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch
+++ b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch
@@ -20,11 +20,12 @@ index b458fc6..8d4bc73 100644
  
      KEEP(*(.eh_frame*))
    } > ITCM :rom_exec
-@@ -280,7 +280,7 @@ SECTIONS
+@@ -280,7 +280,8 @@ SECTIONS
  #endif
      * (expected_output_data_sec)
      * (sec_command_stream, sec_weight_data, sec_input_data)
 -
++    *(.got*)
 +    *(.rodata*)
      * (ethosu_core_in_queue)
      * (ethosu_core_out_queue)
diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt
index 7da3462924..ea180f4d23 100644
--- a/examples/arm/executor_runner/CMakeLists.txt
+++ b/examples/arm/executor_runner/CMakeLists.txt
@@ -10,7 +10,6 @@ option(SEMIHOSTING "Enable semihosting" OFF)
 option(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" OFF)
 option(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE to specify temp alloction pool size" OFF)
 
-
 if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING})
   message(
     FATAL_ERROR
@@ -220,10 +219,8 @@ target_sources(
 # Include the target's bare-metal linker script
 ethosu_eval_link_options(arm_executor_runner)
 
-# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for
-# bin size as we link in a number of other symbols
-target_link_libraries(
-  arm_executor_runner
+set(arm_executor_runner_link)
+list(APPEND arm_executor_runner_link
   extension_runner_util
   ethosu_target_init
   executorch
@@ -237,6 +234,44 @@ target_link_libraries(
   -Xlinker -Map=arm_executor_runner.map
 )
 
+if(EXECUTORCH_ENABLE_EVENT_TRACER)
+  # Enable the event-tracer code paths when compiling the runner.
+  target_compile_definitions(arm_executor_runner PUBLIC ET_EVENT_TRACER_ENABLED)
+
+  # etdump archive expected under ${ET_BUILD_DIR_PATH}/lib.
+  add_library(etdump STATIC IMPORTED)
+  set_target_properties(
+    etdump PROPERTIES IMPORTED_LOCATION "${ET_BUILD_DIR_PATH}/lib/libetdump.a"
+  )
+
+  # Debug builds link the flatccrt_d archive instead of flatccrt.
+  if(CMAKE_BUILD_TYPE MATCHES "Debug")
+    set(FLATCCRT_LIB flatccrt_d)
+  else()
+    set(FLATCCRT_LIB flatccrt)
+  endif()
+
+  add_library(${FLATCCRT_LIB} STATIC IMPORTED)
+  set_target_properties(
+    ${FLATCCRT_LIB} PROPERTIES IMPORTED_LOCATION
+                               "${ET_BUILD_DIR_PATH}/lib/lib${FLATCCRT_LIB}.a"
+  )
+
+  list(APPEND arm_executor_runner_link
+    etdump
+    ${FLATCCRT_LIB}
+  )
+endif()
+
+# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for
+# bin size as we link in a number of other symbols
+target_link_libraries(
+  arm_executor_runner
+  ${arm_executor_runner_link}
+)
+
+target_link_options( arm_executor_runner PUBLIC LINKER:-Map=arm_executor_runner.map )
+
 # ET headers and generated headers includes
 target_include_directories(
   arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR}
diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp
index 5b9945be7a..e9c758a60b 100644
--- a/examples/arm/executor_runner/arm_executor_runner.cpp
+++ b/examples/arm/executor_runner/arm_executor_runner.cpp
@@ -21,8 +21,14 @@
 #include <executorch/runtime/platform/runtime.h>
 
 #include "arm_perf_monitor.h"
+#if defined(ET_EVENT_TRACER_ENABLED)
+#include <executorch/devtools/etdump/etdump_flatcc.h>
+#if !defined(SEMIHOSTING)
+#include <executorch/third-party/flatcc/include/flatcc/portable/pbase64.h>
+#endif
+#endif
 
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
 
 /**
  * The input_file_allocation_pool should be large enough to fit the various
@@ -75,7 +81,10 @@ using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::Tag;
 using executorch::runtime::TensorInfo;
-
+#if defined(ET_EVENT_TRACER_ENABLED)
+using executorch::etdump::ETDumpGen;
+using executorch::etdump::ETDumpResult;
+#endif
 /**
  * The method_allocation_pool should be large enough to fit the setup, input
  * used and other data used like the planned memory pool (e.g. memory-planned
@@ -84,8 +93,8 @@ using executorch::runtime::TensorInfo;
  * large models if you run on HW this should be lowered to fit into your
  * availible memory.
  */
-#ifndef ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE
-#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (20 * 1024 * 1024)
+#if !defined(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE)
+#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (60 * 1024 * 1024)
 #endif
 const size_t method_allocation_pool_size =
     ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE;
@@ -99,7 +108,7 @@ unsigned char __attribute__((
  * Currently a MemoryAllocator is used but a PlatformMemoryAllocator is probably
  * a better fit
  */
-#ifndef ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE
+#if !defined(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE)
 #define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (1 * 1024 * 1024)
 #endif
 const size_t temp_allocation_pool_size =
@@ -108,16 +117,40 @@ unsigned char __attribute__((
     section("input_data_sec"),
     aligned(16))) temp_allocation_pool[temp_allocation_pool_size];
 
-void et_pal_init(void) {}
+void et_pal_init(void) {
+  // Enable ARM PMU Clock
+  ARM_PMU_Enable();
+  DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable
+  ARM_PMU_CYCCNT_Reset();
+  ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk);
+}
+
+/**
+ * Implementation of the et_pal_<funcs>()
+ *
+ * These functions are hardware adaptation hooks for things like
+ * time/logging/memory allocation; they can call into your RTOS or
+ * need to be implemented in some other way for your target.
+ */
 
 ET_NORETURN void et_pal_abort(void) {
-#ifndef SEMIHOSTING
+#if !defined(SEMIHOSTING)
   __builtin_trap();
 #else
   _exit(-1);
 #endif
 }
 
+et_timestamp_t et_pal_current_ticks(void) {
+  return ARM_PMU_Get_CCNTR();
+}
+
+et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) {
+  // The CPU frequency of the target is unknown and et_pal_current_ticks()
+  // only reports cycles (e.g. on the FVP), so use a 1:1 conversion ratio.
+  return {1, 1};
+}
+
 /**
  * Emit a log message via platform output (serial port, console, etc).
  */
@@ -133,6 +166,18 @@ void et_pal_emit_log_message(
       stderr, "%c [executorch:%s:%zu] %s\n", level, filename, line, message);
 }
 
+/**
+ * Dynamic memory allocators intended to be used by temp_allocator
+ * to implement malloc()/free() type of allocations.
+ * Currently not used.
+ */
+
+void* et_pal_allocate(ET_UNUSED size_t size) {
+  return nullptr;
+}
+
+void et_pal_free(ET_UNUSED void* ptr) {}
+
 namespace {
 
 // Setup our own allocator that can show some extra stuff like used and free
@@ -181,7 +226,7 @@ Result<BufferCleanup> prepare_input_tensors(
   size_t num_inputs = method_meta.num_inputs();
   size_t num_allocated = 0;
 
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
   ET_CHECK_OR_RETURN_ERROR(
       input_buffers.size() > 0 && num_inputs == input_buffers.size(),
       InvalidArgument,
@@ -267,7 +312,7 @@ Result<BufferCleanup> prepare_input_tensors(
   return BufferCleanup({inputs, num_allocated});
 }
 
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
 
 std::pair<char*, size_t> read_binary_file(
     const char* filename,
@@ -304,7 +349,7 @@ std::pair<char*, size_t> read_binary_file(
 } // namespace
 
 int main(int argc, const char* argv[]) {
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
   ET_LOG(Info, "Running executor with parameter:");
   if (argc < 7) {
     ET_LOG(Fatal, "Not right number of parameters!");
@@ -327,7 +372,7 @@ int main(int argc, const char* argv[]) {
   std::vector<std::pair<char*, size_t>> input_buffers;
   size_t pte_size = sizeof(model_pte);
 
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
   const char* output_basename = nullptr;
   ArmMemoryAllocator input_file_allocator(
       input_file_allocation_pool_size, input_file_allocation_pool);
@@ -432,7 +477,16 @@ int main(int argc, const char* argv[]) {
 
   size_t method_loaded_membase = method_allocator.used_size();
 
-  Result<Method> method = program->load_method(method_name, &memory_manager);
+  executorch::runtime::EventTracer* event_tracer_ptr = nullptr;
+
+#if defined(ET_EVENT_TRACER_ENABLED)
+  ETDumpGen etdump_gen; // ETDump generator used as the event tracer
+  event_tracer_ptr = &etdump_gen;
+#endif
+
+  Result<Method> method =
+      program->load_method(method_name, &memory_manager, event_tracer_ptr);
+
   if (!method.ok()) {
     ET_LOG(
         Info,
@@ -468,7 +522,7 @@ int main(int argc, const char* argv[]) {
   size_t executor_memsize = method_allocator.used_size() - executor_membase;
 
   ET_LOG(Info, "model_pte_loaded_size:     %lu bytes.", pte_size);
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
   if (input_file_allocator.size() > 0) {
     ET_LOG(
         Info,
@@ -520,7 +574,7 @@ int main(int argc, const char* argv[]) {
   ET_CHECK(status == Error::Ok);
   for (int i = 0; i < outputs.size(); ++i) {
     Tensor t = outputs[i].toTensor();
-#ifndef SEMIHOSTING
+#if !defined(SEMIHOSTING)
     // The output might be collected and parsed so printf() is used instead
     // of ET_LOG() here
     for (int j = 0; j < outputs[i].toTensor().numel(); ++j) {
@@ -538,6 +592,25 @@ int main(int argc, const char* argv[]) {
             outputs[i].toTensor().const_data_ptr<float>()[j]);
       }
     }
+#if defined(ET_EVENT_TRACER_ENABLED)
+    ETDumpResult result = etdump_gen.get_etdump_data();
+    if (result.buf != nullptr && result.size > 0) {
+      // On a device with no file system we can't just write it out
+      // to the file-system so we base64 encode it and dump it on the log.
+      int mode = 0;
+      size_t len = result.size;
+      size_t encoded_len = base64_encoded_size(result.size, mode);
+      uint8_t* encoded_buf = reinterpret_cast<uint8_t*>(
+          method_allocator.allocate(encoded_len + 1));
+      int ret = base64_encode(
+          encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode);
+      encoded_buf[encoded_len] = 0x00; // Ensure null termination
+      ET_LOG(Info, "Writing etdump.bin [base64]");
+      printf(
+          "#---\nbase64 -i -d <<<\"\\\n%s\\\n\" >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin  --source_time_scale cycles --target_time_scale cycles\n#---\n",
+          encoded_buf);
+    }
+#endif
 #else
     char out_filename[255];
     snprintf(out_filename, 255, "%s-%d.bin", output_basename, i);
@@ -549,11 +622,24 @@ int main(int argc, const char* argv[]) {
         outputs[i].toTensor().nbytes(),
         out_file);
     fclose(out_file);
+#if defined(ET_EVENT_TRACER_ENABLED)
+    ETDumpResult result = etdump_gen.get_etdump_data();
+    if (result.buf != nullptr && result.size > 0) {
+      // On a device with a file system we can just write the raw etdump
+      // buffer out to a file.
+      const char* etdump_filename = "etdump.bin";
+      ET_LOG(Info, "Writing etdump to file: %s", etdump_filename);
+      FILE* f = fopen(etdump_filename, "w+");
+      fwrite((uint8_t*)result.buf, 1, result.size, f);
+      fclose(f);
+      free(result.buf);
+    }
+#endif
 #endif
   }
 out:
   ET_LOG(Info, "Program complete, exiting.");
-#ifdef SEMIHOSTING
+#if defined(SEMIHOSTING)
   _exit(0);
 #endif
   ET_LOG(Info, "\04");
diff --git a/examples/arm/executor_runner/arm_perf_monitor.cpp b/examples/arm/executor_runner/arm_perf_monitor.cpp
index b75e510d9d..38868e8a1d 100644
--- a/examples/arm/executor_runner/arm_perf_monitor.cpp
+++ b/examples/arm/executor_runner/arm_perf_monitor.cpp
@@ -15,6 +15,7 @@
 #include <pmu_ethosu.h>
 
 static uint32_t ethosu_inference_count = 0;
+static uint64_t ethosu_ArmCycleCountStart = 0;
 static uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0;
 static uint64_t ethosu_ArmBackendExecuteCycleCount = 0;
 static uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0;
@@ -119,17 +120,14 @@ void StartMeasurements() {
   for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) {
     ethosu_pmuEventCounts[i] = 0;
   }
-  ARM_PMU_Enable();
-  DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable
-  ARM_PMU_CYCCNT_Reset();
-  ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk);
+  ethosu_ArmCycleCountStart = ARM_PMU_Get_CCNTR();
 }
 
 void StopMeasurements() {
   ARM_PMU_CNTR_Disable(
       PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk |
       PMU_CNTENCLR_CNT1_ENABLE_Msk);
-  uint32_t cycle_count = ARM_PMU_Get_CCNTR();
+  uint32_t cycle_count = ARM_PMU_Get_CCNTR() - ethosu_ArmCycleCountStart;
 
   // Number of comand streams handled by the NPU
   ET_LOG(Info, "NPU Inferences : %d", ethosu_inference_count);
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index cbc96c4b11..daab39ffc6 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -20,13 +20,16 @@ script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 root_dir=${script_dir}/ethos-u-scratch
 
 model_name=""
-reorder_inputs=""
 aot_arm_compiler_flags="--delegate --quantize"
+portable_kernels="aten::_softmax.out"
 target="ethos-u55-128"
 output_folder_set=false
 output_folder="."
+build_with_etdump=false
+build_type="Release"
+extra_build_flags=""
 build_only=false
-portable_kernels="aten::_softmax.out"
+reorder_inputs=""
 
 help() {
     echo "Usage: $(basename $0) [options]"
@@ -36,6 +39,9 @@ help() {
     echo "  --portable_kernels=<OPS>               Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}"
     echo "  --target=<TARGET>                      Target to build and run for Default: ${target}"
     echo "  --output=<FOLDER>                      Output folder Default: ${output_folder}"
+    echo "  --etdump                               Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
+    echo "  --debug_build                          Build with debug flag, default is Release"
+    echo "  --extra_build_flags                    Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none "
     echo "  --build_only                           Only build, don't run FVP"
     echo "  --scratch-dir=<FOLDER>                 Path to your Ethos-U scrach dir if you not using default"
     echo "  --reorder_inputs=<FLAGS>               Reorder the inputs. This can be required when inputs > 1."
@@ -50,6 +56,9 @@ for arg in "$@"; do
       --portable_kernels=*) portable_kernels="${arg#*=}";;
       --target=*) target="${arg#*=}";;
       --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;;
+      --etdump) build_with_etdump=true ;;
+      --debug_build) build_type="Debug" ;;
+      --extra_build_flags=*) extra_build_flags="${arg#*=}";;
       --build_only) build_only=true ;;
       --scratch-dir=*) root_dir="${arg#*=}";;
       --reorder_inputs=*) reorder_inputs="${arg#*=}";;
@@ -94,6 +103,7 @@ if ! [[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.[a-zA-Z0-9_]*out)*$ ]];
 fi
 
 # Generate a pte file
+# The stdout of this function is the pte filename, so any other output inside it must be avoided or redirected to stderr (>&2)
 function generate_pte_file() {
     [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting model and model_compiler_flags flag, got, $*"; exit 1; }
     local model=${1}
@@ -115,33 +125,14 @@ function generate_pte_file() {
     # We are using the aot_lib from build_quantization_aot_lib below
     SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT})
 
-    python3 -m examples.arm.aot_arm_compiler --model_name="${model}" --target=${target} ${model_compiler_flags}  --reorder_inputs=${reorder_inputs} --output ${output_folder} --so_library="$SO_LIB" 1>&2
+    # Keep the command as an array so arguments containing spaces stay intact.
+    local ARM_AOT_CMD=(python3 -m examples.arm.aot_arm_compiler --model_name="${model}" --target=${target} ${model_compiler_flags} --reorder_inputs=${reorder_inputs} --output ${output_folder} --so_library="$SO_LIB")
+    echo "CALL ${ARM_AOT_CMD[*]}" >&2
+    "${ARM_AOT_CMD[@]}" 1>&2
     [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; }
     echo "${pte_file}"
 }
 
-# Build .so library to register quant ops with AoT flow
-function build_quantization_aot_lib()
-{
-    SITE_PACKAGES="$(python3 -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-    CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
-
-    cd $et_root_dir
-    mkdir -p cmake-out-aot-lib
-    cmake \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DEXECUTORCH_BUILD_XNNPACK=OFF \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \
-        -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-        -DPYTHON_EXECUTABLE=python3 \
-        -Bcmake-out-aot-lib \
-        "${et_root_dir}"
-
-    cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib
-}
-
-
 # build ExecuTorch Libraries
 function build_executorch() {
     set -x
@@ -151,32 +142,86 @@ function build_executorch() {
     mkdir -p "${et_build_dir}"
 
     cd "${et_root_dir}"
+
+    build_with_etdump_flags=""
+    if [ "$build_with_etdump" = true ] ; then
+        ( set +x ;
+            echo "--------------------------------------------------------------------------------" ;
+            echo "Build ExecuTorch Libraries host flatcc bin ${build_type} into ${et_root_dir} - cmake-out-host-tools/bin/flatcc" ;
+            echo "--------------------------------------------------------------------------------" )
+
+
+        # Build host flatcc bin
+        mkdir -p cmake-out-host-tools
+        cmake                                                 \
+            -DCMAKE_INSTALL_PREFIX=${et_build_dir}            \
+            -DCMAKE_BUILD_TYPE=${build_type}                  \
+            -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF            \
+            -DEXECUTORCH_ENABLE_LOGGING=ON                    \
+            -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON               \
+            -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON           \
+            -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON       \
+            -DEXECUTORCH_BUILD_DEVTOOLS=ON                    \
+            -DEXECUTORCH_ENABLE_EVENT_TRACER=ON               \
+            -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=ON      \
+            -DFLATCC_ALLOW_WERROR=OFF                         \
+            -DFLATC_EXECUTABLE="$(which flatc)"               \
+            ${extra_build_flags}                              \
+            -Bcmake-out-host-tools                            \
+            "${et_root_dir}"
+
+        mkdir -p cmake-out-host-tools/bin
+        cp third-party/flatcc/bin/flatcc cmake-out-host-tools/bin
+
+        build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON                    \
+                                 -DEXECUTORCH_ENABLE_EVENT_TRACER=ON               \
+                                 -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF     \
+                                 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF      \
+                                 -DFLATCC_ALLOW_WERROR=OFF                         \
+                                 -DFLATCC_EXECUTABLE=${et_root_dir}/cmake-out-host-tools/bin/flatcc "
+    fi
+
+    ( set +x ;
+        echo "--------------------------------------------------------------------------------" ;
+        echo "Build ExecuTorch Libraries target libs with --target install ${build_type} into '${et_root_dir}' - '${et_build_dir}'" ;
+        echo "--------------------------------------------------------------------------------" )
+
+    # Build
     cmake                                                 \
         -DCMAKE_INSTALL_PREFIX=${et_build_dir}            \
+        -DCMAKE_BUILD_TYPE=${build_type}                  \
+        -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}"       \
         -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF            \
-        -DCMAKE_BUILD_TYPE=Release                        \
-        -DEXECUTORCH_ENABLE_LOGGING=ON                    \
         -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON               \
         -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON           \
         -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON       \
+        -DEXECUTORCH_ENABLE_LOGGING=ON                    \
+        ${build_with_etdump_flags}                        \
         -DFLATC_EXECUTABLE="$(which flatc)"               \
-        -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}"       \
+        ${extra_build_flags}                              \
         -B${et_build_dir}                                 \
         "${et_root_dir}"
 
     echo "[${FUNCNAME[0]}] Configured CMAKE"
 
-    cmake --build ${et_build_dir} --parallel --target install --config Release
+    cmake --build ${et_build_dir} --parallel --target install --config ${build_type} --
+
+    ( set +x ;
+        echo "--------------------------------------------------------------------------------" ;
+        echo "Build ExecuTorch Libraries ${build_type} into '${et_root_dir}/examples/arm' - '${et_build_dir}/examples/arm'" ;
+        echo "--------------------------------------------------------------------------------" )
 
     cmake                                                 \
         -DCMAKE_INSTALL_PREFIX=${et_build_dir}            \
-        -DCMAKE_BUILD_TYPE=Release                        \
+        -DCMAKE_BUILD_TYPE=${build_type}                  \
+        -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}"       \
         -DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels}  \
         -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON               \
-        -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}"       \
-        -B"${et_build_dir}"/examples/arm                  \
-        "${et_root_dir}"/examples/arm
-    cmake --build ${et_build_dir}/examples/arm --parallel --
+        ${extra_build_flags}                              \
+        -B"${et_build_dir}/examples/arm"                  \
+        "${et_root_dir}/examples/arm"
+
+    cmake --build "${et_build_dir}/examples/arm" --parallel --config ${build_type} --
 
     set +x
 
@@ -185,6 +230,40 @@ function build_executorch() {
     find . -name "*.a" -exec ls -al {} \;
 }
 
+# Build the host .so library (quantized_ops_aot_lib) that registers quant ops with the AoT flow.
+function build_quantization_aot_lib()
+{
+    # Point CMake at the torch package dir; sysconfig replaces distutils, which Python 3.12 removed.
+    SITE_PACKAGES="$(python3 -c 'import sysconfig; print(sysconfig.get_path("purelib"))')"
+    CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
+    # The build tree is created relative to et_root_dir, so stop if that directory cannot be entered.
+    cd "${et_root_dir}" || return 1
+    mkdir -p cmake-out-aot-lib
+    echo "--------------------------------------------------------------------------------"
+    echo "Build .so library to register quant ops with AoT flow ${build_type} into '${et_root_dir}' - 'cmake-out-aot-lib'"
+    echo "--------------------------------------------------------------------------------"
+    # The *_flags variables are expanded unquoted below on purpose so they split into separate cmake arguments.
+    build_with_etdump_flags=""
+    if [ "$build_with_etdump" = true ] ; then
+        build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON        \
+                                 -DEXECUTORCH_ENABLE_EVENT_TRACER=ON "
+    fi
+
+    cmake \
+        -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH"    \
+        -DCMAKE_BUILD_TYPE="${build_type}"          \
+        -DEXECUTORCH_BUILD_XNNPACK=OFF              \
+        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON     \
+        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \
+        ${build_with_etdump_flags}                  \
+        -DPYTHON_EXECUTABLE="$(which python3)"      \
+        ${extra_build_flags}                        \
+        -Bcmake-out-aot-lib                         \
+        "${et_root_dir}"
+    # Build only the AoT library; --target selects it independently of the underlying generator.
+    cmake --build cmake-out-aot-lib --parallel --target quantized_ops_aot_lib
+}
+
 # build Arm Baremetal executor_runner
 function build_executorch_runner() {
     echo "[${FUNCNAME[0]}] Generating ExecuTorch libraries"
@@ -192,22 +271,37 @@ function build_executorch_runner() {
     local pte=${1}
     if [[ ${target} == *"ethos-u55"*  ]]; then
         local target_cpu=cortex-m55
-	local target_board=corstone-300
+      local target_board=corstone-300
     else
         local target_cpu=cortex-m85
-	local target_board=corstone-320
+      local target_board=corstone-320
     fi
+    echo "--------------------------------------------------------------------------------"
+    echo "Build Arm Baremetal executor_runner for ${target} - '${executor_runner_path}/cmake-out'"
+    echo "--------------------------------------------------------------------------------"
+
     cd ${script_dir}/executor_runner
-    cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake}     \
-	  -DTARGET_CPU=${target_cpu}                    \
-	  -DTARGET_BOARD=${target_board}                \
-	  -DETHOSU_TARGET_NPU_CONFIG=${target}          \
-	  -B ${executor_runner_path}/cmake-out          \
-	  -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir}     \
-	  -DET_DIR_PATH:PATH=${et_root_dir}             \
-	  -DET_BUILD_DIR_PATH:PATH=${et_build_dir}      \
-	  -DET_PTE_FILE_PATH:PATH="${pte}"              \
-	  -DPYTHON_EXECUTABLE=$(which python3)
+
+    build_with_etdump_flags=""
+    if [ "$build_with_etdump" = true ] ; then
+        build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON "
+    fi
+
+    cmake \
+      -DCMAKE_BUILD_TYPE=${build_type}            \
+      -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake}   \
+      -DTARGET_CPU=${target_cpu}                  \
+      -DTARGET_BOARD=${target_board}              \
+      -DET_DIR_PATH:PATH=${et_root_dir}           \
+      -DET_BUILD_DIR_PATH:PATH=${et_build_dir}    \
+      -DET_PTE_FILE_PATH:PATH="${pte}"            \
+      -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir}   \
+      -DETHOSU_TARGET_NPU_CONFIG=${target}        \
+      ${build_with_etdump_flags}                  \
+      -DPYTHON_EXECUTABLE=$(which python3)        \
+      ${extra_build_flags}                        \
+      -B ${executor_runner_path}/cmake-out
+
     echo "[${FUNCNAME[0]}] Configured CMAKE"
 
     cmake --build ${executor_runner_path}/cmake-out --parallel -- arm_executor_runner
@@ -235,7 +329,7 @@ function run_fvp() {
             -C mps3_board.uart0.out_file='-'                    \
             -C mps3_board.uart0.shutdown_on_eot=1               \
             -a "${elf}"                                         \
-            --timelimit 120 || true # seconds
+            --timelimit 220 || true # seconds
         echo "[${FUNCNAME[0]}] Simulation complete, $?"
     elif [[ ${target} == *"ethos-u85"*  ]]; then
         echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}"
@@ -247,7 +341,7 @@ function run_fvp() {
             -C mps4_board.uart0.out_file='-'                    \
             -C mps4_board.uart0.shutdown_on_eot=1               \
             -a "${elf}"                                         \
-            --timelimit 120 || true # seconds
+            --timelimit 220 || true # seconds
         echo "[${FUNCNAME[0]}] Simulation complete, $?"
     else
         echo "Running ${elf} for ${target} is not supported"