From dabb14ee61b48bc527dbee292790d59d1ee633aa Mon Sep 17 00:00:00 2001 From: Zingo Andersen Date: Mon, 14 Oct 2024 16:48:35 +0200 Subject: [PATCH] Arm backend: Add devtools support to example New flags on run.sh --etdump Build in etdump and profiling; the etdump is base64 encoded and put in the log --debug_build Build debug instead of release --extra_build_flags Extra flags to pass to cmake; this makes it possible, for example, to override the allocator pool size or other build-time cmake flags. The devtools build has been updated so FLATCC_EXECUTABLE can be used to point to the flatcc executable. Signed-off-by: Zingo Andersen Change-Id: Ic0fb1e48ee633c5fe91473bdc2db9e894b2fc4fa --- CMakeLists.txt | 15 +- backends/arm/runtime/ArmBackendEthosU.cpp | 70 ++++++- devtools/CMakeLists.txt | 8 +- examples/arm/CMakeLists.txt | 5 + examples/arm/aot_arm_compiler.py | 4 +- .../patches/0001-Move-rodata-to-the-DDR.patch | 3 +- examples/arm/executor_runner/CMakeLists.txt | 45 ++++- .../executor_runner/arm_executor_runner.cpp | 116 +++++++++-- .../arm/executor_runner/arm_perf_monitor.cpp | 8 +- examples/arm/run.sh | 190 +++++++++++++----- 10 files changed, 379 insertions(+), 85 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d79b49e6e..ac8950bc1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -673,10 +673,17 @@ if(EXECUTORCH_BUILD_XNNPACK) endif() if(EXECUTORCH_BUILD_DEVTOOLS) - set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER - ON - CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE - ) + if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL) + set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER + ON + CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE + ) + else() + set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER + OFF + CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE + ) + endif() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index a14c42140e..2cc716391b 100644 ---
a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -15,6 +15,39 @@ #include +#if defined(ET_EVENT_TRACER_ENABLED) +#include +#include +using executorch::runtime::EventTracer; +using executorch::runtime::EventTracerEntry; + +class EventTraceScope { + public: + EventTraceScope(EventTracer* event_tracer_, const char* name) { + event_tracer = event_tracer_; + event_tracer_entry_scope = event_tracer->start_profiling(name); + } + ~EventTraceScope() { + event_tracer->end_profiling(event_tracer_entry_scope); + } + + private: + EventTracer* event_tracer; + EventTracerEntry event_tracer_entry_scope; +}; +#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \ + EventTraceScope event_tracer_scope = EventTraceScope(EVENTTRACER, NAME) +#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) \ + SCOPE = EVENTTRACER->start_profiling(NAME) +#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \ + EVENTTRACER->end_profiling(SCOPE) + +#else +#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) +#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) +#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) +#endif + #include #include #include @@ -109,20 +142,38 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { BackendExecutionContext& context, DelegateHandle* input_handle, EValue** args) const override { +#if defined(ET_EVENT_TRACER_ENABLED) + EventTracer* event_tracer = context.event_tracer(); + EventTracerEntry event_tracer_local_scope; +#endif + + EXECUTORCH_PROF_SCOPE(event_tracer, "ArmBackend::execute()"); + ArmBackendExecuteCallbacks ArmBackend_execute_callbacks; + ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle; VelaHandles handles; - ArmBackendExecuteCallbacks ArmBackend_execute_callbacks; // Command stream - we know at this point it's aligned + EXECUTORCH_PROF_START( + event_tracer, + event_tracer_local_scope, + "+ArmBackend::execute()processed_data"); char* data = (char*)execution_handle->processed->data(); 
+ EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope); + ET_LOG(Debug, "ArmBackend::execute %p", data); + EXECUTORCH_PROF_START( + event_tracer, + event_tracer_local_scope, + "+ArmBackend::execute()vela_bin_read()"); // Read key sections from the vela_bin_stream if (vela_bin_read(data, &handles, execution_handle->processed->size()) == false) { ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout"); return Error::InvalidProgram; } + EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope); ET_LOG( Debug, @@ -186,6 +237,9 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { // Select a compatible copy routine if (both_char and permuted_input_shape) { + EXECUTORCH_PROF_SCOPE( + event_tracer, + "+ArmBackend::execute()handles.input.permute_CHW_to_HWC()"); // permuted byte copy CHW to HWC permute_CHW_to_HWC( tensor_in.mutable_data_ptr(), @@ -194,6 +248,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { tensor_in.size(2), tensor_in.size(3)); } else if (both_char or both_int) { + EXECUTORCH_PROF_SCOPE( + event_tracer, "+ArmBackend::execute()handles.input.memcpy()"); // Sizes match and elt size matches so memcpy memcpy( scratch_addr, @@ -234,7 +290,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data}; size_t bases_size[2] = { handles.weight_data_size, handles.scratch_data_size}; - int result = ethosu_invoke_v3( + int result = 0; + EXECUTORCH_PROF_START( + event_tracer, event_tracer_local_scope, "+ArmBackend::execute()NPU"); + result = ethosu_invoke_v3( driver.get(), (void*)handles.cmd_data, handles.cmd_data_size, @@ -242,6 +301,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { bases_size, 2, /* fixed array of pointers to binary interface*/ nullptr); + EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope); if (result != 0) { ET_LOG( @@ -277,6 +337,10 @@ class ArmBackend final 
: public ::executorch::runtime::BackendInterface { &permuted_output_shape)); if (tensor_out.scalar_type() == ScalarType::Char and permuted_output_shape) { + EXECUTORCH_PROF_SCOPE( + event_tracer, + "+ArmBackend::execute()handles.output.permute_HWC_to_CHW()"); + char* output_address = (char*)output_addr; permute_HWC_to_CHW( output_address, @@ -285,6 +349,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { tensor_out.size(2), tensor_out.size(3)); } else { + EXECUTORCH_PROF_SCOPE( + event_tracer, "+ArmBackend::execute()handles.output.move()"); for (int j = 0; j < tensor_out.numel(); j++) { if (tensor_out.scalar_type() == ScalarType::Char) { char* output_address = (char*)output_addr; diff --git a/devtools/CMakeLists.txt b/devtools/CMakeLists.txt index df4bacb802..58043067a5 100644 --- a/devtools/CMakeLists.txt +++ b/devtools/CMakeLists.txt @@ -13,12 +13,14 @@ cmake_minimum_required(VERSION 3.19) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc) + if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() if(NOT FLATCC_EXECUTABLE) - set(FLATCC_EXECUTABLE flatcc) + set(FLATCC_EXECUTABLE ${_flatcc_source_dir}/bin/flatcc) endif() # Source root directory for executorch. @@ -66,7 +68,7 @@ set(FLATCC_DEBUG_CLANG_SANITIZE OFF CACHE BOOL "" ) -set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc) + add_subdirectory(${_flatcc_source_dir} ${CMAKE_BINARY_DIR}/third-party/flatcc) # Fix for "relocation R_X86_64_32 against `.rodata' can not be used when making @@ -163,7 +165,7 @@ add_custom_command( # Note that the flatcc project actually writes its outputs into the source # tree instead of under the binary directory, and there's no way to change # that behavior. 
- ${_flatcc_source_dir}/bin/flatcc -cwr -o + ${FLATCC_EXECUTABLE} -cwr -o ${_program_schema__include_dir}/executorch/devtools/etdump ${_etdump_schema__srcs} COMMAND rm -rf ${_etdump_schema_cleanup_paths} diff --git a/examples/arm/CMakeLists.txt b/examples/arm/CMakeLists.txt index e0a8186b46..0c754beaaa 100644 --- a/examples/arm/CMakeLists.txt +++ b/examples/arm/CMakeLists.txt @@ -57,3 +57,8 @@ generate_bindings_for_kernels( gen_operators_lib( LIB_NAME "arm_portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch ) + +if(EXECUTORCH_ENABLE_EVENT_TRACER) + target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) + target_compile_options(portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED) +endif() diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 7fc2cf0564..6713c9b6f8 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -273,7 +273,7 @@ def get_compile_spec( target, system_config="Ethos_U55_High_End_Embedded", memory_mode="Shared_Sram", - extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate", + extra_flags="--debug-force-regor --output-format=raw --verbose-operators", ) .set_permute_memory_format(True) .set_quantize_io(True) @@ -286,7 +286,7 @@ def get_compile_spec( target, system_config="Ethos_U85_SYS_DRAM_Mid", memory_mode="Shared_Sram", - extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate", + extra_flags="--output-format=raw --verbose-operators", ) .set_permute_memory_format(True) .set_quantize_io(True) diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch index 4467185ae7..9b47aa4e3a 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch @@ -20,11 
+20,12 @@ index b458fc6..8d4bc73 100644 KEEP(*(.eh_frame*)) } > ITCM :rom_exec -@@ -280,7 +280,7 @@ SECTIONS +@@ -280,7 +280,8 @@ SECTIONS #endif * (expected_output_data_sec) * (sec_command_stream, sec_weight_data, sec_input_data) - ++ *(.got*) + *(.rodata*) * (ethosu_core_in_queue) * (ethosu_core_out_queue) diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 7da3462924..ea180f4d23 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -10,7 +10,6 @@ option(SEMIHOSTING "Enable semihosting" OFF) option(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" OFF) option(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE to specify temp alloction pool size" OFF) - if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING}) message( FATAL_ERROR @@ -220,10 +219,8 @@ target_sources( # Include the target's bare-metal linker script ethosu_eval_link_options(arm_executor_runner) -# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for -# bin size as we link in a number of other symbols -target_link_libraries( - arm_executor_runner +set(arm_executor_runner_link) +list(APPEND arm_executor_runner_link extension_runner_util ethosu_target_init executorch @@ -237,6 +234,44 @@ target_link_libraries( -Xlinker -Map=arm_executor_runner.map ) +if(EXECUTORCH_ENABLE_EVENT_TRACER) + target_compile_options(arm_executor_runner PUBLIC -DET_EVENT_TRACER_ENABLED) + + add_library(etdump STATIC IMPORTED) + set_property( + TARGET etdump + PROPERTY IMPORTED_LOCATION + "${ET_BUILD_DIR_PATH}/lib/libetdump.a" + ) + + if(CMAKE_BUILD_TYPE MATCHES "Debug") + set(FLATCCRT_LIB flatccrt_d) + else() + set(FLATCCRT_LIB flatccrt) + endif() + + add_library(${FLATCCRT_LIB} STATIC IMPORTED) + set_property( + TARGET ${FLATCCRT_LIB} + PROPERTY IMPORTED_LOCATION + 
"${ET_BUILD_DIR_PATH}/lib/lib${FLATCCRT_LIB}.a" + ) + + list(APPEND arm_executor_runner_link + etdump + ${FLATCCRT_LIB} + ) +endif() + +# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for +# bin size as we link in a number of other symbols +target_link_libraries( + arm_executor_runner + ${arm_executor_runner_link} +) + +target_link_options( arm_executor_runner PUBLIC LINKER:-Map=arm_executor_runner.map ) + # ET headers and generated headers includes target_include_directories( arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR} diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index 5b9945be7a..e9c758a60b 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -21,8 +21,14 @@ #include #include "arm_perf_monitor.h" +#if defined(ET_EVENT_TRACER_ENABLED) +#include +#if !defined(SEMIHOSTING) +#include +#endif +#endif -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) /** * The input_file_allocation_pool should be large enough to fit the various @@ -75,7 +81,10 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::Tag; using executorch::runtime::TensorInfo; - +#if defined(ET_EVENT_TRACER_ENABLED) +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +#endif /** * The method_allocation_pool should be large enough to fit the setup, input * used and other data used like the planned memory pool (e.g. memory-planned @@ -84,8 +93,8 @@ using executorch::runtime::TensorInfo; * large models if you run on HW this should be lowered to fit into your * availible memory. 
*/ -#ifndef ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE -#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (20 * 1024 * 1024) +#if !defined(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE) +#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (60 * 1024 * 1024) #endif const size_t method_allocation_pool_size = ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE; @@ -99,7 +108,7 @@ unsigned char __attribute__(( * Currently a MemoryAllocator is used but a PlatformMemoryAllocator is probably * a better fit */ -#ifndef ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE +#if !defined(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE) #define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (1 * 1024 * 1024) #endif const size_t temp_allocation_pool_size = @@ -108,16 +117,40 @@ unsigned char __attribute__(( section("input_data_sec"), aligned(16))) temp_allocation_pool[temp_allocation_pool_size]; -void et_pal_init(void) {} +void et_pal_init(void) { + // Enable ARM PMU Clock + ARM_PMU_Enable(); + DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable + ARM_PMU_CYCCNT_Reset(); + ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); +} + +/** + * Implementation of the et_pal_*() functions + * + * These functions are hardware adaptation functions for things like + * time/logging/memory allocation that could call your RTOS or need to + * be implemented in some way. + */ ET_NORETURN void et_pal_abort(void) { -#ifndef SEMIHOSTING +#if !defined(SEMIHOSTING) __builtin_trap(); #else _exit(-1); #endif } +et_timestamp_t et_pal_current_ticks(void) { + return ARM_PMU_Get_CCNTR(); +} + +et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) { + // Since we don't know the CPU freq for your target and just use cycles in the + // FVP for et_pal_current_ticks() we return a conversion ratio of 1 + return {1, 1}; +} + /** * Emit a log message via platform output (serial port, console, etc).
*/ @@ -133,6 +166,18 @@ void et_pal_emit_log_message( stderr, "%c [executorch:%s:%zu] %s\n", level, filename, line, message); } +/** + * Dynamic memory allocators intended to be used by temp_allocator + * to implement malloc()/free() type of allocations. + * Currently not used. + */ + +void* et_pal_allocate(ET_UNUSED size_t size) { + return nullptr; +} + +void et_pal_free(ET_UNUSED void* ptr) {} + namespace { // Setup our own allocator that can show some extra stuff like used and free @@ -181,7 +226,7 @@ Result prepare_input_tensors( size_t num_inputs = method_meta.num_inputs(); size_t num_allocated = 0; -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) ET_CHECK_OR_RETURN_ERROR( input_buffers.size() > 0 && num_inputs == input_buffers.size(), InvalidArgument, @@ -267,7 +312,7 @@ Result prepare_input_tensors( return BufferCleanup({inputs, num_allocated}); } -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) std::pair read_binary_file( const char* filename, @@ -304,7 +349,7 @@ std::pair read_binary_file( } // namespace int main(int argc, const char* argv[]) { -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) ET_LOG(Info, "Running executor with parameter:"); if (argc < 7) { ET_LOG(Fatal, "Not right number of parameters!"); @@ -327,7 +372,7 @@ int main(int argc, const char* argv[]) { std::vector> input_buffers; size_t pte_size = sizeof(model_pte); -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) const char* output_basename = nullptr; ArmMemoryAllocator input_file_allocator( input_file_allocation_pool_size, input_file_allocation_pool); @@ -432,7 +477,16 @@ int main(int argc, const char* argv[]) { size_t method_loaded_membase = method_allocator.used_size(); - Result method = program->load_method(method_name, &memory_manager); + executorch::runtime::EventTracer* event_tracer_ptr = nullptr; + +#if defined(ET_EVENT_TRACER_ENABLED) + torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + event_tracer_ptr = &etdump_gen; +#endif + + Result method =
program->load_method(method_name, &memory_manager, event_tracer_ptr); + if (!method.ok()) { ET_LOG( Info, @@ -468,7 +522,7 @@ int main(int argc, const char* argv[]) { size_t executor_memsize = method_allocator.used_size() - executor_membase; ET_LOG(Info, "model_pte_loaded_size: %lu bytes.", pte_size); -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) if (input_file_allocator.size() > 0) { ET_LOG( Info, @@ -520,7 +574,7 @@ int main(int argc, const char* argv[]) { ET_CHECK(status == Error::Ok); for (int i = 0; i < outputs.size(); ++i) { Tensor t = outputs[i].toTensor(); -#ifndef SEMIHOSTING +#if !defined(SEMIHOSTING) // The output might be collected and parsed so printf() is used instead // of ET_LOG() here for (int j = 0; j < outputs[i].toTensor().numel(); ++j) { @@ -538,6 +592,25 @@ int main(int argc, const char* argv[]) { outputs[i].toTensor().const_data_ptr()[j]); } } +#if defined(ET_EVENT_TRACER_ENABLED) + ETDumpResult result = etdump_gen.get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + // On a device with no file system we can't just write it out + // to the file-system so we base64 encode it and dump it on the log. 
+ int mode = 0; + size_t len = result.size; + size_t encoded_len = base64_encoded_size(result.size, mode); + uint8_t* encoded_buf = reinterpret_cast<uint8_t*>( + method_allocator.allocate(encoded_len + 1)); + int ret = base64_encode( + encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode); + encoded_buf[encoded_len] = 0x00; // Ensure null termination + ET_LOG(Info, "Writing etdump.bin [base64]"); + printf( + "#---\nbase64 -i -d <<<\"\\\n%s\\\n\" >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles\n#---\n", + encoded_buf); + } +#endif #else char out_filename[255]; snprintf(out_filename, 255, "%s-%d.bin", output_basename, i); @@ -549,11 +622,24 @@ int main(int argc, const char* argv[]) { outputs[i].toTensor().nbytes(), out_file); fclose(out_file); +#if defined(ET_EVENT_TRACER_ENABLED) + ETDumpResult result = etdump_gen.get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + // On a device with a file system we can just write it out + // to the file-system.
+ const char* etdump_filename = "etdump.bin"; + ET_LOG(Info, "Writing etdump to file: %s", etdump_filename); + FILE* f = fopen(etdump_filename, "w+"); + fwrite((uint8_t*)result.buf, 1, result.size, f); + fclose(f); + free(result.buf); + } +#endif #endif } out: ET_LOG(Info, "Program complete, exiting."); -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) _exit(0); #endif ET_LOG(Info, "\04"); diff --git a/examples/arm/executor_runner/arm_perf_monitor.cpp b/examples/arm/executor_runner/arm_perf_monitor.cpp index b75e510d9d..38868e8a1d 100644 --- a/examples/arm/executor_runner/arm_perf_monitor.cpp +++ b/examples/arm/executor_runner/arm_perf_monitor.cpp @@ -15,6 +15,7 @@ #include static uint32_t ethosu_inference_count = 0; +static uint64_t ethosu_ArmCycleCountStart = 0; static uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0; static uint64_t ethosu_ArmBackendExecuteCycleCount = 0; static uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0; @@ -119,17 +120,14 @@ void StartMeasurements() { for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { ethosu_pmuEventCounts[i] = 0; } - ARM_PMU_Enable(); - DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable - ARM_PMU_CYCCNT_Reset(); - ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); + ethosu_ArmCycleCountStart = ARM_PMU_Get_CCNTR(); } void StopMeasurements() { ARM_PMU_CNTR_Disable( PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk | PMU_CNTENCLR_CNT1_ENABLE_Msk); - uint32_t cycle_count = ARM_PMU_Get_CCNTR(); + uint32_t cycle_count = ARM_PMU_Get_CCNTR() - ethosu_ArmCycleCountStart; // Number of comand streams handled by the NPU ET_LOG(Info, "NPU Inferences : %d", ethosu_inference_count); diff --git a/examples/arm/run.sh b/examples/arm/run.sh index cbc96c4b11..daab39ffc6 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -20,13 +20,16 @@ script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) root_dir=${script_dir}/ethos-u-scratch model_name="" -reorder_inputs="" aot_arm_compiler_flags="--delegate
--quantize" +portable_kernels="aten::_softmax.out" target="ethos-u55-128" output_folder_set=false output_folder="." +build_with_etdump=false +build_type="Release" +extra_build_flags="" build_only=false -portable_kernels="aten::_softmax.out" +reorder_inputs="" help() { echo "Usage: $(basename $0) [options]" @@ -36,6 +39,9 @@ help() { echo " --portable_kernels= Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}" echo " --target= Target to build and run for Default: ${target}" echo " --output= Output folder Default: ${output_folder}" + echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + echo " --debug_build Build with debug flag, default is Release" + echo " --extra_build_flags Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " echo " --build_only Only build, don't run FVP" echo " --scratch-dir= Path to your Ethos-U scrach dir if you not using default" echo " --reorder_inputs= Reorder the inputs. This can be required when inputs > 1." @@ -50,6 +56,9 @@ for arg in "$@"; do --portable_kernels=*) portable_kernels="${arg#*=}";; --target=*) target="${arg#*=}";; --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;; + --etdump) build_with_etdump=true ;; + --debug_build) build_type="Debug" ;; + --extra_build_flags=*) extra_build_flags="${arg#*=}";; --build_only) build_only=true ;; --scratch-dir=*) root_dir="${arg#*=}";; --reorder_inputs=*) reorder_inputs="${arg#*=}";; @@ -94,6 +103,7 @@ if ! [[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.[a-zA-Z0-9_]*out)*$ ]]; fi # Generate a pte file +# output from this function is the pte filename e.g. echo should be avoided or directed to stderr e.g. 
>&2 function generate_pte_file() { [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting model and model_compiler_flags flag, got, $*"; exit 1; } local model=${1} @@ -115,33 +125,14 @@ function generate_pte_file() { # We are using the aot_lib from build_quantization_aot_lib below SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT}) - python3 -m examples.arm.aot_arm_compiler --model_name="${model}" --target=${target} ${model_compiler_flags} --reorder_inputs=${reorder_inputs} --output ${output_folder} --so_library="$SO_LIB" 1>&2 + local ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --reorder_inputs=${reorder_inputs} --output ${output_folder} --so_library=$SO_LIB" + echo "CALL ${ARM_AOT_CMD}" >&2 + ${ARM_AOT_CMD} 1>&2 + [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } echo "${pte_file}" } -# Build .so library to register quant ops with AoT flow -function build_quantization_aot_lib() -{ - SITE_PACKAGES="$(python3 -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" - - cd $et_root_dir - mkdir -p cmake-out-aot-lib - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_XNNPACK=OFF \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \ - -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ - -DPYTHON_EXECUTABLE=python3 \ - -Bcmake-out-aot-lib \ - "${et_root_dir}" - - cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib -} - - # build ExecuTorch Libraries function build_executorch() { set -x @@ -151,32 +142,86 @@ function build_executorch() { mkdir -p "${et_build_dir}" cd "${et_root_dir}" + + build_with_etdump_flags="" + if [ "$build_with_etdump" = true ] ; then + ( set +x ; + echo "--------------------------------------------------------------------------------" ; + echo "Build ExecuTorch Libraries host flatcc bin 
${build_type} into ${et_root_dir} - cmake-out-host-tools/bin/flatcc" ; + echo "--------------------------------------------------------------------------------" ) + + + # Build host flatcc bin + mkdir -p cmake-out-host-tools + cmake \ + -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=ON \ + -DFLATCC_ALLOW_WERROR=OFF \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + ${extra_build_flags} \ + -Bcmake-out-host-tools \ + "${et_root_dir}" + + mkdir -p cmake-out-host-tools/bin + cp third-party/flatcc/bin/flatcc cmake-out-host-tools/bin + + build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF \ + -DFLATCC_ALLOW_WERROR=OFF \ + -DFLATCC_EXECUTABLE=${et_root_dir}/cmake-out-host-tools/bin/flatcc " + fi + + ( set +x ; + echo "--------------------------------------------------------------------------------" ; + echo "Build ExecuTorch Libraries target libs with --target install ${build_type} into '${et_root_dir}' - '${et_build_dir}'" ; + echo "--------------------------------------------------------------------------------" ) + + # Build cmake \ -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + ${build_with_etdump_flags} \ -DFLATC_EXECUTABLE="$(which 
flatc)" \ - -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ + ${extra_build_flags} \ -B${et_build_dir} \ "${et_root_dir}" echo "[${FUNCNAME[0]}] Configured CMAKE" - cmake --build ${et_build_dir} --parallel --target install --config Release + cmake --build ${et_build_dir} --parallel --target install --config ${build_type} -- + + ( set +x ; + echo "--------------------------------------------------------------------------------" ; + echo "Build ExecuTorch Libraries ${build_type} into '${et_root_dir}/examples/arm' - '${et_build_dir}/examples/arm'" ; + echo "--------------------------------------------------------------------------------" ) cmake \ -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ -DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels} \ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ - -B"${et_build_dir}"/examples/arm \ - "${et_root_dir}"/examples/arm - cmake --build ${et_build_dir}/examples/arm --parallel -- + ${extra_build_flags} \ + -B"${et_build_dir}/examples/arm" \ + "${et_root_dir}/examples/arm" + + cmake --build "${et_build_dir}/examples/arm" --parallel --config ${build_type} -- set +x @@ -185,6 +230,40 @@ function build_executorch() { find . 
-name "*.a" -exec ls -al {} \; } +# Build .so library to register quant ops with AoT flow +function build_quantization_aot_lib() +{ + SITE_PACKAGES="$(python3 -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" + + cd $et_root_dir + mkdir -p cmake-out-aot-lib + + echo "--------------------------------------------------------------------------------" + echo "Build .so library to register quant ops with AoT flow ${build_type} into '${et_root_dir}' - 'cmake-out-aot-lib'" + echo "--------------------------------------------------------------------------------" + + build_with_etdump_flags="" + if [ "$build_with_etdump" = true ] ; then + build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " + fi + + cmake \ + -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \ + ${build_with_etdump_flags} \ + -DPYTHON_EXECUTABLE=$(which python3) \ + ${extra_build_flags} \ + -Bcmake-out-aot-lib \ + "${et_root_dir}" + + cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib +} + # build Arm Baremetal executor_runner function build_executorch_runner() { echo "[${FUNCNAME[0]}] Generating ExecuTorch libraries" @@ -192,22 +271,37 @@ function build_executorch_runner() { local pte=${1} if [[ ${target} == *"ethos-u55"* ]]; then local target_cpu=cortex-m55 - local target_board=corstone-300 + local target_board=corstone-300 else local target_cpu=cortex-m85 - local target_board=corstone-320 + local target_board=corstone-320 fi + echo "--------------------------------------------------------------------------------" + echo "Build Arm Baremetal executor_runner for ${target} - '${executor_runner_path}/cmake-out'" + echo "--------------------------------------------------------------------------------" + cd 
${script_dir}/executor_runner - cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ - -DTARGET_CPU=${target_cpu} \ - -DTARGET_BOARD=${target_board} \ - -DETHOSU_TARGET_NPU_CONFIG=${target} \ - -B ${executor_runner_path}/cmake-out \ - -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ - -DET_DIR_PATH:PATH=${et_root_dir} \ - -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ - -DET_PTE_FILE_PATH:PATH="${pte}" \ - -DPYTHON_EXECUTABLE=$(which python3) + + build_with_etdump_flags="" + if [ "$build_with_etdump" = true ] ; then + build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " + fi + + cmake \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ + -DTARGET_CPU=${target_cpu} \ + -DTARGET_BOARD=${target_board} \ + -DET_DIR_PATH:PATH=${et_root_dir} \ + -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ + -DET_PTE_FILE_PATH:PATH="${pte}" \ + -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ + -DETHOSU_TARGET_NPU_CONFIG=${target} \ + ${build_with_etdump_flags} \ + -DPYTHON_EXECUTABLE=$(which python3) \ + ${extra_build_flags} \ + -B ${executor_runner_path}/cmake-out + echo "[${FUNCNAME[0]}] Configured CMAKE" cmake --build ${executor_runner_path}/cmake-out --parallel -- arm_executor_runner @@ -235,7 +329,7 @@ function run_fvp() { -C mps3_board.uart0.out_file='-' \ -C mps3_board.uart0.shutdown_on_eot=1 \ -a "${elf}" \ - --timelimit 120 || true # seconds + --timelimit 220 || true # seconds echo "[${FUNCNAME[0]}] Simulation complete, $?" elif [[ ${target} == *"ethos-u85"* ]]; then echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" @@ -247,7 +341,7 @@ function run_fvp() { -C mps4_board.uart0.out_file='-' \ -C mps4_board.uart0.shutdown_on_eot=1 \ -a "${elf}" \ - --timelimit 120 || true # seconds + --timelimit 220 || true # seconds echo "[${FUNCNAME[0]}] Simulation complete, $?" else echo "Running ${elf} for ${target} is not supported"