diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d79b49e6e..ac8950bc1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -673,10 +673,17 @@ if(EXECUTORCH_BUILD_XNNPACK) endif() if(EXECUTORCH_BUILD_DEVTOOLS) - set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER - ON - CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE - ) + if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL) + set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER + ON + CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE + ) + else() + set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER + OFF + CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE + ) + endif() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index a14c42140e..2cc716391b 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -15,6 +15,39 @@ #include +#if defined(ET_EVENT_TRACER_ENABLED) +#include +#include +using executorch::runtime::EventTracer; +using executorch::runtime::EventTracerEntry; +// RAII profiling scope: starts an event on construction and ends it on destruction; a null tracer makes it a no-op. +class EventTraceScope { + public: + EventTraceScope(EventTracer* event_tracer_, const char* name) { + event_tracer = event_tracer_; + if (event_tracer != nullptr) { event_tracer_entry_scope = event_tracer->start_profiling(name); } + } + ~EventTraceScope() { + if (event_tracer != nullptr) { event_tracer->end_profiling(event_tracer_entry_scope); } + } + + private: + EventTracer* event_tracer; + EventTracerEntry event_tracer_entry_scope; +}; +#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \ + EventTraceScope event_tracer_scope = EventTraceScope(EVENTTRACER, NAME) +#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) \ + do { if ((EVENTTRACER) != nullptr) { (SCOPE) = (EVENTTRACER)->start_profiling(NAME); } } while (0) +#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \ + do { if ((EVENTTRACER) != nullptr) { (EVENTTRACER)->end_profiling(SCOPE); } } while (0) + +#else +#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) +#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) +#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) +#endif + #include #include #include @@ -109,20 +142,38 
@@ class ArmBackend final : public ::executorch::runtime::BackendInterface { BackendExecutionContext& context, DelegateHandle* input_handle, EValue** args) const override { +#if defined(ET_EVENT_TRACER_ENABLED) + EventTracer* event_tracer = context.event_tracer(); + EventTracerEntry event_tracer_local_scope; +#endif + + EXECUTORCH_PROF_SCOPE(event_tracer, "ArmBackend::execute()"); + ArmBackendExecuteCallbacks ArmBackend_execute_callbacks; + ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle; VelaHandles handles; - ArmBackendExecuteCallbacks ArmBackend_execute_callbacks; // Command stream - we know at this point it's aligned + EXECUTORCH_PROF_START( + event_tracer, + event_tracer_local_scope, + "+ArmBackend::execute()processed_data"); char* data = (char*)execution_handle->processed->data(); + EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope); + ET_LOG(Debug, "ArmBackend::execute %p", data); + EXECUTORCH_PROF_START( + event_tracer, + event_tracer_local_scope, + "+ArmBackend::execute()vela_bin_read()"); // Read key sections from the vela_bin_stream if (vela_bin_read(data, &handles, execution_handle->processed->size()) == false) { ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout"); return Error::InvalidProgram; } + EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope); ET_LOG( Debug, @@ -186,6 +237,9 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { // Select a compatible copy routine if (both_char and permuted_input_shape) { + EXECUTORCH_PROF_SCOPE( + event_tracer, + "+ArmBackend::execute()handles.input.permute_CHW_to_HWC()"); // permuted byte copy CHW to HWC permute_CHW_to_HWC( tensor_in.mutable_data_ptr(), @@ -194,6 +248,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { tensor_in.size(2), tensor_in.size(3)); } else if (both_char or both_int) { + EXECUTORCH_PROF_SCOPE( + event_tracer, "+ArmBackend::execute()handles.input.memcpy()"); // Sizes match and elt size 
matches so memcpy memcpy( scratch_addr, @@ -234,7 +290,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data}; size_t bases_size[2] = { handles.weight_data_size, handles.scratch_data_size}; - int result = ethosu_invoke_v3( + int result = 0; + EXECUTORCH_PROF_START( + event_tracer, event_tracer_local_scope, "+ArmBackend::execute()NPU"); + result = ethosu_invoke_v3( driver.get(), (void*)handles.cmd_data, handles.cmd_data_size, @@ -242,6 +301,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { bases_size, 2, /* fixed array of pointers to binary interface*/ nullptr); + EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope); if (result != 0) { ET_LOG( @@ -277,6 +337,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { &permuted_output_shape)); if (tensor_out.scalar_type() == ScalarType::Char and permuted_output_shape) { + EXECUTORCH_PROF_SCOPE( + event_tracer, + "+ArmBackend::execute()handles.output.permute_HWC_to_CHW()"); + char* output_address = (char*)output_addr; permute_HWC_to_CHW( output_address, @@ -285,6 +349,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { tensor_out.size(2), tensor_out.size(3)); } else { + EXECUTORCH_PROF_SCOPE( + event_tracer, "+ArmBackend::execute()handles.output.move()"); for (int j = 0; j < tensor_out.numel(); j++) { if (tensor_out.scalar_type() == ScalarType::Char) { char* output_address = (char*)output_addr; diff --git a/devtools/CMakeLists.txt b/devtools/CMakeLists.txt index df4bacb802..58043067a5 100644 --- a/devtools/CMakeLists.txt +++ b/devtools/CMakeLists.txt @@ -13,12 +13,14 @@ cmake_minimum_required(VERSION 3.19) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc) + if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() if(NOT FLATCC_EXECUTABLE) - set(FLATCC_EXECUTABLE flatcc) + 
set(FLATCC_EXECUTABLE ${_flatcc_source_dir}/bin/flatcc) endif() # Source root directory for executorch. @@ -66,7 +68,7 @@ set(FLATCC_DEBUG_CLANG_SANITIZE OFF CACHE BOOL "" ) -set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc) + add_subdirectory(${_flatcc_source_dir} ${CMAKE_BINARY_DIR}/third-party/flatcc) # Fix for "relocation R_X86_64_32 against `.rodata' can not be used when making @@ -163,7 +165,7 @@ add_custom_command( # Note that the flatcc project actually writes its outputs into the source # tree instead of under the binary directory, and there's no way to change # that behavior. - ${_flatcc_source_dir}/bin/flatcc -cwr -o + ${FLATCC_EXECUTABLE} -cwr -o ${_program_schema__include_dir}/executorch/devtools/etdump ${_etdump_schema__srcs} COMMAND rm -rf ${_etdump_schema_cleanup_paths} diff --git a/examples/arm/CMakeLists.txt b/examples/arm/CMakeLists.txt index e0a8186b46..0c754beaaa 100644 --- a/examples/arm/CMakeLists.txt +++ b/examples/arm/CMakeLists.txt @@ -57,3 +57,8 @@ generate_bindings_for_kernels( gen_operators_lib( LIB_NAME "arm_portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch ) + +if(EXECUTORCH_ENABLE_EVENT_TRACER) + target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) + target_compile_options(portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED) +endif() diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 7fc2cf0564..6713c9b6f8 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -273,7 +273,7 @@ def get_compile_spec( target, system_config="Ethos_U55_High_End_Embedded", memory_mode="Shared_Sram", - extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate", + extra_flags="--debug-force-regor --output-format=raw --verbose-operators", ) .set_permute_memory_format(True) .set_quantize_io(True) @@ -286,7 +286,7 @@ def get_compile_spec( target, system_config="Ethos_U85_SYS_DRAM_Mid", 
memory_mode="Shared_Sram", - extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate", + extra_flags="--output-format=raw --verbose-operators", ) .set_permute_memory_format(True) .set_quantize_io(True) diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch index 4467185ae7..9b47aa4e3a 100644 --- a/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch +++ b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch @@ -20,11 +20,12 @@ index b458fc6..8d4bc73 100644 KEEP(*(.eh_frame*)) } > ITCM :rom_exec -@@ -280,7 +280,7 @@ SECTIONS +@@ -280,7 +280,8 @@ SECTIONS #endif * (expected_output_data_sec) * (sec_command_stream, sec_weight_data, sec_input_data) - ++ *(.got*) + *(.rodata*) * (ethosu_core_in_queue) * (ethosu_core_out_queue) diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 7da3462924..ea180f4d23 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -10,7 +10,6 @@ option(SEMIHOSTING "Enable semihosting" OFF) option(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" OFF) option(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE to specify temp alloction pool size" OFF) - if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING}) message( FATAL_ERROR @@ -220,10 +219,8 @@ target_sources( # Include the target's bare-metal linker script ethosu_eval_link_options(arm_executor_runner) -# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for -# bin size as we link in a number of other symbols -target_link_libraries( - arm_executor_runner +set(arm_executor_runner_link) +list(APPEND arm_executor_runner_link 
extension_runner_util ethosu_target_init executorch @@ -237,6 +234,44 @@ target_link_libraries( -Xlinker -Map=arm_executor_runner.map ) +if(EXECUTORCH_ENABLE_EVENT_TRACER) + target_compile_options(arm_executor_runner PUBLIC -DET_EVENT_TRACER_ENABLED) + + add_library(etdump STATIC IMPORTED) + set_property( + TARGET etdump + PROPERTY IMPORTED_LOCATION + "${ET_BUILD_DIR_PATH}/lib/libetdump.a" + ) + + if(CMAKE_BUILD_TYPE MATCHES "Debug") + set(FLATCCRT_LIB flatccrt_d) + else() + set(FLATCCRT_LIB flatccrt) + endif() + + add_library(${FLATCCRT_LIB} STATIC IMPORTED) + set_property( + TARGET ${FLATCCRT_LIB} + PROPERTY IMPORTED_LOCATION + "${ET_BUILD_DIR_PATH}/lib/lib${FLATCCRT_LIB}.a" + ) + + list(APPEND arm_executor_runner_link + etdump + ${FLATCCRT_LIB} + ) +endif() + +# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for +# bin size as we link in a number of other symbols +target_link_libraries( + arm_executor_runner + ${arm_executor_runner_link} +) + +target_link_options( arm_executor_runner PUBLIC LINKER:-Map=arm_executor_runner.map ) + # ET headers and generated headers includes target_include_directories( arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR} diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index 5b9945be7a..e9c758a60b 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -21,8 +21,14 @@ #include #include "arm_perf_monitor.h" +#if defined(ET_EVENT_TRACER_ENABLED) +#include +#if !defined(SEMIHOSTING) +#include +#endif +#endif -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) /** * The input_file_allocation_pool should be large enough to fit the various @@ -75,7 +81,10 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::Tag; using executorch::runtime::TensorInfo; - +#if defined(ET_EVENT_TRACER_ENABLED) +using 
executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +#endif /** * The method_allocation_pool should be large enough to fit the setup, input * used and other data used like the planned memory pool (e.g. memory-planned @@ -84,8 +93,8 @@ using executorch::runtime::TensorInfo; * large models if you run on HW this should be lowered to fit into your * availible memory. */ -#ifndef ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE -#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (20 * 1024 * 1024) +#if !defined(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE) +#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (60 * 1024 * 1024) #endif const size_t method_allocation_pool_size = ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE; @@ -99,7 +108,7 @@ unsigned char __attribute__(( * Currently a MemoryAllocator is used but a PlatformMemoryAllocator is probably * a better fit */ -#ifndef ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE +#if !defined(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE) #define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (1 * 1024 * 1024) #endif const size_t temp_allocation_pool_size = @@ -108,16 +117,40 @@ unsigned char __attribute__(( section("input_data_sec"), aligned(16))) temp_allocation_pool[temp_allocation_pool_size]; -void et_pal_init(void) {} +void et_pal_init(void) { + // Enable ARM PMU Clock + ARM_PMU_Enable(); + DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable + ARM_PMU_CYCCNT_Reset(); + ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); +} + +/** + * Implementation of the et_pal_*() hooks + * + * These functions are hardware adaptation functions for things like + * time/logging/memory allocation that could call into your RTOS or need to + * be implemented in some other way. 
+ */ ET_NORETURN void et_pal_abort(void) { -#ifndef SEMIHOSTING +#if !defined(SEMIHOSTING) __builtin_trap(); #else _exit(-1); #endif } +et_timestamp_t et_pal_current_ticks(void) { + return ARM_PMU_Get_CCNTR(); +} + +et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) { + // Since we don't know the CPU freq for your target and just count cycles in the + // FVP for et_pal_current_ticks(), we return a conversion ratio of 1 + return {1, 1}; +} + /** * Emit a log message via platform output (serial port, console, etc). */ @@ -133,6 +166,18 @@ void et_pal_emit_log_message( stderr, "%c [executorch:%s:%zu] %s\n", level, filename, line, message); } +/** + * Dynamic memory allocators intended to be used by temp_allocator + * to implement malloc()/free() type of allocations. + * Currently not used. + */ + +void* et_pal_allocate(ET_UNUSED size_t size) { + return nullptr; +} + +void et_pal_free(ET_UNUSED void* ptr) {} + namespace { // Setup our own allocator that can show some extra stuff like used and free @@ -181,7 +226,7 @@ Result prepare_input_tensors( size_t num_inputs = method_meta.num_inputs(); size_t num_allocated = 0; -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) ET_CHECK_OR_RETURN_ERROR( input_buffers.size() > 0 && num_inputs == input_buffers.size(), InvalidArgument, @@ -267,7 +312,7 @@ Result prepare_input_tensors( return BufferCleanup({inputs, num_allocated}); } -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) std::pair read_binary_file( const char* filename, @@ -304,7 +349,7 @@ std::pair read_binary_file( } // namespace int main(int argc, const char* argv[]) { -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) ET_LOG(Info, "Running executor with parameter:"); if (argc < 7) { ET_LOG(Fatal, "Not right number of parameters!"); @@ -327,7 +372,7 @@ int main(int argc, const char* argv[]) { std::vector> input_buffers; size_t pte_size = sizeof(model_pte); -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) const char* output_basename = nullptr; ArmMemoryAllocator input_file_allocator( 
input_file_allocation_pool_size, input_file_allocation_pool); @@ -432,7 +477,16 @@ int main(int argc, const char* argv[]) { size_t method_loaded_membase = method_allocator.used_size(); - Result method = program->load_method(method_name, &memory_manager); + executorch::runtime::EventTracer* event_tracer_ptr = nullptr; + +#if defined(ET_EVENT_TRACER_ENABLED) + torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + event_tracer_ptr = &etdump_gen; +#endif + + Result method = + program->load_method(method_name, &memory_manager, event_tracer_ptr); + if (!method.ok()) { ET_LOG( Info, @@ -468,7 +522,7 @@ int main(int argc, const char* argv[]) { size_t executor_memsize = method_allocator.used_size() - executor_membase; ET_LOG(Info, "model_pte_loaded_size: %lu bytes.", pte_size); -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) if (input_file_allocator.size() > 0) { ET_LOG( Info, @@ -520,7 +574,7 @@ int main(int argc, const char* argv[]) { ET_CHECK(status == Error::Ok); for (int i = 0; i < outputs.size(); ++i) { Tensor t = outputs[i].toTensor(); -#ifndef SEMIHOSTING +#if !defined(SEMIHOSTING) // The output might be collected and parsed so printf() is used instead // of ET_LOG() here for (int j = 0; j < outputs[i].toTensor().numel(); ++j) { @@ -538,6 +592,25 @@ int main(int argc, const char* argv[]) { outputs[i].toTensor().const_data_ptr()[j]); } } +#if defined(ET_EVENT_TRACER_ENABLED) + ETDumpResult result = etdump_gen.get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + // On a device with no file system we can't just write it out + // to the file-system so we base64 encode it and dump it on the log. 
+ int mode = 0; + size_t len = result.size; + size_t encoded_len = base64_encoded_size(result.size, mode); + uint8_t* encoded_buf = reinterpret_cast<uint8_t*>( + method_allocator.allocate(encoded_len + 1)); + int ret = base64_encode( + encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode); + encoded_buf[encoded_len] = 0x00; // Ensure null termination + ET_LOG(Info, "Writing etdump.bin [base64]"); + printf( + "#---\nbase64 -i -d <<<\"\\\n%s\\\n\" >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles\n#---\n", + encoded_buf); + } +#endif #else char out_filename[255]; snprintf(out_filename, 255, "%s-%d.bin", output_basename, i); @@ -549,11 +622,29 @@ int main(int argc, const char* argv[]) { outputs[i].toTensor().nbytes(), out_file); fclose(out_file); +#if defined(ET_EVENT_TRACER_ENABLED) + ETDumpResult result = etdump_gen.get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + // On a device with a file system we can just write it out + // to the file-system. 
+ const char* etdump_filename = "etdump.bin"; + ET_LOG(Info, "Writing etdump to file: %s", etdump_filename); + FILE* f = fopen(etdump_filename, "w+"); + // fopen() can fail on the host side; never hand a null FILE* to fwrite()/fclose(). + if (f != nullptr) { + fwrite((uint8_t*)result.buf, 1, result.size, f); + fclose(f); + } else { + ET_LOG(Error, "Could not open %s for writing", etdump_filename); + } + free(result.buf); + } +#endif #endif } out: ET_LOG(Info, "Program complete, exiting."); -#ifdef SEMIHOSTING +#if defined(SEMIHOSTING) _exit(0); #endif ET_LOG(Info, "\04"); diff --git a/examples/arm/executor_runner/arm_perf_monitor.cpp b/examples/arm/executor_runner/arm_perf_monitor.cpp index b75e510d9d..38868e8a1d 100644 --- a/examples/arm/executor_runner/arm_perf_monitor.cpp +++ b/examples/arm/executor_runner/arm_perf_monitor.cpp @@ -15,6 +15,7 @@ #include static uint32_t ethosu_inference_count = 0; +static uint64_t ethosu_ArmCycleCountStart = 0; static uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0; static uint64_t ethosu_ArmBackendExecuteCycleCount = 0; static uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0; @@ -119,17 +120,14 @@ void StartMeasurements() { for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { ethosu_pmuEventCounts[i] = 0; } - ARM_PMU_Enable(); - DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable - ARM_PMU_CYCCNT_Reset(); - ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); + ethosu_ArmCycleCountStart = ARM_PMU_Get_CCNTR(); } void StopMeasurements() { ARM_PMU_CNTR_Disable( PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk | PMU_CNTENCLR_CNT1_ENABLE_Msk); - uint32_t cycle_count = ARM_PMU_Get_CCNTR(); + uint32_t cycle_count = ARM_PMU_Get_CCNTR() - ethosu_ArmCycleCountStart; // Number of comand streams handled by the NPU ET_LOG(Info, "NPU Inferences : %d", ethosu_inference_count); diff --git a/examples/arm/run.sh b/examples/arm/run.sh index cbc96c4b11..daab39ffc6 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -20,13 +20,16 @@ script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) root_dir=${script_dir}/ethos-u-scratch model_name="" -reorder_inputs="" aot_arm_compiler_flags="--delegate 
--quantize" +portable_kernels="aten::_softmax.out" target="ethos-u55-128" output_folder_set=false output_folder="." +build_with_etdump=false +build_type="Release" +extra_build_flags="" build_only=false -portable_kernels="aten::_softmax.out" +reorder_inputs="" help() { echo "Usage: $(basename $0) [options]" @@ -36,6 +39,9 @@ help() { echo " --portable_kernels= Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}" echo " --target= Target to build and run for Default: ${target}" echo " --output= Output folder Default: ${output_folder}" + echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + echo " --debug_build Build with debug flag, default is Release" + echo " --extra_build_flags Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " echo " --build_only Only build, don't run FVP" echo " --scratch-dir= Path to your Ethos-U scrach dir if you not using default" echo " --reorder_inputs= Reorder the inputs. This can be required when inputs > 1." @@ -50,6 +56,9 @@ for arg in "$@"; do --portable_kernels=*) portable_kernels="${arg#*=}";; --target=*) target="${arg#*=}";; --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;; + --etdump) build_with_etdump=true ;; + --debug_build) build_type="Debug" ;; + --extra_build_flags=*) extra_build_flags="${arg#*=}";; --build_only) build_only=true ;; --scratch-dir=*) root_dir="${arg#*=}";; --reorder_inputs=*) reorder_inputs="${arg#*=}";; @@ -94,6 +103,7 @@ if ! [[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.[a-zA-Z0-9_]*out)*$ ]]; fi # Generate a pte file +# output from this function is the pte filename e.g. echo should be avoided or directed to stderr e.g. 
>&2 function generate_pte_file() { [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting model and model_compiler_flags flag, got, $*"; exit 1; } local model=${1} @@ -115,33 +125,14 @@ function generate_pte_file() { # We are using the aot_lib from build_quantization_aot_lib below SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT}) - python3 -m examples.arm.aot_arm_compiler --model_name="${model}" --target=${target} ${model_compiler_flags} --reorder_inputs=${reorder_inputs} --output ${output_folder} --so_library="$SO_LIB" 1>&2 + local ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --reorder_inputs=${reorder_inputs} --output ${output_folder} --so_library=$SO_LIB" + echo "CALL ${ARM_AOT_CMD}" >&2 + ${ARM_AOT_CMD} 1>&2 + [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } echo "${pte_file}" } -# Build .so library to register quant ops with AoT flow -function build_quantization_aot_lib() -{ - SITE_PACKAGES="$(python3 -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" - - cd $et_root_dir - mkdir -p cmake-out-aot-lib - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_XNNPACK=OFF \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \ - -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ - -DPYTHON_EXECUTABLE=python3 \ - -Bcmake-out-aot-lib \ - "${et_root_dir}" - - cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib -} - - # build ExecuTorch Libraries function build_executorch() { set -x @@ -151,32 +142,86 @@ function build_executorch() { mkdir -p "${et_build_dir}" cd "${et_root_dir}" + + build_with_etdump_flags="" + if [ "$build_with_etdump" = true ] ; then + ( set +x ; + echo "--------------------------------------------------------------------------------" ; + echo "Build ExecuTorch Libraries host flatcc bin 
${build_type} into ${et_root_dir} - cmake-out-host-tools/bin/flatcc" ; + echo "--------------------------------------------------------------------------------" ) + + + # Build host flatcc bin + mkdir -p cmake-out-host-tools + cmake \ + -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=ON \ + -DFLATCC_ALLOW_WERROR=OFF \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + ${extra_build_flags} \ + -Bcmake-out-host-tools \ + "${et_root_dir}" + + mkdir -p cmake-out-host-tools/bin + cp third-party/flatcc/bin/flatcc cmake-out-host-tools/bin + + build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF \ + -DFLATCC_ALLOW_WERROR=OFF \ + -DFLATCC_EXECUTABLE=${et_root_dir}/cmake-out-host-tools/bin/flatcc " + fi + + ( set +x ; + echo "--------------------------------------------------------------------------------" ; + echo "Build ExecuTorch Libraries target libs with --target install ${build_type} into '${et_root_dir}' - '${et_build_dir}'" ; + echo "--------------------------------------------------------------------------------" ) + + # Build cmake \ -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + ${build_with_etdump_flags} \ -DFLATC_EXECUTABLE="$(which 
flatc)" \ - -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ + ${extra_build_flags} \ -B${et_build_dir} \ "${et_root_dir}" echo "[${FUNCNAME[0]}] Configured CMAKE" - cmake --build ${et_build_dir} --parallel --target install --config Release + cmake --build ${et_build_dir} --parallel --target install --config ${build_type} -- + + ( set +x ; + echo "--------------------------------------------------------------------------------" ; + echo "Build ExecuTorch Libraries ${build_type} into '${et_root_dir}/examples/arm' - '${et_build_dir}/examples/arm'" ; + echo "--------------------------------------------------------------------------------" ) cmake \ -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ -DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels} \ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ - -B"${et_build_dir}"/examples/arm \ - "${et_root_dir}"/examples/arm - cmake --build ${et_build_dir}/examples/arm --parallel -- + ${extra_build_flags} \ + -B"${et_build_dir}/examples/arm" \ + "${et_root_dir}/examples/arm" + + cmake --build "${et_build_dir}/examples/arm" --parallel --config ${build_type} -- set +x @@ -185,6 +230,40 @@ function build_executorch() { find . 
-name "*.a" -exec ls -al {} \; } +# Build .so library to register quant ops with AoT flow +function build_quantization_aot_lib() +{ + SITE_PACKAGES="$(python3 -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" + + cd $et_root_dir + mkdir -p cmake-out-aot-lib + + echo "--------------------------------------------------------------------------------" + echo "Build .so library to register quant ops with AoT flow ${build_type} into '${et_root_dir}' - 'cmake-out-aot-lib'" + echo "--------------------------------------------------------------------------------" + + build_with_etdump_flags="" + if [ "$build_with_etdump" = true ] ; then + build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " + fi + + cmake \ + -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \ + ${build_with_etdump_flags} \ + -DPYTHON_EXECUTABLE=$(which python3) \ + ${extra_build_flags} \ + -Bcmake-out-aot-lib \ + "${et_root_dir}" + + cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib +} + # build Arm Baremetal executor_runner function build_executorch_runner() { echo "[${FUNCNAME[0]}] Generating ExecuTorch libraries" @@ -192,22 +271,37 @@ function build_executorch_runner() { local pte=${1} if [[ ${target} == *"ethos-u55"* ]]; then local target_cpu=cortex-m55 - local target_board=corstone-300 + local target_board=corstone-300 else local target_cpu=cortex-m85 - local target_board=corstone-320 + local target_board=corstone-320 fi + echo "--------------------------------------------------------------------------------" + echo "Build Arm Baremetal executor_runner for ${target} - '${executor_runner_path}/cmake-out'" + echo "--------------------------------------------------------------------------------" + cd 
${script_dir}/executor_runner - cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ - -DTARGET_CPU=${target_cpu} \ - -DTARGET_BOARD=${target_board} \ - -DETHOSU_TARGET_NPU_CONFIG=${target} \ - -B ${executor_runner_path}/cmake-out \ - -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ - -DET_DIR_PATH:PATH=${et_root_dir} \ - -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ - -DET_PTE_FILE_PATH:PATH="${pte}" \ - -DPYTHON_EXECUTABLE=$(which python3) + + build_with_etdump_flags="" + if [ "$build_with_etdump" = true ] ; then + build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " + fi + + cmake \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ + -DTARGET_CPU=${target_cpu} \ + -DTARGET_BOARD=${target_board} \ + -DET_DIR_PATH:PATH=${et_root_dir} \ + -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ + -DET_PTE_FILE_PATH:PATH="${pte}" \ + -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ + -DETHOSU_TARGET_NPU_CONFIG=${target} \ + ${build_with_etdump_flags} \ + -DPYTHON_EXECUTABLE=$(which python3) \ + ${extra_build_flags} \ + -B ${executor_runner_path}/cmake-out + echo "[${FUNCNAME[0]}] Configured CMAKE" cmake --build ${executor_runner_path}/cmake-out --parallel -- arm_executor_runner @@ -235,7 +329,7 @@ function run_fvp() { -C mps3_board.uart0.out_file='-' \ -C mps3_board.uart0.shutdown_on_eot=1 \ -a "${elf}" \ - --timelimit 120 || true # seconds + --timelimit 220 || true # seconds echo "[${FUNCNAME[0]}] Simulation complete, $?" elif [[ ${target} == *"ethos-u85"* ]]; then echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" @@ -247,7 +341,7 @@ function run_fvp() { -C mps4_board.uart0.out_file='-' \ -C mps4_board.uart0.shutdown_on_eot=1 \ -a "${elf}" \ - --timelimit 120 || true # seconds + --timelimit 220 || true # seconds echo "[${FUNCNAME[0]}] Simulation complete, $?" else echo "Running ${elf} for ${target} is not supported"