Skip to content

Commit

Permalink
Arm backend: Add devtools support to example
Browse files Browse the repository at this point in the history
New flags on run.sh
--etdump
  Build with etdump and profiling support; the etdump is base64-encoded and
  written to the log
--debug_build
  Build debug instead of release
--extra_build_flags
  Extra flags to pass to cmake; this makes it possible, for example, to
  override the allocator pool size or other build-time cmake flags.

The devtools build has been updated so that FLATCC_EXECUTABLE can be used
to point to the flatcc executable.

Signed-off-by: Zingo Andersen <[email protected]>
Change-Id: Ic0fb1e48ee633c5fe91473bdc2db9e894b2fc4fa
  • Loading branch information
zingo authored and freddan80 committed Dec 17, 2024
1 parent 9d1a310 commit dabb14e
Show file tree
Hide file tree
Showing 10 changed files with 379 additions and 85 deletions.
15 changes: 11 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -673,10 +673,17 @@ if(EXECUTORCH_BUILD_XNNPACK)
endif()

if(EXECUTORCH_BUILD_DEVTOOLS)
  # When devtools are built, the data-loader extension is forced ON for
  # regular builds and forced OFF for Arm bare-metal builds.
  # NOTE(review): presumably the data loader is disabled on bare-metal because
  # it relies on facilities that target does not provide - confirm.
  if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
    set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
        ON
        CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
    )
  else()
    set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
        OFF
        CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
    )
  endif()
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
endif()

Expand Down
70 changes: 68 additions & 2 deletions backends/arm/runtime/ArmBackendEthosU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,39 @@

#include <ethosu_driver.h>

#if defined(ET_EVENT_TRACER_ENABLED)
#include <executorch/runtime/core/event_tracer.h>
#include <executorch/runtime/core/event_tracer_hooks.h>
using executorch::runtime::EventTracer;
using executorch::runtime::EventTracerEntry;

/**
 * Scope guard for a single profiling event.
 *
 * The constructor calls start_profiling(name) on the given tracer and the
 * destructor calls end_profiling() with the entry returned by that call, so
 * the event covers the lifetime of the object.
 *
 * A null tracer is tolerated: in that case neither start_profiling() nor
 * end_profiling() is called and the object does nothing.
 *
 * @param event_tracer_ Tracer to report to; may be nullptr.
 * @param name Name passed to start_profiling().
 */
class EventTraceScope {
 public:
  EventTraceScope(EventTracer* event_tracer_, const char* name)
      : event_tracer(event_tracer_) {
    // Guard against a missing tracer instead of dereferencing nullptr.
    if (event_tracer != nullptr) {
      event_tracer_entry_scope = event_tracer->start_profiling(name);
    }
  }
  ~EventTraceScope() {
    // Only close the event if one was actually opened in the constructor.
    if (event_tracer != nullptr) {
      event_tracer->end_profiling(event_tracer_entry_scope);
    }
  }

 private:
  EventTracer* event_tracer;
  EventTracerEntry event_tracer_entry_scope{};
};
// EXECUTORCH_PROF_SCOPE: declares a local EventTraceScope named
// event_tracer_scope, so the event NAME spans the rest of the enclosing
// scope. Because the variable name is fixed, it can be used at most once per
// scope.
#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \
  EventTraceScope event_tracer_scope = EventTraceScope((EVENTTRACER), (NAME))

// EXECUTORCH_PROF_START: stores the result of start_profiling(NAME) in SCOPE.
// Does nothing when EVENTTRACER is nullptr. Wrapped in do/while(0) so it
// behaves as a single statement; EVENTTRACER is evaluated more than once.
#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) \
  do {                                                  \
    if ((EVENTTRACER) != nullptr) {                     \
      SCOPE = (EVENTTRACER)->start_profiling(NAME);     \
    }                                                   \
  } while (0)

// EXECUTORCH_PROF_END: passes SCOPE to end_profiling(). Does nothing when
// EVENTTRACER is nullptr, mirroring EXECUTORCH_PROF_START.
#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \
  do {                                          \
    if ((EVENTTRACER) != nullptr) {             \
      (EVENTTRACER)->end_profiling(SCOPE);      \
    }                                           \
  } while (0)

#else
#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME)
#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME)
#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE)
#endif

#include <executorch/backends/arm/runtime/VelaBinStream.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
Expand Down Expand Up @@ -109,20 +142,38 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
BackendExecutionContext& context,
DelegateHandle* input_handle,
EValue** args) const override {
#if defined(ET_EVENT_TRACER_ENABLED)
EventTracer* event_tracer = context.event_tracer();
EventTracerEntry event_tracer_local_scope;
#endif

EXECUTORCH_PROF_SCOPE(event_tracer, "ArmBackend::execute()");
ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;

ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle;
VelaHandles handles;

ArmBackendExecuteCallbacks ArmBackend_execute_callbacks;
// Command stream - we know at this point it's aligned
EXECUTORCH_PROF_START(
event_tracer,
event_tracer_local_scope,
"+ArmBackend::execute()processed_data");
char* data = (char*)execution_handle->processed->data();
EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);

ET_LOG(Debug, "ArmBackend::execute %p", data);

EXECUTORCH_PROF_START(
event_tracer,
event_tracer_local_scope,
"+ArmBackend::execute()vela_bin_read()");
// Read key sections from the vela_bin_stream
if (vela_bin_read(data, &handles, execution_handle->processed->size()) ==
false) {
ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout");
return Error::InvalidProgram;
}
EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);

ET_LOG(
Debug,
Expand Down Expand Up @@ -186,6 +237,9 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {

// Select a compatible copy routine
if (both_char and permuted_input_shape) {
EXECUTORCH_PROF_SCOPE(
event_tracer,
"+ArmBackend::execute()handles.input.permute_CHW_to_HWC()");
// permuted byte copy CHW to HWC
permute_CHW_to_HWC(
tensor_in.mutable_data_ptr<char>(),
Expand All @@ -194,6 +248,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
tensor_in.size(2),
tensor_in.size(3));
} else if (both_char or both_int) {
EXECUTORCH_PROF_SCOPE(
event_tracer, "+ArmBackend::execute()handles.input.memcpy()");
// Sizes match and elt size matches so memcpy
memcpy(
scratch_addr,
Expand Down Expand Up @@ -234,14 +290,18 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
(uint64_t)handles.weight_data, (uint64_t)handles.scratch_data};
size_t bases_size[2] = {
handles.weight_data_size, handles.scratch_data_size};
int result = ethosu_invoke_v3(
int result = 0;
EXECUTORCH_PROF_START(
event_tracer, event_tracer_local_scope, "+ArmBackend::execute()NPU");
result = ethosu_invoke_v3(
driver.get(),
(void*)handles.cmd_data,
handles.cmd_data_size,
bases,
bases_size,
2, /* fixed array of pointers to binary interface*/
nullptr);
EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);

if (result != 0) {
ET_LOG(
Expand Down Expand Up @@ -277,6 +337,10 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
&permuted_output_shape));
if (tensor_out.scalar_type() == ScalarType::Char and
permuted_output_shape) {
EXECUTORCH_PROF_SCOPE(
event_tracer,
"+ArmBackend::execute()handles.output.permute_HWC_to_CHW()");

char* output_address = (char*)output_addr;
permute_HWC_to_CHW(
output_address,
Expand All @@ -285,6 +349,8 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
tensor_out.size(2),
tensor_out.size(3));
} else {
EXECUTORCH_PROF_SCOPE(
event_tracer, "+ArmBackend::execute()handles.output.move()");
for (int j = 0; j < tensor_out.numel(); j++) {
if (tensor_out.scalar_type() == ScalarType::Char) {
char* output_address = (char*)output_addr;
Expand Down
8 changes: 5 additions & 3 deletions devtools/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ cmake_minimum_required(VERSION 3.19)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Location of the flatcc sources under third-party, relative to this directory.
set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)

# Default to C++17 unless a standard has already been selected.
if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()

# FLATCC_EXECUTABLE may be provided by the caller to point at a flatcc binary.
# When it is not set, fall back to the binary inside the flatcc source tree.
if(NOT FLATCC_EXECUTABLE)
  set(FLATCC_EXECUTABLE ${_flatcc_source_dir}/bin/flatcc)
endif()

# Source root directory for executorch.
Expand Down Expand Up @@ -66,7 +68,7 @@ set(FLATCC_DEBUG_CLANG_SANITIZE
OFF
CACHE BOOL ""
)
set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc)

add_subdirectory(${_flatcc_source_dir} ${CMAKE_BINARY_DIR}/third-party/flatcc)

# Fix for "relocation R_X86_64_32 against `.rodata' can not be used when making
Expand Down Expand Up @@ -163,7 +165,7 @@ add_custom_command(
# Note that the flatcc project actually writes its outputs into the source
# tree instead of under the binary directory, and there's no way to change
# that behavior.
${_flatcc_source_dir}/bin/flatcc -cwr -o
${FLATCC_EXECUTABLE} -cwr -o
${_program_schema__include_dir}/executorch/devtools/etdump
${_etdump_schema__srcs}
COMMAND rm -rf ${_etdump_schema_cleanup_paths}
Expand Down
5 changes: 5 additions & 0 deletions examples/arm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,8 @@ generate_bindings_for_kernels(
# Generate the operator library "arm_portable_ops_lib" from portable_kernels,
# with executorch as its dependency.
gen_operators_lib(
  LIB_NAME "arm_portable_ops_lib"
  KERNEL_LIBS portable_kernels
  DEPS executorch
)

# When event tracing is enabled, make every consumer of these targets compile
# with ET_EVENT_TRACER_ENABLED defined. The definition is attached as an
# INTERFACE compile definition rather than as a raw -D compile option.
# NOTE(review): the operator library generated in this file is named
# arm_portable_ops_lib; confirm that portable_ops_lib is the intended target.
if(EXECUTORCH_ENABLE_EVENT_TRACER)
  target_compile_definitions(executorch INTERFACE ET_EVENT_TRACER_ENABLED)
  target_compile_definitions(
    portable_ops_lib INTERFACE ET_EVENT_TRACER_ENABLED
  )
endif()
4 changes: 2 additions & 2 deletions examples/arm/aot_arm_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def get_compile_spec(
target,
system_config="Ethos_U55_High_End_Embedded",
memory_mode="Shared_Sram",
extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate",
extra_flags="--debug-force-regor --output-format=raw --verbose-operators",
)
.set_permute_memory_format(True)
.set_quantize_io(True)
Expand All @@ -286,7 +286,7 @@ def get_compile_spec(
target,
system_config="Ethos_U85_SYS_DRAM_Mid",
memory_mode="Shared_Sram",
extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate",
extra_flags="--output-format=raw --verbose-operators",
)
.set_permute_memory_format(True)
.set_quantize_io(True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@ index b458fc6..8d4bc73 100644

KEEP(*(.eh_frame*))
} > ITCM :rom_exec
@@ -280,7 +280,7 @@ SECTIONS
@@ -280,7 +280,8 @@ SECTIONS
#endif
* (expected_output_data_sec)
* (sec_command_stream, sec_weight_data, sec_input_data)
-
+ *(.got*)
+ *(.rodata*)
* (ethosu_core_in_queue)
* (ethosu_core_out_queue)
Expand Down
45 changes: 40 additions & 5 deletions examples/arm/executor_runner/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ option(SEMIHOSTING "Enable semihosting" OFF)
# Optional overrides for the runner's method and temp allocator pool sizes.
# Both default to OFF.
# NOTE(review): these carry a size rather than a boolean, and presumably OFF
# means "keep the runner's built-in default" - confirm where they are consumed.
option(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" OFF)
option(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE to specify temp alloction pool size" OFF)


if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING})
message(
FATAL_ERROR
Expand Down Expand Up @@ -220,10 +219,8 @@ target_sources(
# Include the target's bare-metal linker script
ethosu_eval_link_options(arm_executor_runner)

# Need whole-archive to ensure C++ ctor's are called - this may be wasteful for
# bin size as we link in a number of other symbols
target_link_libraries(
arm_executor_runner
set(arm_executor_runner_link)
list(APPEND arm_executor_runner_link
extension_runner_util
ethosu_target_init
executorch
Expand All @@ -237,6 +234,44 @@ target_link_libraries(
-Xlinker -Map=arm_executor_runner.map
)

# Event-tracer support for the runner: define ET_EVENT_TRACER_ENABLED and add
# the prebuilt etdump and flatcc runtime archives to the link list.
if(EXECUTORCH_ENABLE_EVENT_TRACER)
  # Use a compile definition instead of passing -D as a raw compile option.
  target_compile_definitions(
    arm_executor_runner PUBLIC ET_EVENT_TRACER_ENABLED
  )

  # etdump static library, imported from the ExecuTorch build directory.
  add_library(etdump STATIC IMPORTED)
  set_property(
    TARGET etdump
    PROPERTY IMPORTED_LOCATION
             "${ET_BUILD_DIR_PATH}/lib/libetdump.a"
  )

  # The flatcc runtime archive is looked up as flatccrt_d for Debug builds and
  # as flatccrt otherwise.
  if(CMAKE_BUILD_TYPE MATCHES "Debug")
    set(FLATCCRT_LIB flatccrt_d)
  else()
    set(FLATCCRT_LIB flatccrt)
  endif()

  add_library(${FLATCCRT_LIB} STATIC IMPORTED)
  set_property(
    TARGET ${FLATCCRT_LIB}
    PROPERTY IMPORTED_LOCATION
             "${ET_BUILD_DIR_PATH}/lib/lib${FLATCCRT_LIB}.a"
  )

  list(APPEND arm_executor_runner_link
       etdump
       ${FLATCCRT_LIB}
  )
endif()

# Link the runner against everything collected in arm_executor_runner_link.
# Whole-archive linking is needed so that C++ constructors get called; the
# downside is a potentially larger binary, since a number of other symbols are
# linked in as well.
target_link_libraries(arm_executor_runner ${arm_executor_runner_link})

# Ask the linker to write a map file named arm_executor_runner.map.
target_link_options(
  arm_executor_runner PUBLIC LINKER:-Map=arm_executor_runner.map
)

# ET headers and generated headers includes
target_include_directories(
arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR}
Expand Down
Loading

0 comments on commit dabb14e

Please sign in to comment.