diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py
index a38cbcf964..d58b001053 100644
--- a/.ci/scripts/gather_test_models.py
+++ b/.ci/scripts/gather_test_models.py
@@ -93,7 +93,6 @@ def export_models_for_ci() -> dict[str, dict]:
     # This is the JSON syntax for configuration matrix used by GitHub
     # https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
     models = {"include": []}
-    xnnpack_options = {}
     backends = ["portable", "xnnpack"]
     for (name, build_tool, backend) in itertools.product(
         MODEL_NAME_TO_MODEL.keys(), BUILD_TOOLS.keys(), backends
@@ -119,8 +118,6 @@ def export_models_for_ci() -> dict[str, dict]:
             "model": name,
             "backend": backend,
             "runner": DEFAULT_RUNNERS.get(target_os, "linux.2xlarge"),
-            # demo_backend_delegation test only supports add_mul model
-            "demo_backend_delegation": name == "add_mul",
         }
 
         # NB: Some model requires much bigger Linux runner to avoid
diff --git a/.ci/scripts/test.sh b/.ci/scripts/test.sh
index 5fd15440c9..6c3466ccaf 100755
--- a/.ci/scripts/test.sh
+++ b/.ci/scripts/test.sh
@@ -28,11 +28,6 @@ if [[ -z "${BACKEND:-}" ]]; then
   exit 1
 fi
 
-DEMO_BACKEND_DELEGATION=$4
-if [[ -z "${DEMO_BACKEND_DELEGATION:-}" ]]; then
-  DEMO_BACKEND_DELEGATION=false
-fi
-
 which "${PYTHON_EXECUTABLE}"
 # Just set this variable here, it's cheap even if we use buck2
 CMAKE_OUTPUT_DIR=cmake-out
@@ -123,30 +118,6 @@ test_model_with_xnnpack() {
   fi
 }
 
-test_demo_backend_delegation() {
-  echo "Testing demo backend delegation on AddMul"
-  "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "composite"
-  "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "partition"
-  "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "whole"
-
-  # Run test model
-  if [[ "${BUILD_TOOL}" == "buck2" ]]; then
-    buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./composite_model.pte"
-    buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./partition_lowered_model.pte"
-    buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./whole.pte"
-  elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
-    if [[ ! -f ${CMAKE_OUTPUT_DIR}/executor_runner ]]; then
-      build_cmake_executor_runner
-    fi
-    ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./composite_model.pte"
-    ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./partition_lowered_model.pte"
-    ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./whole.pte"
-  else
-    echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm"
-    exit 1
-  fi
-}
-
 if [[ "${BACKEND}" == "portable" ]]; then
   echo "Testing ${MODEL_NAME} with portable kernels..."
   test_model
@@ -167,8 +138,3 @@ else
     echo "::endgroup::"
   fi
 fi
-
-# Test demo backend delegation
-if [[ "${DEMO_BACKEND_DELEGATION}" == true ]]; then
-  test_demo_backend_delegation
-fi
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index bbb5279bd7..38e850f801 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -21,22 +21,15 @@ jobs:
     steps:
       - uses: actions/checkout@v3
         with:
-          submodules: 'true'
+          submodules: 'false'
       - uses: actions/setup-python@v4
         with:
           python-version: '3.10'
-          cache: pip
       - name: Extract the list of models to test
         id: gather-models
         run: |
           set -eux
-          source .ci/scripts/utils.sh
-          # This is a simple Python script but as it tries to import executorch.examples.models,
-          # it requires a whole bunch of ExecuTorch dependencies on the Docker image
-          install_pip_dependencies
-          install_executorch
-
           PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --target-os macos --event "${GITHUB_EVENT_NAME}"
 
   test-models-macos:
@@ -57,14 +50,13 @@ jobs:
 
         MODEL_NAME=${{ matrix.model }}
         BUILD_TOOL=${{ matrix.build-tool }}
-        XNNPACK_QUANTIZATION=${{ matrix.xnnpack_quantization }}
-        XNNPACK_DELEGATION=${{ matrix.delegation }}
+        BACKEND=${{ matrix.backend }}
         DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}
 
         # Setup MacOS dependencies as there is no Docker support on MacOS atm
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
         # Build and test xecutorch
-        PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${XNNPACK_QUANTIZATION}" "${XNNPACK_DELEGATION}" "${DEMO_BACKEND_DELEGATION}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
         popd
 
   test-custom-ops-macos:
@@ -114,3 +106,27 @@ jobs:
         # Build and test selective build
         PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}"
         popd
+
+  test-demo-backend-delegation:
+    name: test-demo-backend-delegation
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      matrix:
+        include:
+          - build-tool: buck2
+          - build-tool: cmake
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        BUILD_TOOL=${{ matrix.build-tool }}
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+        # Test demo backend delegation
+        PYTHON_EXECUTABLE=python bash examples/portable/scripts/test_demo_backend_delegation.sh "${BUILD_TOOL}"
diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
index 82f08f486a..212355ffe0 100644
--- a/backends/arm/CMakeLists.txt
+++ b/backends/arm/CMakeLists.txt
@@ -20,7 +20,10 @@ set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
 set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
 include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} )
 
-set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
+set(_arm_baremetal_sources
+    backends/arm/runtime/ArmBackendEthosU.cpp
+    backends/arm/runtime/VelaBinStream.cpp
+)
 list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
 
 add_library(
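For context, the trimmed `export_models_for_ci()` now emits one matrix entry per (model, build tool, backend) combination, with no per-model delegation flag. The following standalone Python sketch shows the shape of the emitted data; the model names, tables, and runner value are illustrative stand-ins, not the script's real contents:

```python
import itertools
import json

# Illustrative stand-ins for the script's real tables
models = ["add_mul", "mv3"]
build_tools = ["buck2", "cmake"]
backends = ["portable", "xnnpack"]

matrix = {"include": []}
for name, build_tool, backend in itertools.product(models, build_tools, backends):
    matrix["include"].append(
        {
            "build-tool": build_tool,
            "model": name,
            "backend": backend,
            "runner": "linux.2xlarge",  # the real script picks per-OS defaults
        }
    )
print(json.dumps(matrix))  # consumed via fromJSON in the workflow
```

Note the `demo_backend_delegation` key is gone: delegation is now covered by the dedicated `test-demo-backend-delegation` job above rather than a per-model flag.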
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 748e60e213..2a4fca05e6 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -145,8 +145,27 @@ def dbg_tosa_dump(tosa_fb, path):
         f.write(js)
 
 
-# Output to Vela with current file-based compilation
-# WARNING: if this changes, the runtime reader also needs to change
+# Pack either input or output tensor block, compose the related arrays into
+# per-io structs to simplify runtime use.
+def vela_bin_pack_io(prefix, data):
+    ios = struct.pack("<i", len(data[prefix + "_shape"]))
+    for i in range(len(data[prefix + "_shape"])):
+        io_shape = data[prefix + "_shape"][i]
+        io_elem_size = data[prefix + "_elem_size"][i]
+        io_offset = data[prefix + "_offset"][i]
+        io_region = data[prefix + "_region"][i]
+        assert len(io_shape) <= 4
+        inp_pad = io_shape.tolist() + [0] * (4 - len(io_shape))
+        io_struct = struct.pack(
+            "<iiiiiii", *inp_pad, io_elem_size, io_offset, io_region
+        )
+        ios += io_struct
+    return ios
+
+
+# Output to Vela with current file-based compilation
+# WARNING: if this changes, the runtime reader also needs to change
 def vela_compile(tosa_fb):
@@ -186,12 +205,6 @@ def vela_compile(tosa_fb):
-            # Capture inputs and outputs
-            if len(data["output_shape"]) > 1:
-                raise RuntimeError(
-                    "Currently only support one output in Vela ArmBackend"
-                )
-            offset_struct = struct.pack("<i", int(data["output_offset"][0]))
-            bin_blocks["output_offset"] = offset_struct
+            # Capture inputs and outputs as composed per-io blocks
+            bin_blocks["inputs"] = vela_bin_pack_io("input", data)
+            bin_blocks["outputs"] = vela_bin_pack_io("output", data)
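For reference, the framing that `vela_compile` emits can be sketched in a few lines of standalone Python. This is an illustration of the wire format only, not the patch's code: `pack_block` is a hypothetical helper, and it assumes the 32-byte `VelaBinBlock` header and seven-int `VelaIO` layout declared in `VelaBinStream.h` below.

```python
import struct

def next_mul_16(n: int) -> int:
    # Same rounding rule the runtime reader uses; n == 0 stays 0.
    return ((n - 1) | 15) + 1

def pack_block(name: bytes, payload: bytes) -> bytes:
    # VelaBinBlock: name[16], uint32 unpadded size, 12 pad bytes,
    # then the payload padded out to a 16-byte boundary.
    assert len(name) < 16
    header = struct.pack("<16sI12x", name, len(payload))
    return header + payload.ljust(next_mul_16(len(payload)), b"\x00")

# One VelaIO is seven little-endian ints: shape[4], elem_size, offset, region.
ios = struct.pack("<i", 1)  # VelaIOs.count
ios += struct.pack("<iiiiiii", 1, 1, 1, 5, 4, 0, 1)  # a [1,1,1,5] int32 tensor

stream = (
    pack_block(b"vela_bin_stream", b"")
    + pack_block(b"inputs", ios)
    + pack_block(b"vela_end_stream", b"")
)
assert len(stream) % 16 == 0  # the property vela_bin_validate checks via the footer
```

A real stream also carries `cmd_data`, `weight_data`, and `scratch_data` blocks between the header and footer markers.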
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -9,9 +9,8 @@
 
-#include <cstring>
-#include <memory>
-#include <vector>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
+#include <executorch/backends/arm/runtime/VelaBinStream.h>
+
 #include <ethosu_driver.h>
 #include <pmu_ethosu.h>
@@ -52,29 +52,14 @@ class ArmBackend final : public PyTorchBackendInterface {
     char* data = (char*)processed->data();
     size_t size = processed->size();
-    char* foot = data + size - 16;
+    char* foot = data + size - sizeof(VelaBinBlock);
 
-    // Header and footer both 16 bit aligned suggest valid structure and we
-    // wont walk off the end of the chunks and segfault
-    if (!((int)data == next_mul_16((uintptr_t)data))) {
-      ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned");
-      return Error::InvalidProgram;
-    }
-    if (!((int)foot == next_mul_16((uintptr_t)foot))) {
-      ET_LOG(Error, "ArmBackend::init: Footer expected to be 16 byte aligned");
-      ET_LOG(
-          Error,
-          "ArmBackend::init: Program expected to be multiple of 16 bytes");
-      return Error::InvalidProgram;
-    }
-    if (!(0 == strncmp(data, "vela_bin_stream", 15))) {
-      ET_LOG(Error, "ArmBackend::init: Binary passed is not a vela_bin_stream");
-      return Error::InvalidProgram;
-    }
-    if (!(0 == strncmp(foot, "vela_end_stream", 15))) {
-      ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream");
+    // Verify format of vela_bin
+    if (vela_bin_validate(data, size) == false) {
+      ET_LOG(Error, "Malformed vela_bin_stream found");
       return Error::InvalidProgram;
     }
+
     // Verify address range is accessible current expectation is the program
     // is wholly stored in SRAM
     // TODO: expect to improve capabilities here by supporting DRAM storage
@@ -108,7 +93,7 @@ class ArmBackend final : public PyTorchBackendInterface {
     char* data = (char*)processed->data();
 
     // Read key sections from the vela_bin_stream
-    if (!this->vela_read(data, &handles, processed->size())) {
+    if (vela_bin_read(data, &handles, processed->size()) == false) {
       ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout");
       return Error::InvalidProgram;
     }
@@ -124,8 +109,9 @@ class ArmBackend final : public PyTorchBackendInterface {
         handles.scratch_data_size);
 
     // Write inputs into SRAM scratch area defined by Vela
-    for (int i = 0; i < handles.input_shapes.size(); i++) {
-      const char* input_addr = handles.scratch_data + handles.input_offset[i];
+    for (int i = 0; i < handles.inputs->count; i++) {
+      const char* input_addr =
+          handles.scratch_data + handles.inputs->io[i].offset;
       // Process input EValue into scratch
       // TODO: Optimise into direct write from Vela into the SRAM or DRAM output
       // for compatible data layouts.
@@ -168,25 +154,17 @@ class ArmBackend final : public PyTorchBackendInterface {
       return Error::InvalidProgram;
     }
 
-    // output data from Ethos U
-    // We only handle one output at the moment
-    const char* output_addr = handles.scratch_data + handles.output_offset[0];
-    // Outputs are in the index immediately after inputs
-    int output_index = handles.input_shapes.size();
-
-    if (handles.output_shapes.size() != 1) {
-      ET_LOG(
-          Error,
-          "ArmBackend::execute: currently only support one return tensor");
-      return Error::InvalidProgram;
-    }
-    // Process results into EValue storage
-    // TODO: optimise into direct write for compatible, contig layout
-    int* output_address = (int*)output_addr;
-    auto tensor_out = args[output_index]->toTensor();
-    for (int j = 0; j < tensor_out.numel(); j++) {
-      // TODO: extend beyond tensors with 4 byte elements
-      tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
+    // Write outputs from scratch into EValue pointers
+    for (int i = 0; i < handles.outputs->count; i++) {
+      const char* output_addr =
+          handles.scratch_data + handles.outputs->io[i].offset;
+      // Process output EValue from scratch
+      int* output_address = (int*)output_addr;
+      // Outputs are in the index immediately after inputs
+      auto tensor_out = args[handles.inputs->count + i]->toTensor();
+      for (int j = 0; j < tensor_out.numel(); j++) {
+        tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
+      }
     }
 
     return Error::Ok;
@@ -195,114 +173,6 @@ class ArmBackend final : public PyTorchBackendInterface {
   void destroy(DelegateHandle* handle) const override {
     return;
   }
-
- private:
-  typedef struct {
-    const char* cmd_data;
-    size_t cmd_data_size;
-    const char* weight_data;
-    size_t weight_data_size;
-    const char* scratch_data;
-    size_t scratch_data_size;
-    vector<int> input_offset;
-    vector<vector<int>> input_shapes;
-    vector<int> output_offset;
-    vector<vector<int>> output_shapes;
-  } VelaHandles;
-
-  typedef struct {
-    char name[16];
-    uint32_t size;
-    char _pad[12];
-    char data[];
-  } VelaBinBlock;
-
-  typedef struct {
-    int count;
-    int shape[][4];
-  } VelaShapes;
-
-  typedef struct {
-    int count;
-    int offsets[];
-  } VelaOffsets;
-
-  static int next_mul_16(int n) {
-    return ((n - 1) | 15) + 1;
-  }
-
-  int vela_read(char* data, VelaHandles* handles, int size) const {
-    constexpr const size_t header_size = 16;
-
-    // Read header string
-    if (strncmp(data, "vela_bin_stream", 15)) {
-      return 0;
-    }
-    data += header_size;
-
-    // Expect one or more 'VelaBinBlock's
-    while (1) {
-      VelaBinBlock* b = (VelaBinBlock*)data;
-      data += sizeof(VelaBinBlock) + next_mul_16(b->size);
-
-      // Exit with success on finding end of stream
-      if (!strncmp(b->name, "vela_end_stream", strlen("vela_end_stream")))
-        return 1;
-
-      if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) {
-        // This magic header confirms a valid command stream in binary
-        if (strncmp(b->data, "COP1", strlen("COP1")))
-          return 0;
-        handles->cmd_data = b->data;
-        handles->cmd_data_size = b->size;
-      }
-      if (!strncmp(b->name, "weight_data", strlen("weight_data"))) {
-        handles->weight_data = b->data;
-        handles->weight_data_size = b->size;
-      }
-      if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) {
-        handles->scratch_data = b->data;
-        handles->scratch_data_size = b->size;
-      }
-
-      // capture inputs and outputs
-      if (!strncmp(b->name, "input_offset", strlen("input_offset"))) {
-        VelaOffsets* offsets = (VelaOffsets*)b->data;
-        for (int i = 0; i < offsets->count; i++) {
-          handles->input_offset.push_back(offsets->offsets[i]);
-        }
-      }
-      if (!strncmp(b->name, "output_offset", strlen("output_offset"))) {
-        VelaOffsets* offsets = (VelaOffsets*)b->data;
-        for (int i = 0; i < offsets->count; i++) {
-          handles->output_offset.push_back(offsets->offsets[i]);
-        }
-      }
-
-      if (!strncmp(b->name, "input_shape", strlen("input_shape"))) {
-        VelaShapes* shapes = (VelaShapes*)b->data;
-        for (int i = 0; i < shapes->count; i++) {
-          vector<int> s = {
-              shapes->shape[i][0],
-              shapes->shape[i][1],
-              shapes->shape[i][2],
-              shapes->shape[i][3]};
-          handles->input_shapes.push_back(s);
-        }
-      }
-      if (!strncmp(b->name, "output_shape", strlen("output_shape"))) {
-        VelaShapes* shapes = (VelaShapes*)b->data;
-        for (int i = 0; i < shapes->count; i++) {
-          vector<int> s = {
-              shapes->shape[i][0],
-              shapes->shape[i][1],
-              shapes->shape[i][2],
-              shapes->shape[i][3]};
-          handles->output_shapes.push_back(s);
-        }
-      }
-    }
-  }
 };
 
 namespace {
diff --git a/backends/arm/runtime/VelaBinStream.cpp b/backends/arm/runtime/VelaBinStream.cpp
new file mode 100644
index 0000000000..f9f8b0aca1
--- /dev/null
+++ b/backends/arm/runtime/VelaBinStream.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2023 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Warning: Do not change this without changing arm_backend.py::vela_compile
+ * as that function emits this format and the two need to align.
+ */
+
+#include <executorch/backends/arm/runtime/VelaBinStream.h>
+
+#include <cstring>
+
+#include <executorch/runtime/core/error.h>
+
+// get next mul of 16 ptr, return n if already aligned
+static uintptr_t next_mul_16(uintptr_t n) {
+  return ((n - 1) | 15) + 1;
+}
+
+bool vela_bin_validate(const char* data, int size) {
+  const char* foot = data + size - sizeof(VelaBinBlock);
+
+  // Check 16 byte alignment
+  if ((uintptr_t)data != next_mul_16((uintptr_t)data))
+    return false;
+  if ((uintptr_t)foot != next_mul_16((uintptr_t)foot))
+    return false;
+  // Check header and footer blocks are the right format
+  if (strncmp(data, "vela_bin_stream", strlen("vela_bin_stream")) != 0)
+    return false;
+  if (strncmp(foot, "vela_end_stream", strlen("vela_end_stream")) != 0)
+    return false;
+
+  return true;
+}
+
+bool vela_bin_read(const char* data, VelaHandles* handles, int size) {
+  const char* ptr = data;
+
+  while (ptr - data < size) {
+    VelaBinBlock* b = (VelaBinBlock*)ptr;
+    ptr += sizeof(VelaBinBlock) + next_mul_16(b->size);
+
+    if (!strncmp(b->name, "vela_bin_stream", strlen("vela_bin_stream"))) {
+      // expect vela_bin_stream first
+      if ((char*)b != (char*)data)
+        return false;
+    } else if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) {
+      // This driver magic header confirms a valid command stream in binary
+      if (strncmp(b->data, "COP1", strlen("COP1")))
+        return false;
+      handles->cmd_data = b->data;
+      handles->cmd_data_size = b->size;
+    } else if (!strncmp(b->name, "weight_data", strlen("weight_data"))) {
+      handles->weight_data = b->data;
+      handles->weight_data_size = b->size;
+    } else if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) {
+      handles->scratch_data = b->data;
+      handles->scratch_data_size = b->size;
+    } else if (!strncmp(b->name, "inputs", strlen("inputs"))) {
+      handles->inputs = (VelaIOs*)b->data;
+    } else if (!strncmp(b->name, "outputs", strlen("outputs"))) {
+      handles->outputs = (VelaIOs*)b->data;
+    } else if (!strncmp(
+                   b->name, "vela_end_stream", strlen("vela_end_stream"))) {
+      // expect vela_end_stream last
+      if (ptr - data != size) {
+        ET_LOG(Error, "Expected vela binary to end with vela_end_stream");
+        return false;
+      }
+      return true;
+    } else {
+      // Unrecognised block name
+      ET_LOG(Error, "Invalid block name or malformed binary");
+      return false;
+    }
+  }
+
+  // We've fallen off the end without finding vela_end_stream
+  return false;
+}
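To make the block walk above concrete, here is a rough Python counterpart to `vela_bin_validate`/`vela_bin_read` — a sketch for reasoning about the format under the same framing assumptions as the packing example earlier, not a drop-in replacement for the C++ reader:

```python
import struct

BLOCK_HEADER = struct.Struct("<16sI12x")  # name, unpadded size, pad

def next_mul_16(n: int) -> int:
    return ((n - 1) | 15) + 1  # n == 0 stays 0 under Python's signed bitwise ops

def parse_vela_bin(buf: bytes) -> dict:
    """Walk the VelaBinBlocks and return {block name: payload bytes}."""
    if buf[:15] != b"vela_bin_stream":
        raise ValueError("missing vela_bin_stream header")
    blocks, ptr = {}, 0
    while ptr < len(buf):
        raw_name, size = BLOCK_HEADER.unpack_from(buf, ptr)
        name = raw_name.rstrip(b"\x00").decode()
        payload = buf[ptr + BLOCK_HEADER.size : ptr + BLOCK_HEADER.size + size]
        ptr += BLOCK_HEADER.size + next_mul_16(size)
        if name == "vela_end_stream":
            if ptr != len(buf):
                raise ValueError("trailing bytes after vela_end_stream")
            return blocks
        blocks[name] = payload
    raise ValueError("vela_end_stream not found")
```

Unlike the C++ reader, this collects every block rather than only the known names, which is convenient when inspecting a stream offline.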
diff --git a/backends/arm/runtime/VelaBinStream.h b/backends/arm/runtime/VelaBinStream.h
new file mode 100644
index 0000000000..96a2409f39
--- /dev/null
+++ b/backends/arm/runtime/VelaBinStream.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2023 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Minimal reading function for vela_bin_stream wire format. This is an
+ * implementation detail of the arm_backend AoT flow and ArmBackendEthosU
+ * and subject to change.
+ * This format captures the command stream, I/O and memory layout data to
+ * enable execution of the command stream on Ethos-U hardware.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+// Standard block name size
+const uint32_t kVelaBlockNameLength = 16;
+
+// Generic block within the vela_bin_stream encoded by the python vela_compile
+// step
+typedef struct {
+  char name[kVelaBlockNameLength]; // string name, can be shorter or truncated
+  uint32_t size; // unpadded size, BinBlock size will be rounded to next_mul_16
+  char _pad[12]; // Our data often needs 16 byte alignment
+  char data[]; // block.name specific format data
+} VelaBinBlock;
+
+// A Vela input or output descriptor in the binary stream
+typedef struct {
+  int shape[4]; // Up to 4D shape of input or output
+  int elem_size; // Element sizeof in bytes
+  int offset; // Offset in bytes within SRAM working data
+  int region; // Scratch region this belongs to
+} VelaIO;
+
+// A list of VelaIOs from the binary stream
+typedef struct {
+  int count;
+  VelaIO io[];
+} VelaIOs;
+
+// Processed data used by the backend to invoke the payload
+typedef struct {
+  const char* cmd_data;
+  size_t cmd_data_size;
+  const char* weight_data;
+  size_t weight_data_size;
+  const char* scratch_data;
+  size_t scratch_data_size;
+  VelaIOs* inputs;
+  VelaIOs* outputs;
+} VelaHandles;
+
+/* Takes in the preprocessed vela_bin_stream wire format and returns data
+ * needed to launch the workload on the Ethos-U and wire up input and
+ * output values.
+ */
+bool vela_bin_read(const char* data, VelaHandles* handles, int size);
+
+/* Does minimal validation of a vela_bin_stream to ensure the overall
+ * structure is correct and so likely to contain valid binary data for launch
+ * on the Ethos-U.
+ */
+bool vela_bin_validate(const char* data, int size);
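Two details of this header are worth pinning down with numbers: the block header is exactly 32 bytes (itself a multiple of 16, so payloads stay 16-byte aligned), and each `VelaIO` is seven 4-byte ints. A small sketch reusing the layout assumptions above; `unpack_ios` is a hypothetical helper, not part of the patch:

```python
import struct

assert struct.calcsize("<16sI12x") == 32  # VelaBinBlock header: name + size + pad
assert struct.calcsize("<iiiiiii") == 28  # VelaIO: shape[4], elem_size, offset, region

def unpack_ios(payload: bytes) -> list:
    """Decode an "inputs"/"outputs" block payload into VelaIO dicts."""
    (count,) = struct.unpack_from("<i", payload, 0)
    ios = []
    for i in range(count):
        s0, s1, s2, s3, elem_size, offset, region = struct.unpack_from(
            "<iiiiiii", payload, 4 + i * 28
        )
        ios.append(
            {
                "shape": (s0, s1, s2, s3),
                "elem_size": elem_size,
                "offset": offset,
                "region": region,
            }
        )
    return ios
```

This mirrors how `execute()` locates tensor `i`: its bytes live at `scratch_data + io[i].offset`.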
diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md
index f61dba1307..76a244ce4a 100644
--- a/docs/source/getting-started-setup.md
+++ b/docs/source/getting-started-setup.md
@@ -179,16 +179,17 @@ To generate a sample program, complete the following steps:
    ```
    :::
 
-   This command has created a `add.pte` file that contains your sample program.
+   This command has created an `add.pte` file that contains your sample program,
+   which adds its inputs multiple times.
 
-Alternatively, you can use a Python interpreter to perform the same action:
+Alternatively, you can use a Python interpreter to perform similar steps, this
+time creating a `mul.pte` program file that multiplies its inputs:
 
 ```python
->>> import executorch.exir as exir
->>> from executorch.exir.tests.models import Mul
->>> m = Mul()
->>> print(exir.capture(m, m.get_random_inputs()).to_edge())
->>> open("mul.pte", "wb").write(exir.capture(m, m.get_random_inputs()).to_edge().to_executorch().buffer)
+import executorch.exir as exir
+from executorch.exir.tests.models import Mul
+m = Mul()
+open("mul.pte", "wb").write(exir.capture(m, m.get_random_inputs()).to_edge().to_executorch().buffer)
 ```
 
 In this step, you learned how you can export your PyTorch program to an ExecuTorch
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index c969f95890..2c557e68c6 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -38,6 +38,34 @@ def forward(self, x):
     can_delegate = True
 
 
+class AddModule2(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y):
+        return x + y
+
+    example_input = (
+        torch.ones(5, dtype=torch.int32),
+        torch.ones(5, dtype=torch.int32),
+    )
+    can_delegate = True
+
+
+class AddModule3(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y):
+        return (x + y, x + x)
+
+    example_input = (
+        torch.ones(5, dtype=torch.int32),
+        torch.ones(5, dtype=torch.int32),
+    )
+    can_delegate = True
+
+
 class SoftmaxModule(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -53,6 +81,8 @@ def forward(self, x):
 
 models = {
     "add": AddModule,
+    "add2": AddModule2,
+    "add3": AddModule3,
     "softmax": SoftmaxModule,
 }
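A quick eager-mode check of the new multi-output module (a standalone sketch that only assumes the `AddModule3` definition above; the delegated path is exercised by `run.sh` below):

```python
import torch

class AddModule3(torch.nn.Module):
    def forward(self, x, y):
        return (x + y, x + x)

x = torch.ones(5, dtype=torch.int32)
y = torch.full((5,), 2, dtype=torch.int32)
out = AddModule3()(x, y)
assert torch.equal(out[0], torch.full((5,), 3, dtype=torch.int32))
assert torch.equal(out[1], torch.full((5,), 2, dtype=torch.int32))
```

Two outputs means the runtime writes back `args[inputs->count + 0]` and `args[inputs->count + 1]`, which is exactly the loop the ArmBackendEthosU change introduces.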
diff --git a/examples/arm/arm_tosa_e2e.py b/examples/arm/arm_tosa_e2e.py
index fa34f179c7..c89e941144 100644
--- a/examples/arm/arm_tosa_e2e.py
+++ b/examples/arm/arm_tosa_e2e.py
@@ -55,12 +55,12 @@ def get_input_quantization_params(captured_model):
     input_scales = {}
     input_zeropoints = {}
     input_names = []
-    for node in captured_model.exported_program.graph.nodes:
+    for node in captured_model.exported_program().graph.nodes:
         if node.op == "placeholder":
             input_names.append(node.name)
             continue
 
-    for node in captured_model.exported_program.graph.nodes:
+    for node in captured_model.exported_program().graph.nodes:
         if (
             node.target
             == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
@@ -78,11 +78,11 @@ def get_output_quantization_param(captured_model):
     output_scale = 0.0
     output_zeropoint = 0
     output_name = ""
-    for node in captured_model.exported_program.graph.nodes:
+    for node in captured_model.exported_program().graph.nodes:
         if node.op == "output":
             output_name = node.args[0][0]
 
-    for node in captured_model.exported_program.graph.nodes:
+    for node in captured_model.exported_program().graph.nodes:
         if (
             node.target
             == exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
@@ -172,7 +172,7 @@ def tosa_run_test(op, profile=TosaProfile.MI):  # noqa: C901
 
     # Export model
     model_capture = export(model, inputs)
-    model_edge = to_edge(model_capture, _EDGE_COMPILE_CONFIG)
+    model_edge = to_edge(model_capture, compile_config=_EDGE_COMPILE_CONFIG)
     ArmPartitioner.compile_spec = compile_spec
 
     if profile == TosaProfile.BI:
@@ -185,9 +185,8 @@ def tosa_run_test(op, profile=TosaProfile.MI):  # noqa: C901
             output_quantization_zp,
         ) = get_output_quantization_param(model_edge)
 
-    model_edge = model_edge.transform(DuplicateDequantNodePass()).to_backend(
-        ArmPartitioner
-    )
+    model_edge = model_edge.transform((DuplicateDequantNodePass(),))
+    model_edge = model_edge.to_backend(ArmPartitioner)
     exec_prog = model_edge.to_executorch()
 
     # Save ground truth results to file
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index c82c491390..44ba9ba0fa 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -34,23 +34,20 @@ fvp_model=FVP_Corstone_SSE-300_Ethos-U55
 toolchain_cmake=${script_dir}/ethos-u-setup/arm-none-eabi-gcc.cmake
 _setup_msg="please refer to ${script_dir}/ethos-u-setup/setup.sh to properly install necessary tools."
 
-
-# Generate the PTE file
+# Generate a pte file
 function generate_pte_file() {
+    [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting model and delegate flag, got: $*"; exit 1; }
+    local model=${1}
+    local delegate=${2}
+
+    local model_filename=${model}.pte
+    if [ "${delegate}" = "--delegate" ]; then
+        model_filename=${model}_arm_delegate.pte
+    fi
     cd $et_root_dir
-    python3 -m examples.arm.aot_arm_compiler --model_name="softmax"
-    local pte_file
-    pte_file="$(realpath ./softmax.pte)"
-    [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; }
-    echo "${pte_file}"
-}
-
-# Generate the ethos delegate PTE file
-function generate_ethos_pte_file() {
-    cd $et_root_dir
-    python3 -m examples.arm.aot_arm_compiler --model_name="add" --delegate 1>&2
+    python3 -m examples.arm.aot_arm_compiler --model_name="${model}" ${delegate} 1>&2
     local pte_file
-    pte_file=$(realpath ./add_arm_delegate.pte)
+    pte_file=$(realpath ${model_filename})
     [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; }
     echo "${pte_file}"
 }
@@ -150,16 +147,17 @@ type ${buck2} 2>&1 > /dev/null \
 # build executorch libraries
 build_executorch
 
-# generate a .pte file - in this case a non-delegated one
-pte=$(generate_pte_file)
-# build and run the runner with a non-delegated .pte
-build_executorch_runner "${pte}"
-run_fvp executor_runner.elf
-
-# generate a pte with an ArmBackend delegate
-pte_delegate=$(generate_ethos_pte_file)
-# build and run the same app with a delegated .pte
-build_executorch_runner "${pte_delegate}"
-run_fvp executor_runner.elf
+# the test models to run, and whether to delegate
+test_model=( "softmax" "add" "add3" )
+test_delegate=( "" "--delegate" "--delegate" )
+
+# loop over running the AoT flow and executing the model on device
+for i in "${!test_model[@]}"; do
+    printf "Running e2e flow for model '%s' with flags '%s'\n" "${test_model[i]}" "${test_delegate[i]}"
+    pte=$(generate_pte_file "${test_model[i]}" "${test_delegate[i]}")
+    # Rebuild the application as the pte is imported as a header/c array
+    build_executorch_runner "${pte}"
+    run_fvp executor_runner.elf
+done
 
 exit 0
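The split of `transform(...)` and `to_backend(...)` into two statements above follows the `to_edge` API, where `transform` takes a tuple of passes and returns a new `EdgeProgramManager`. In isolation the flow looks roughly like this (a sketch: `MyModule` is a stand-in, and the pass/partitioner lines are commented out because they depend on the Arm-specific imports used in this file):

```python
import torch
from executorch.exir import to_edge

class MyModule(torch.nn.Module):  # stand-in model
    def forward(self, x):
        return x + x

exported = torch.export.export(MyModule(), (torch.ones(5),))
edge = to_edge(exported)
# edge = edge.transform((DuplicateDequantNodePass(),))  # passes first
# edge = edge.to_backend(ArmPartitioner)                # then delegation
with open("my_module.pte", "wb") as f:
    f.write(edge.to_executorch().buffer)
```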
diff --git a/sdk/etrecord/_etrecord.py b/sdk/etrecord/_etrecord.py
index fb8e68eac6..ce86d984f2 100644
--- a/sdk/etrecord/_etrecord.py
+++ b/sdk/etrecord/_etrecord.py
@@ -69,7 +69,10 @@ def _handle_multi_method_exported_program(
 def _handle_export_module(
     etrecord_zip: ZipFile,
     export_module: Union[
-        MultiMethodExirExportedProgram, ExirExportedProgram, EdgeProgramManager
+        MultiMethodExirExportedProgram,
+        ExirExportedProgram,
+        EdgeProgramManager,
+        ExportedProgram,
     ],
     module_name: str,
 ) -> None:
@@ -79,6 +82,8 @@ def _handle_export_module(
         _handle_exported_program(
             etrecord_zip, module_name, "forward", export_module.exported_program
         )
+    elif isinstance(export_module, ExportedProgram):
+        _handle_exported_program(etrecord_zip, module_name, "forward", export_module)
     elif isinstance(
         export_module,
         (EdgeProgramManager, exir.program._program.EdgeProgramManager),
@@ -140,7 +145,7 @@ def generate_etrecord(
         executorch_program: `ExecutorchProgramManager` for this model returned by the call to `to_executorch()`
         export_modules[Optional]: **Should be ignored by OSS users**. A dictionary of graph modules with the key being the user provided name
             and the value being the corresponding exported module. The exported graph modules can be either the
-            output of `capture()` or `to_edge()`.
+            output of `torch.export()` or `exir.to_edge()`.
 
     Returns:
         None
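With this change, a plain `torch.export()` output can be handed to `generate_etrecord` alongside the edge and ExecuTorch programs. A minimal sketch of the intended call, under stated assumptions: the model, file names, and module-name key are illustrative, the import path reflects the `sdk/` location in this diff, and the deep copy follows the usual ETRecord pattern of snapshotting the edge program before `to_executorch()`:

```python
import copy

import torch
from executorch.exir import to_edge
from executorch.sdk.etrecord import generate_etrecord  # sdk/ path at the time of this diff

class MulModule(torch.nn.Module):  # stand-in model
    def forward(self, x, y):
        return x * y

inputs = (torch.ones(4), torch.full((4,), 3.0))
exported = torch.export.export(MulModule(), inputs)

edge = to_edge(exported)
edge_copy = copy.deepcopy(edge)  # snapshot before to_executorch()
executorch_prog = edge.to_executorch()

generate_etrecord(
    "etrecord.bin",
    edge_copy,
    executorch_prog,
    export_modules={"aten_dialect": exported},  # now accepts a raw ExportedProgram
)
```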