diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py
index a38cbcf964..d58b001053 100644
--- a/.ci/scripts/gather_test_models.py
+++ b/.ci/scripts/gather_test_models.py
@@ -93,7 +93,6 @@ def export_models_for_ci() -> dict[str, dict]:
     # This is the JSON syntax for configuration matrix used by GitHub
     # https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
     models = {"include": []}
-    xnnpack_options = {}
     backends = ["portable", "xnnpack"]
     for (name, build_tool, backend) in itertools.product(
         MODEL_NAME_TO_MODEL.keys(), BUILD_TOOLS.keys(), backends
@@ -119,8 +118,6 @@ def export_models_for_ci() -> dict[str, dict]:
             "model": name,
             "backend": backend,
             "runner": DEFAULT_RUNNERS.get(target_os, "linux.2xlarge"),
-            # demo_backend_delegation test only supports add_mul model
-            "demo_backend_delegation": name == "add_mul",
         }
 
         # NB: Some model requires much bigger Linux runner to avoid
diff --git a/.ci/scripts/test.sh b/.ci/scripts/test.sh
index 5fd15440c9..6c3466ccaf 100755
--- a/.ci/scripts/test.sh
+++ b/.ci/scripts/test.sh
@@ -28,11 +28,6 @@ if [[ -z "${BACKEND:-}" ]]; then
   exit 1
 fi
 
-DEMO_BACKEND_DELEGATION=$4
-if [[ -z "${DEMO_BACKEND_DELEGATION:-}" ]]; then
-  DEMO_BACKEND_DELEGATION=false
-fi
-
 which "${PYTHON_EXECUTABLE}"
 # Just set this variable here, it's cheap even if we use buck2
 CMAKE_OUTPUT_DIR=cmake-out
@@ -123,30 +118,6 @@ test_model_with_xnnpack() {
   fi
 }
 
-test_demo_backend_delegation() {
-  echo "Testing demo backend delegation on AddMul"
-  "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "composite"
-  "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "partition"
-  "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "whole"
-
-  # Run test model
-  if [[ "${BUILD_TOOL}" == "buck2" ]]; then
-    buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./composite_model.pte"
-    buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./partition_lowered_model.pte"
-    buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./whole.pte"
-  elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
-    if [[ ! -f ${CMAKE_OUTPUT_DIR}/executor_runner ]]; then
-      build_cmake_executor_runner
-    fi
-    ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./composite_model.pte"
-    ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./partition_lowered_model.pte"
-    ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./whole.pte"
-  else
-    echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm"
-    exit 1
-  fi
-}
-
 if [[ "${BACKEND}" == "portable" ]]; then
   echo "Testing ${MODEL_NAME} with portable kernels..."
   test_model
@@ -167,8 +138,3 @@ else
     echo "::endgroup::"
   fi
 fi
-
-# Test demo backend delegation
-if [[ "${DEMO_BACKEND_DELEGATION}" == true ]]; then
-  test_demo_backend_delegation
-fi
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index bbb5279bd7..38e850f801 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -21,22 +21,15 @@ jobs:
     steps:
       - uses: actions/checkout@v3
         with:
-          submodules: 'true'
+          submodules: 'false'
       - uses: actions/setup-python@v4
         with:
           python-version: '3.10'
-          cache: pip
       - name: Extract the list of models to test
         id: gather-models
         run: |
           set -eux
-          source .ci/scripts/utils.sh
-          # This is a simple Python script but as it tries to import executorch.examples.models,
-          # it requires a whole bunch of ExecuTorch dependencies on the Docker image
-          install_pip_dependencies
-          install_executorch
-
           PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --target-os macos --event "${GITHUB_EVENT_NAME}"
 
   test-models-macos:
@@ -57,14 +50,13 @@ jobs:
 
         MODEL_NAME=${{ matrix.model }}
         BUILD_TOOL=${{ matrix.build-tool }}
-        XNNPACK_QUANTIZATION=${{ matrix.xnnpack_quantization }}
-        XNNPACK_DELEGATION=${{ matrix.delegation }}
+        BACKEND=${{ matrix.backend }}
         DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}
 
         # Setup MacOS dependencies as there is no Docker support on MacOS atm
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
         # Build and test xecutorch
-        PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${XNNPACK_QUANTIZATION}" "${XNNPACK_DELEGATION}" "${DEMO_BACKEND_DELEGATION}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
         popd
 
   test-custom-ops-macos:
@@ -114,3 +106,27 @@ jobs:
         # Build and test selective build
         PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}"
         popd
+
+  test-demo-backend-delegation:
+    name: test-demo-backend-delegation
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      matrix:
+        include:
+          - build-tool: buck2
+          - build-tool: cmake
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        BUILD_TOOL=${{ matrix.build-tool }}
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+        # Test demo backend delegation
+        PYTHON_EXECUTABLE=python bash examples/portable/scripts/test_demo_backend_delegation.sh "${BUILD_TOOL}"
diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
index 82f08f486a..212355ffe0 100644
--- a/backends/arm/CMakeLists.txt
+++ b/backends/arm/CMakeLists.txt
@@ -20,7 +20,10 @@ set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
 set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
 include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} )
 
-set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
+set(_arm_baremetal_sources
+    backends/arm/runtime/ArmBackendEthosU.cpp
+    backends/arm/runtime/VelaBinStream.cpp
+)
 list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
 
 add_library(
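For context, the trimmed `export_models_for_ci()` now emits one matrix entry per (model, build tool, backend) combination, with no per-model delegation flag. The following standalone Python sketch shows the shape of the emitted data; the model names, tables, and runner value are illustrative stand-ins, not the script's real contents:

```python
import itertools
import json

# Illustrative stand-ins for the script's real tables
models = ["add_mul", "mv3"]
build_tools = ["buck2", "cmake"]
backends = ["portable", "xnnpack"]

matrix = {"include": []}
for name, build_tool, backend in itertools.product(models, build_tools, backends):
    matrix["include"].append(
        {
            "build-tool": build_tool,
            "model": name,
            "backend": backend,
            "runner": "linux.2xlarge",  # the real script picks per-OS defaults
        }
    )
print(json.dumps(matrix))  # consumed via fromJSON in the workflow
```

Note the `demo_backend_delegation` key is gone: delegation is now covered by the dedicated `test-demo-backend-delegation` job above rather than a per-model flag.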
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 748e60e213..2a4fca05e6 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -145,8 +145,27 @@ def dbg_tosa_dump(tosa_fb, path):
         f.write(js)
 
 
-# Output to Vela with current file-based compilation
-# WARNING: if this changes, the runtime reader also needs to change
+# Pack either input or output tensor block, compose the related arrays into
+# per-io structs to simplify runtime use.
+def vela_bin_pack_io(prefix, data):
+    ios = struct.pack("<i", len(data[prefix + "_shape"]))
+    for i in range(len(data[prefix + "_shape"])):
+        io_shape = data[prefix + "_shape"][i]
+        io_elem_size = data[prefix + "_elem_size"][i]
+        io_offset = data[prefix + "_offset"][i]
+        io_region = data[prefix + "_region"][i]
+        assert len(io_shape) <= 4
+        inp_pad = io_shape.tolist() + [0] * (4 - len(io_shape))
+        io_struct = struct.pack(
+            "<iiiiiii", *inp_pad, io_elem_size, io_offset, io_region
+        )
+        ios += io_struct
+    return ios
+
+
+# Output to Vela with current file-based compilation
+# WARNING: if this changes, the runtime reader also needs to change
 def vela_compile(tosa_fb):
@@ -186,12 +205,6 @@ def vela_compile(tosa_fb):
-            # Capture inputs and outputs
-            if len(data["output_shape"]) > 1:
-                raise RuntimeError(
-                    "Currently only support one output in Vela ArmBackend"
-                )
-            offset_struct = struct.pack("<i", int(data["output_offset"][0]))
-            bin_blocks["output_offset"] = offset_struct
+            # Capture inputs and outputs as composed per-io blocks
+            bin_blocks["inputs"] = vela_bin_pack_io("input", data)
+            bin_blocks["outputs"] = vela_bin_pack_io("output", data)
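For reference, the framing that `vela_compile` emits can be sketched in a few lines of standalone Python. This is an illustration of the wire format only, not the patch's code: `pack_block` is a hypothetical helper, and it assumes the 32-byte `VelaBinBlock` header and seven-int `VelaIO` layout declared in `VelaBinStream.h` below.

```python
import struct

def next_mul_16(n: int) -> int:
    # Same rounding rule the runtime reader uses; n == 0 stays 0.
    return ((n - 1) | 15) + 1

def pack_block(name: bytes, payload: bytes) -> bytes:
    # VelaBinBlock: name[16], uint32 unpadded size, 12 pad bytes,
    # then the payload padded out to a 16-byte boundary.
    assert len(name) < 16
    header = struct.pack("<16sI12x", name, len(payload))
    return header + payload.ljust(next_mul_16(len(payload)), b"\x00")

# One VelaIO is seven little-endian ints: shape[4], elem_size, offset, region.
ios = struct.pack("<i", 1)  # VelaIOs.count
ios += struct.pack("<iiiiiii", 1, 1, 1, 5, 4, 0, 1)  # a [1,1,1,5] int32 tensor

stream = (
    pack_block(b"vela_bin_stream", b"")
    + pack_block(b"inputs", ios)
    + pack_block(b"vela_end_stream", b"")
)
assert len(stream) % 16 == 0  # the property vela_bin_validate checks via the footer
```

A real stream also carries `cmd_data`, `weight_data`, and `scratch_data` blocks between the header and footer markers.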
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -9,9 +9,8 @@
 
-#include <cstring>
-#include <memory>
-#include <vector>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
+#include <executorch/backends/arm/runtime/VelaBinStream.h>
+
 #include <ethosu_driver.h>
 #include <pmu_ethosu.h>
@@ -52,29 +52,14 @@ class ArmBackend final : public PyTorchBackendInterface {
     char* data = (char*)processed->data();
     size_t size = processed->size();
-    char* foot = data + size - 16;
+    char* foot = data + size - sizeof(VelaBinBlock);
 
-    // Header and footer both 16 bit aligned suggest valid structure and we
-    // wont walk off the end of the chunks and segfault
-    if (!((int)data == next_mul_16((uintptr_t)data))) {
-      ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned");
-      return Error::InvalidProgram;
-    }
-    if (!((int)foot == next_mul_16((uintptr_t)foot))) {
-      ET_LOG(Error, "ArmBackend::init: Footer expected to be 16 byte aligned");
-      ET_LOG(
-          Error,
-          "ArmBackend::init: Program expected to be multiple of 16 bytes");
-      return Error::InvalidProgram;
-    }
-    if (!(0 == strncmp(data, "vela_bin_stream", 15))) {
-      ET_LOG(Error, "ArmBackend::init: Binary passed is not a vela_bin_stream");
-      return Error::InvalidProgram;
-    }
-    if (!(0 == strncmp(foot, "vela_end_stream", 15))) {
-      ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream");
+    // Verify format of vela_bin
+    if (vela_bin_validate(data, size) == false) {
+      ET_LOG(Error, "Malformed vela_bin_stream found");
       return Error::InvalidProgram;
     }
+
     // Verify address range is accessible current expectation is the program
     // is wholly stored in SRAM
     // TODO: expect to improve capabilities here by supporting DRAM storage
@@ -108,7 +93,7 @@ class ArmBackend final : public PyTorchBackendInterface {
     char* data = (char*)processed->data();
 
     // Read key sections from the vela_bin_stream
-    if (!this->vela_read(data, &handles, processed->size())) {
+    if (vela_bin_read(data, &handles, processed->size()) == false) {
       ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout");
       return Error::InvalidProgram;
     }
@@ -124,8 +109,9 @@ class ArmBackend final : public PyTorchBackendInterface {
         handles.scratch_data_size);
 
     // Write inputs into SRAM scratch area defined by Vela
-    for (int i = 0; i < handles.input_shapes.size(); i++) {
-      const char* input_addr = handles.scratch_data + handles.input_offset[i];
+    for (int i = 0; i < handles.inputs->count; i++) {
+      const char* input_addr =
+          handles.scratch_data + handles.inputs->io[i].offset;
       // Process input EValue into scratch
       // TODO: Optimise into direct write from Vela into the SRAM or DRAM output
       // for compatible data layouts.
@@ -168,25 +154,17 @@ class ArmBackend final : public PyTorchBackendInterface {
       return Error::InvalidProgram;
     }
 
-    // output data from Ethos U
-    // We only handle one output at the moment
-    const char* output_addr = handles.scratch_data + handles.output_offset[0];
-    // Outputs are in the index immediately after inputs
-    int output_index = handles.input_shapes.size();
-
-    if (handles.output_shapes.size() != 1) {
-      ET_LOG(
-          Error,
-          "ArmBackend::execute: currently only support one return tensor");
-      return Error::InvalidProgram;
-    }
-    // Process results into EValue storage
-    // TODO: optimise into direct write for compatible, contig layout
-    int* output_address = (int*)output_addr;
-    auto tensor_out = args[output_index]->toTensor();
-    for (int j = 0; j < tensor_out.numel(); j++) {
-      // TODO: extend beyond tensors with 4 byte elements
-      tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
+    // Write outputs from scratch into EValue pointers
+    for (int i = 0; i < handles.outputs->count; i++) {
+      const char* output_addr =
+          handles.scratch_data + handles.outputs->io[i].offset;
+      // Process output EValue from scratch
+      int* output_address = (int*)output_addr;
+      // Outputs are in the index immediately after inputs
+      auto tensor_out = args[handles.inputs->count + i]->toTensor();
+      for (int j = 0; j < tensor_out.numel(); j++) {
+        tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
+      }
     }
 
     return Error::Ok;
@@ -195,114 +173,6 @@ class ArmBackend final : public PyTorchBackendInterface {
   void destroy(DelegateHandle* handle) const override {
     return;
   }
-
- private:
-  typedef struct {
-    const char* cmd_data;
-    size_t cmd_data_size;
-    const char* weight_data;
-    size_t weight_data_size;
-    const char* scratch_data;
-    size_t scratch_data_size;
-    vector<int> input_offset;
-    vector<vector<int>> input_shapes;
-    vector<int> output_offset;
-    vector<vector<int>> output_shapes;
-  } VelaHandles;
-
-  typedef struct {
-    char name[16];
-    uint32_t size;
-    char _pad[12];
-    char data[];
-  } VelaBinBlock;
-
-  typedef struct {
-    int count;
-    int shape[][4];
-  } VelaShapes;
-
-  typedef struct {
-    int count;
-    int offsets[];
-  } VelaOffsets;
-
-  static int next_mul_16(int n) {
-    return ((n - 1) | 15) + 1;
-  }
-
-  int vela_read(char* data, VelaHandles* handles, int size) const {
-    constexpr const size_t header_size = 16;
-
-    // Read header string
-    if (strncmp(data, "vela_bin_stream", 15)) {
-      return 0;
-    }
-    data += header_size;
-
-    // Expect one or more 'VelaBinBlock's
-    while (1) {
-      VelaBinBlock* b = (VelaBinBlock*)data;
-      data += sizeof(VelaBinBlock) + next_mul_16(b->size);
-
-      // Exit with success on finding end of stream
-      if (!strncmp(b->name, "vela_end_stream", strlen("vela_end_stream")))
-        return 1;
-
-      if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) {
-        // This magic header confirms a valid command stream in binary
-        if (strncmp(b->data, "COP1", strlen("COP1")))
-          return 0;
-        handles->cmd_data = b->data;
-        handles->cmd_data_size = b->size;
-      }
-      if (!strncmp(b->name, "weight_data", strlen("weight_data"))) {
-        handles->weight_data = b->data;
-        handles->weight_data_size = b->size;
-      }
-      if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) {
-        handles->scratch_data = b->data;
-        handles->scratch_data_size = b->size;
-      }
-
-      // capture inputs and outputs
-      if (!strncmp(b->name, "input_offset", strlen("input_offset"))) {
-        VelaOffsets* offsets = (VelaOffsets*)b->data;
-        for (int i = 0; i < offsets->count; i++) {
-          handles->input_offset.push_back(offsets->offsets[i]);
-        }
-      }
-      if (!strncmp(b->name, "output_offset", strlen("output_offset"))) {
-        VelaOffsets* offsets = (VelaOffsets*)b->data;
-        for (int i = 0; i < offsets->count; i++) {
-          handles->output_offset.push_back(offsets->offsets[i]);
-        }
-      }
-
-      if (!strncmp(b->name, "input_shape", strlen("input_shape"))) {
-        VelaShapes* shapes = (VelaShapes*)b->data;
-        for (int i = 0; i < shapes->count; i++) {
-          vector<int> s = {
-              shapes->shape[i][0],
-              shapes->shape[i][1],
-              shapes->shape[i][2],
-              shapes->shape[i][3]};
-          handles->input_shapes.push_back(s);
-        }
-      }
-      if (!strncmp(b->name, "output_shape", strlen("output_shape"))) {
-        VelaShapes* shapes = (VelaShapes*)b->data;
-        for (int i = 0; i < shapes->count; i++) {
-          vector<int> s = {
-              shapes->shape[i][0],
-              shapes->shape[i][1],
-              shapes->shape[i][2],
-              shapes->shape[i][3]};
-          handles->output_shapes.push_back(s);
-        }
-      }
-    }
-  }
 };
 
 namespace {
diff --git a/backends/arm/runtime/VelaBinStream.cpp b/backends/arm/runtime/VelaBinStream.cpp
new file mode 100644
index 0000000000..f9f8b0aca1
--- /dev/null
+++ b/backends/arm/runtime/VelaBinStream.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2023 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Warning: Do not change this without changing arm_backend.py::vela_compile
+ * as that function emits this format and the two need to align.
+ */
+
+#include <executorch/backends/arm/runtime/VelaBinStream.h>
+
+#include <cstring>
+
+#include <executorch/runtime/core/error.h>
+
+// get next mul of 16 ptr, return n if already aligned
+static uintptr_t next_mul_16(uintptr_t n) {
+  return ((n - 1) | 15) + 1;
+}
+
+bool vela_bin_validate(const char* data, int size) {
+  const char* foot = data + size - sizeof(VelaBinBlock);
+
+  // Check 16 byte alignment
+  if ((uintptr_t)data != next_mul_16((uintptr_t)data))
+    return false;
+  if ((uintptr_t)foot != next_mul_16((uintptr_t)foot))
+    return false;
+  // Check header and footer blocks are the right format
+  if (strncmp(data, "vela_bin_stream", strlen("vela_bin_stream")) != 0)
+    return false;
+  if (strncmp(foot, "vela_end_stream", strlen("vela_end_stream")) != 0)
+    return false;
+
+  return true;
+}
+
+bool vela_bin_read(const char* data, VelaHandles* handles, int size) {
+  const char* ptr = data;
+
+  while (ptr - data < size) {
+    VelaBinBlock* b = (VelaBinBlock*)ptr;
+    ptr += sizeof(VelaBinBlock) + next_mul_16(b->size);
+
+    if (!strncmp(b->name, "vela_bin_stream", strlen("vela_bin_stream"))) {
+      // expect vela_bin_stream first
+      if ((char*)b != (char*)data)
+        return false;
+    } else if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) {
+      // This driver magic header confirms a valid command stream in binary
+      if (strncmp(b->data, "COP1", strlen("COP1")))
+        return false;
+      handles->cmd_data = b->data;
+      handles->cmd_data_size = b->size;
+    } else if (!strncmp(b->name, "weight_data", strlen("weight_data"))) {
+      handles->weight_data = b->data;
+      handles->weight_data_size = b->size;
+    } else if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) {
+      handles->scratch_data = b->data;
+      handles->scratch_data_size = b->size;
+    } else if (!strncmp(b->name, "inputs", strlen("inputs"))) {
+      handles->inputs = (VelaIOs*)b->data;
+    } else if (!strncmp(b->name, "outputs", strlen("outputs"))) {
+      handles->outputs = (VelaIOs*)b->data;
+    } else if (!strncmp(
+                   b->name, "vela_end_stream", strlen("vela_end_stream"))) {
+      // expect vela_end_stream last
+      if (ptr - data != size) {
+        ET_LOG(Error, "Expected vela binary to end with vela_end_stream");
+        return false;
+      }
+      return true;
+    } else {
+      // Unrecognised block name
+      ET_LOG(Error, "Invalid block name or malformed binary");
+      return false;
+    }
+  }
+
+  // We've fallen off the end without finding vela_end_stream
+  return false;
+}
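To make the block walk above concrete, here is a rough Python counterpart to `vela_bin_validate`/`vela_bin_read` — a sketch for reasoning about the format under the same framing assumptions as the packing example earlier, not a drop-in replacement for the C++ reader:

```python
import struct

BLOCK_HEADER = struct.Struct("<16sI12x")  # name, unpadded size, pad

def next_mul_16(n: int) -> int:
    return ((n - 1) | 15) + 1  # n == 0 stays 0 under Python's signed bitwise ops

def parse_vela_bin(buf: bytes) -> dict:
    """Walk the VelaBinBlocks and return {block name: payload bytes}."""
    if buf[:15] != b"vela_bin_stream":
        raise ValueError("missing vela_bin_stream header")
    blocks, ptr = {}, 0
    while ptr < len(buf):
        raw_name, size = BLOCK_HEADER.unpack_from(buf, ptr)
        name = raw_name.rstrip(b"\x00").decode()
        payload = buf[ptr + BLOCK_HEADER.size : ptr + BLOCK_HEADER.size + size]
        ptr += BLOCK_HEADER.size + next_mul_16(size)
        if name == "vela_end_stream":
            if ptr != len(buf):
                raise ValueError("trailing bytes after vela_end_stream")
            return blocks
        blocks[name] = payload
    raise ValueError("vela_end_stream not found")
```

Unlike the C++ reader, this collects every block rather than only the known names, which is convenient when inspecting a stream offline.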
diff --git a/backends/arm/runtime/VelaBinStream.h b/backends/arm/runtime/VelaBinStream.h
new file mode 100644
index 0000000000..96a2409f39
--- /dev/null
+++ b/backends/arm/runtime/VelaBinStream.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2023 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Minimal reading function for vela_bin_stream wire format. This is an
+ * implementation detail of the arm_backend AoT flow and ArmBackendEthosU
+ * and subject to change.
+ * This format captures the command stream, I/O and memory layout data to
+ * enable execution of the command stream on Ethos-U hardware.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+// Standard block name size
+const uint32_t kVelaBlockNameLength = 16;
+
+// Generic block within the vela_bin_stream encoded by the python vela_compile
+// step
+typedef struct {
+  char name[kVelaBlockNameLength]; // string name, can be shorter or truncated
+  uint32_t size; // unpadded size, BinBlock size will be rounded to next_mul_16
+  char _pad[12]; // Our data often needs 16 byte alignment
+  char data[]; // block.name specific format data
+} VelaBinBlock;
+
+// A Vela input or output descriptor in the binary stream
+typedef struct {
+  int shape[4]; // Up to 4D shape of input or output
+  int elem_size; // Element sizeof in bytes
+  int offset; // Offset in bytes within SRAM working data
+  int region; // Scratch region this belongs to
+} VelaIO;
+
+// A list of VelaIOs from the binary stream
+typedef struct {
+  int count;
+  VelaIO io[];
+} VelaIOs;
+
+// Processed data used by the backend to invoke the payload
+typedef struct {
+  const char* cmd_data;
+  size_t cmd_data_size;
+  const char* weight_data;
+  size_t weight_data_size;
+  const char* scratch_data;
+  size_t scratch_data_size;
+  VelaIOs* inputs;
+  VelaIOs* outputs;
+} VelaHandles;
+
+/* Takes in the preprocessed vela_bin_stream wire format and returns data
+ * needed to launch the workload on the Ethos-U and wire up input and
+ * output values.
+ */
+bool vela_bin_read(const char* data, VelaHandles* handles, int size);
+
+/* Does minimal validation of a vela_bin_stream to ensure the overall
+ * structure is correct and so likely to contain valid binary data for launch
+ * on the Ethos-U.
+ */
+bool vela_bin_validate(const char* data, int size);
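Two details of this header are worth pinning down with numbers: the block header is exactly 32 bytes (itself a multiple of 16, so payloads stay 16-byte aligned), and each `VelaIO` is seven 4-byte ints. A small sketch reusing the layout assumptions above; `unpack_ios` is a hypothetical helper, not part of the patch:

```python
import struct

assert struct.calcsize("<16sI12x") == 32  # VelaBinBlock header: name + size + pad
assert struct.calcsize("<iiiiiii") == 28  # VelaIO: shape[4], elem_size, offset, region

def unpack_ios(payload: bytes) -> list:
    """Decode an "inputs"/"outputs" block payload into VelaIO dicts."""
    (count,) = struct.unpack_from("<i", payload, 0)
    ios = []
    for i in range(count):
        s0, s1, s2, s3, elem_size, offset, region = struct.unpack_from(
            "<iiiiiii", payload, 4 + i * 28
        )
        ios.append(
            {
                "shape": (s0, s1, s2, s3),
                "elem_size": elem_size,
                "offset": offset,
                "region": region,
            }
        )
    return ios
```

This mirrors how `execute()` locates tensor `i`: its bytes live at `scratch_data + io[i].offset`.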
diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md
index f61dba1307..76a244ce4a 100644
--- a/docs/source/getting-started-setup.md
+++ b/docs/source/getting-started-setup.md
@@ -179,16 +179,17 @@ To generate a sample program, complete the following steps:
    ```
    :::
 
-   This command has created a `add.pte` file that contains your sample program.
+   This command has created an `add.pte` file that contains your sample program,
+   which adds its inputs multiple times.
 
-Alternatively, you can use a Python interpreter to perform the same action:
+Alternatively, you can use a Python interpreter to perform similar steps, this
+time creating a `mul.pte` program file that multiplies its inputs:
 
 ```python
->>> import executorch.exir as exir
->>> from executorch.exir.tests.models import Mul
->>> m = Mul()
->>> print(exir.capture(m, m.get_random_inputs()).to_edge())
->>> open("mul.pte", "wb").write(exir.capture(m, m.get_random_inputs()).to_edge().to_executorch().buffer)
+import executorch.exir as exir
+from executorch.exir.tests.models import Mul
+m = Mul()
+open("mul.pte", "wb").write(exir.capture(m, m.get_random_inputs()).to_edge().to_executorch().buffer)
 ```
 
 In this step, you learned how you can export your PyTorch program to an ExecuTorch
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index c969f95890..2c557e68c6 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -38,6 +38,34 @@ def forward(self, x):
     can_delegate = True
 
 
+class AddModule2(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y):
+        return x + y
+
+    example_input = (
+        torch.ones(5, dtype=torch.int32),
+        torch.ones(5, dtype=torch.int32),
+    )
+    can_delegate = True
+
+
+class AddModule3(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y):
+        return (x + y, x + x)
+
+    example_input = (
+        torch.ones(5, dtype=torch.int32),
+        torch.ones(5, dtype=torch.int32),
+    )
+    can_delegate = True
+
+
 class SoftmaxModule(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -53,6 +81,8 @@ def forward(self, x):
 
 models = {
     "add": AddModule,
+    "add2": AddModule2,
+    "add3": AddModule3,
     "softmax": SoftmaxModule,
 }
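A quick eager-mode check of the new multi-output module (a standalone sketch that only assumes the `AddModule3` definition above; the delegated path is exercised by `run.sh` below):

```python
import torch

class AddModule3(torch.nn.Module):
    def forward(self, x, y):
        return (x + y, x + x)

x = torch.ones(5, dtype=torch.int32)
y = torch.full((5,), 2, dtype=torch.int32)
out = AddModule3()(x, y)
assert torch.equal(out[0], torch.full((5,), 3, dtype=torch.int32))
assert torch.equal(out[1], torch.full((5,), 2, dtype=torch.int32))
```

Two outputs means the runtime writes back `args[inputs->count + 0]` and `args[inputs->count + 1]`, which is exactly the loop the ArmBackendEthosU change introduces.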
diff --git a/examples/arm/arm_tosa_e2e.py b/examples/arm/arm_tosa_e2e.py
index fa34f179c7..c89e941144 100644
--- a/examples/arm/arm_tosa_e2e.py
+++ b/examples/arm/arm_tosa_e2e.py
@@ -55,12 +55,12 @@ def get_input_quantization_params(captured_model):
     input_scales = {}
     input_zeropoints = {}
     input_names = []
-    for node in captured_model.exported_program.graph.nodes:
+    for node in captured_model.exported_program().graph.nodes:
         if node.op == "placeholder":
             input_names.append(node.name)
             continue
 
-    for node in captured_model.exported_program.graph.nodes:
+    for node in captured_model.exported_program().graph.nodes:
         if (
             node.target
             == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
@@ -78,11 +78,11 @@ def get_output_quantization_param(captured_model):
     output_scale = 0.0
     output_zeropoint = 0
     output_name = ""
-    for node in captured_model.exported_program.graph.nodes:
+    for node in captured_model.exported_program().graph.nodes:
         if node.op == "output":
             output_name = node.args[0][0]
 
-    for node in captured_model.exported_program.graph.nodes:
+    for node in captured_model.exported_program().graph.nodes:
         if (
             node.target
             == exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
@@ -172,7 +172,7 @@ def tosa_run_test(op, profile=TosaProfile.MI):  # noqa: C901
 
     # Export model
     model_capture = export(model, inputs)
-    model_edge = to_edge(model_capture, _EDGE_COMPILE_CONFIG)
+    model_edge = to_edge(model_capture, compile_config=_EDGE_COMPILE_CONFIG)
     ArmPartitioner.compile_spec = compile_spec
 
     if profile == TosaProfile.BI:
@@ -185,9 +185,8 @@ def tosa_run_test(op, profile=TosaProfile.MI):  # noqa: C901
             output_quantization_zp,
         ) = get_output_quantization_param(model_edge)
 
-    model_edge = model_edge.transform(DuplicateDequantNodePass()).to_backend(
-        ArmPartitioner
-    )
+    model_edge = model_edge.transform((DuplicateDequantNodePass(),))
+    model_edge = model_edge.to_backend(ArmPartitioner)
     exec_prog = model_edge.to_executorch()
 
     # Save ground truth results to file
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index c82c491390..44ba9ba0fa 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -34,23 +34,20 @@ fvp_model=FVP_Corstone_SSE-300_Ethos-U55
 toolchain_cmake=${script_dir}/ethos-u-setup/arm-none-eabi-gcc.cmake
 _setup_msg="please refer to ${script_dir}/ethos-u-setup/setup.sh to properly install necessary tools."
 
-
-# Generate the PTE file
+# Generate a pte file
 function generate_pte_file() {
+    [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting model and delegate flag, got: $*"; exit 1; }
+    local model=${1}
+    local delegate=${2}
+
+    local model_filename=${model}.pte
+    if [ "${delegate}" = "--delegate" ]; then
+        model_filename=${model}_arm_delegate.pte
+    fi
     cd $et_root_dir
-    python3 -m examples.arm.aot_arm_compiler --model_name="softmax"
-    local pte_file
-    pte_file="$(realpath ./softmax.pte)"
-    [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; }
-    echo "${pte_file}"
-}
-
-# Generate the ethos delegate PTE file
-function generate_ethos_pte_file() {
-    cd $et_root_dir
-    python3 -m examples.arm.aot_arm_compiler --model_name="add" --delegate 1>&2
+    python3 -m examples.arm.aot_arm_compiler --model_name="${model}" ${delegate} 1>&2
     local pte_file
-    pte_file=$(realpath ./add_arm_delegate.pte)
+    pte_file=$(realpath ${model_filename})
     [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; }
     echo "${pte_file}"
 }
@@ -150,16 +147,17 @@ type ${buck2} 2>&1 > /dev/null \
 # build executorch libraries
 build_executorch
 
-# generate a .pte file - in this case a non-delegated one
-pte=$(generate_pte_file)
-# build and run the runner with a non-delegated .pte
-build_executorch_runner "${pte}"
-run_fvp executor_runner.elf
-
-# generate a pte with an ArmBackend delegate
-pte_delegate=$(generate_ethos_pte_file)
-# build and run the same app with a delegated .pte
-build_executorch_runner "${pte_delegate}"
-run_fvp executor_runner.elf
+# the test models to run, and whether to delegate
+test_model=( "softmax" "add" "add3" )
+test_delegate=( "" "--delegate" "--delegate" )
+
+# loop over running the AoT flow and executing the model on device
+for i in "${!test_model[@]}"; do
+    printf "Running e2e flow for model '%s' with flags '%s'\n" "${test_model[i]}" "${test_delegate[i]}"
+    pte=$(generate_pte_file "${test_model[i]}" "${test_delegate[i]}")
+    # Rebuild the application as the pte is imported as a header/c array
+    build_executorch_runner "${pte}"
+    run_fvp executor_runner.elf
+done
 
 exit 0
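The split of `transform(...)` and `to_backend(...)` into two statements above follows the `to_edge` API, where `transform` takes a tuple of passes and returns a new `EdgeProgramManager`. In isolation the flow looks roughly like this (a sketch: `MyModule` is a stand-in, and the pass/partitioner lines are commented out because they depend on the Arm-specific imports used in this file):

```python
import torch
from executorch.exir import to_edge

class MyModule(torch.nn.Module):  # stand-in model
    def forward(self, x):
        return x + x

exported = torch.export.export(MyModule(), (torch.ones(5),))
edge = to_edge(exported)
# edge = edge.transform((DuplicateDequantNodePass(),))  # passes first
# edge = edge.to_backend(ArmPartitioner)                # then delegation
with open("my_module.pte", "wb") as f:
    f.write(edge.to_executorch().buffer)
```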
diff --git a/sdk/etrecord/_etrecord.py b/sdk/etrecord/_etrecord.py
index fb8e68eac6..ce86d984f2 100644
--- a/sdk/etrecord/_etrecord.py
+++ b/sdk/etrecord/_etrecord.py
@@ -69,7 +69,10 @@ def _handle_multi_method_exported_program(
 def _handle_export_module(
     etrecord_zip: ZipFile,
     export_module: Union[
-        MultiMethodExirExportedProgram, ExirExportedProgram, EdgeProgramManager
+        MultiMethodExirExportedProgram,
+        ExirExportedProgram,
+        EdgeProgramManager,
+        ExportedProgram,
     ],
     module_name: str,
 ) -> None:
@@ -79,6 +82,8 @@ def _handle_export_module(
         _handle_exported_program(
             etrecord_zip, module_name, "forward", export_module.exported_program
         )
+    elif isinstance(export_module, ExportedProgram):
+        _handle_exported_program(etrecord_zip, module_name, "forward", export_module)
     elif isinstance(
         export_module,
         (EdgeProgramManager, exir.program._program.EdgeProgramManager),
@@ -140,7 +145,7 @@ def generate_etrecord(
         executorch_program: `ExecutorchProgramManager` for this model returned by the call to `to_executorch()`
         export_modules[Optional]: **Should be ignored by OSS users**. A dictionary of graph modules with the key being the user provided name
             and the value being the corresponding exported module. The exported graph modules can be either the
-            output of `capture()` or `to_edge()`.
+            output of `torch.export()` or `exir.to_edge()`.
 
     Returns:
         None
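With this change, a plain `torch.export()` output can be handed to `generate_etrecord` alongside the edge and ExecuTorch programs. A minimal sketch of the intended call, under stated assumptions: the model, file names, and module-name key are illustrative, the import path reflects the `sdk/` location in this diff, and the deep copy follows the usual ETRecord pattern of snapshotting the edge program before `to_executorch()`:

```python
import copy

import torch
from executorch.exir import to_edge
from executorch.sdk.etrecord import generate_etrecord  # sdk/ path at the time of this diff

class MulModule(torch.nn.Module):  # stand-in model
    def forward(self, x, y):
        return x * y

inputs = (torch.ones(4), torch.full((4,), 3.0))
exported = torch.export.export(MulModule(), inputs)

edge = to_edge(exported)
edge_copy = copy.deepcopy(edge)  # snapshot before to_executorch()
executorch_prog = edge.to_executorch()

generate_etrecord(
    "etrecord.bin",
    edge_copy,
    executorch_prog,
    export_modules={"aten_dialect": exported},  # now accepts a raw ExportedProgram
)
```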