
Demo backend #1

Closed
wants to merge 8 commits into from
3 changes: 0 additions & 3 deletions .ci/scripts/gather_test_models.py
@@ -93,7 +93,6 @@ def export_models_for_ci() -> dict[str, dict]:
# This is the JSON syntax for configuration matrix used by GitHub
# https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
models = {"include": []}
xnnpack_options = {}
backends = ["portable", "xnnpack"]
for (name, build_tool, backend) in itertools.product(
MODEL_NAME_TO_MODEL.keys(), BUILD_TOOLS.keys(), backends
@@ -119,8 +118,6 @@ def export_models_for_ci() -> dict[str, dict]:
"model": name,
"backend": backend,
"runner": DEFAULT_RUNNERS.get(target_os, "linux.2xlarge"),
# demo_backend_delegation test only supports add_mul model
"demo_backend_delegation": name == "add_mul",
}

# NB: Some model requires much bigger Linux runner to avoid
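For orientation, here is a minimal sketch of one entry this script appends to the {"include": []} matrix after the change. The "model", "backend", and "runner" keys appear in the diff above; the "build-tool" key name and all concrete values are assumptions based on how trunk.yml reads the matrix.

# Illustrative matrix entry, not the output of a real run. Note that the
# "demo_backend_delegation" key no longer exists after this PR.
example_entry = {
    "build-tool": "cmake",      # assumed key name; a key of BUILD_TOOLS
    "model": "add_mul",         # a key of MODEL_NAME_TO_MODEL
    "backend": "portable",      # one of ["portable", "xnnpack"]
    "runner": "linux.2xlarge",  # the DEFAULT_RUNNERS fallback seen above
}
print(example_entry)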
34 changes: 0 additions & 34 deletions .ci/scripts/test.sh
@@ -28,11 +28,6 @@ if [[ -z "${BACKEND:-}" ]]; then
exit 1
fi

DEMO_BACKEND_DELEGATION=$4
if [[ -z "${DEMO_BACKEND_DELEGATION:-}" ]]; then
DEMO_BACKEND_DELEGATION=false
fi

which "${PYTHON_EXECUTABLE}"
# Just set this variable here, it's cheap even if we use buck2
CMAKE_OUTPUT_DIR=cmake-out
@@ -123,30 +118,6 @@ test_model_with_xnnpack() {
fi
}

test_demo_backend_delegation() {
echo "Testing demo backend delegation on AddMul"
"${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "composite"
"${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "partition"
"${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "whole"

# Run test model
if [[ "${BUILD_TOOL}" == "buck2" ]]; then
buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./composite_model.pte"
buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./partition_lowered_model.pte"
buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./whole.pte"
elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
if [[ ! -f ${CMAKE_OUTPUT_DIR}/executor_runner ]]; then
build_cmake_executor_runner
fi
./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./composite_model.pte"
./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./partition_lowered_model.pte"
./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./whole.pte"
else
echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm"
exit 1
fi
}

if [[ "${BACKEND}" == "portable" ]]; then
echo "Testing ${MODEL_NAME} with portable kernels..."
test_model
@@ -167,8 +138,3 @@ else
echo "::endgroup::"
fi
fi

# Test demo backend delegation
if [[ "${DEMO_BACKEND_DELEGATION}" == true ]]; then
test_demo_backend_delegation
fi
38 changes: 27 additions & 11 deletions .github/workflows/trunk.yml
@@ -21,22 +21,15 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: 'true'
submodules: 'false'
- uses: actions/setup-python@v4
with:
python-version: '3.10'
cache: pip
- name: Extract the list of models to test
id: gather-models
run: |
set -eux

source .ci/scripts/utils.sh
# This is a simple Python script but as it tries to import executorch.examples.models,
# it requires a whole bunch of ExecuTorch dependencies on the Docker image
install_pip_dependencies
install_executorch

PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --target-os macos --event "${GITHUB_EVENT_NAME}"

test-models-macos:
@@ -57,14 +50,13 @@

MODEL_NAME=${{ matrix.model }}
BUILD_TOOL=${{ matrix.build-tool }}
XNNPACK_QUANTIZATION=${{ matrix.xnnpack_quantization }}
XNNPACK_DELEGATION=${{ matrix.delegation }}
BACKEND=${{ matrix.backend }}
DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}

# Setup MacOS dependencies as there is no Docker support on MacOS atm
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
# Build and test ExecuTorch
PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${XNNPACK_QUANTIZATION}" "${XNNPACK_DELEGATION}" "${DEMO_BACKEND_DELEGATION}"
PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
popd

test-custom-ops-macos:
@@ -114,3 +106,27 @@ jobs:
# Build and test selective build
PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}"
popd

test-demo-backend-delegation:
name: test-demo-backend-delegation
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
matrix:
include:
- build-tool: buck2
- build-tool: cmake
fail-fast: false
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-clang12
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"

BUILD_TOOL=${{ matrix.build-tool }}
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
# Test demo backend delegation
PYTHON_EXECUTABLE=python bash examples/portable/scripts/test_demo_backend_delegation.sh "${BUILD_TOOL}"
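The new test_demo_backend_delegation.sh script is not included in this diff. Judging from the test_demo_backend_delegation function removed from .ci/scripts/test.sh above, it plausibly performs the following sequence, sketched here in Python; the export module, .pte names, and runner paths are carried over from the deleted shell code, while the wrapper itself is hypothetical.

# Hypothetical rendering of the demo-backend-delegation smoke test.
import subprocess
import sys

def run(cmd):
    # Fail fast on any non-zero exit, like `set -e` in the shell scripts.
    print("+", " ".join(cmd))
    subprocess.run(cmd, check=True)

def main(build_tool="cmake"):
    # Export AddMul three ways, as the deleted shell function did.
    for option in ("composite", "partition", "whole"):
        run([sys.executable, "-m", "examples.portable.scripts.export_and_delegate",
             "--option", option])

    models = ["./composite_model.pte", "./partition_lowered_model.pte", "./whole.pte"]
    if build_tool == "buck2":
        runner = ["buck2", "run", "//examples/portable/executor_runner:executor_runner", "--"]
    elif build_tool == "cmake":
        runner = ["./cmake-out/executor_runner"]  # assumes the runner was built already
    else:
        raise SystemExit(f"Invalid build tool {build_tool}. Only buck2 and cmake are supported")

    for model in models:
        run(runner + ["--model_path", model])

if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else "cmake")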
5 changes: 4 additions & 1 deletion backends/arm/CMakeLists.txt
@@ -20,7 +20,10 @@ set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} )

set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
set(_arm_baremetal_sources
backends/arm/runtime/ArmBackendEthosU.cpp
backends/arm/runtime/VelaBinStream.cpp
)
list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")

add_library(
98 changes: 52 additions & 46 deletions backends/arm/arm_backend.py
@@ -145,8 +145,27 @@ def dbg_tosa_dump(tosa_fb, path):
f.write(js)


# Output to Vela with current file-based compilation
# WARNING: if this changes, the runtime reader also needs to change
# Pack either input or output tensor block, compose the related arrays into
# per-io structs to simplify runtime use.
def vela_bin_pack_io(prefix, data):
ios = struct.pack("<i", len(data[prefix + "_shape"]))
for i in range(len(data[prefix + "_shape"])):
io_shape = data[prefix + "_shape"][i]
io_elem_size = data[prefix + "_elem_size"][i]
io_offset = data[prefix + "_offset"][i]
io_region = data[prefix + "_region"][i]
assert len(io_shape) <= 4
inp_pad = io_shape.tolist() + [0] * (4 - len(io_shape))
io_struct = struct.pack(
"<iiiiiii", *inp_pad, io_elem_size, io_offset, io_region
)
ios += io_struct
return ios


# Output via Vela to binary stream for ArmBackendEthosU
# WARNING: Do not change this without changing VelaBinStream.cpp as that
# function consumes this format and the two need to align.
def vela_compile(tosa_fb):
with tempfile.TemporaryDirectory() as tmpdir:
tosaname = "out.tosa"
@@ -162,65 +181,52 @@ def vela_compile(tosa_fb):

np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
blocks = b""

with np.load(np_path, allow_pickle=False) as data:
# Construct our modified output_blocks with data in a form easily
# digested on the device side
bin_blocks = {"vela_bin_stream": b""}

# copy command data through unmodified
bin_blocks["cmd_data"] = data["cmd_data"].tobytes()

# copy weight data through unmodified
bin_blocks["weight_data"] = data["weight_data"].tobytes()

# Add a block for scratch, inputs and outputs; scratch shape is a 1 element
# array giving us size in bytes so extract this and add a block of 0's.
# Currently we preallocated this on the host to provide SRAM for computation.
if len(data["scratch_shape"][0]) != 1:
raise RuntimeError("Expected scratch to be single array")
block_length = data["scratch_shape"][0].item()
bin_blocks["scratch_data"] = b"\x00" * block_length

# Capture inputs and outputs
bin_blocks["inputs"] = vela_bin_pack_io("input", data)
bin_blocks["outputs"] = vela_bin_pack_io("output", data)

bin_blocks["vela_end_stream"] = b""

# Emit the NPZ regions as:
# - 16 byte block name null terminated string (padded to 16 if name shorter)
# - 4 bytes of int32 block length and 12 bytes of 0's
# - block data (padded to 16 byte alignment at end)
# Repeat for all blocks
for key in data.keys():
for key in bin_blocks.keys():
block_name = bytes(key, "utf8")[:15]
block_name = block_name + b"\x00" * (16 - len(block_name))

block_data = b""
if key in ("input_shape", "output_shape"):
inputs = data[key]
# Encode a struct of int len; and one or more int x,y,z,w shape;
input_struct = struct.pack("<i", len(inputs))
for inp in inputs:
assert len(inp) <= 4
inp_pad = inp.tolist() + [0] * (4 - len(inp))
input_struct = input_struct + struct.pack("<iiii", *inp_pad)
block_data = input_struct
elif key in ("input_offset", "output_offset"):
inputs = data[key]
if key == "output_offset" and len(inputs) > 1:
raise RuntimeError(
"Currently only support one output in Vela ArmBackend"
)
offset_struct = struct.pack("<i", len(inputs))
for inp in inputs:
offset_struct = offset_struct + struct.pack("<i", inp)
block_data = offset_struct
else:
block_data = data[key].tobytes()
# We need the actual unpadded block lengths for hw setup
block_length = len(block_data).to_bytes(16, "little")
# pad block data to multiple of 16 bytes
block_length = struct.pack("<iiii", len(bin_blocks[key]), 0, 0, 0)

# Pad block data to multiple of 16 bytes
block_data = bin_blocks[key]
block_data = block_data + b"\x00" * (15 - (len(block_data) - 1) % 16)

block = block_name + block_length + block_data
blocks = blocks + block

# Add a block for scratch, inputs and outputs
# scratch shape is a 1 element array giving us size in bytes
block_name = bytes("scratch_data", "utf8")[:15]
block_name = block_name + b"\x00" * (16 - len(block_name))
block_length = data["scratch_shape"][0].item()
block_length = block_length + (15 - (block_length - 1) % 16)
block_data = b"\x00" * block_length
block_length = block_length.to_bytes(16, "little")
block = block_name + block_length + block_data
blocks = blocks + block
# TODO are these already in scratch shape? look to be
# input_shape * input_elem_size
# output_shape * output_elem_size
# input_offset and output_offset specify the location these arrays are written from base of scratch

# return 16 byte VELA bin header + blocks + footer
header = bytes("vela_bin_stream", "utf-8") + b"\x00"
footer = bytes("vela_end_stream", "utf-8") + b"\x00"
return header + blocks + footer
return blocks


def dbg_fail(node, tosa_fb, path):
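To make the per-io layout concrete, here is a minimal round-trip sketch of the record format that vela_bin_pack_io produces and the runtime reader must agree with: a little-endian int32 count, then seven int32s per io (four zero-padded shape dims, element size, offset, region). The sample values are made up.

import struct

def unpack_io(buf):
    # Mirror of vela_bin_pack_io: "<i" count, then "<iiiiiii" per io.
    (count,) = struct.unpack_from("<i", buf)
    ios, pos = [], 4
    for _ in range(count):
        *shape, elem_size, offset, region = struct.unpack_from("<iiiiiii", buf, pos)
        pos += 7 * 4
        ios.append({"shape": shape, "elem_size": elem_size,
                    "offset": offset, "region": region})
    return ios

# One made-up 1x4x4x1 int8 tensor at offset 0 in region 1.
packed = struct.pack("<i", 1) + struct.pack("<iiiiiii", 1, 4, 4, 1, 1, 0, 1)
print(unpack_io(packed))
# -> [{'shape': [1, 4, 4, 1], 'elem_size': 1, 'offset': 0, 'region': 1}]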
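Likewise, a sketch of how a consumer such as VelaBinStream.cpp might walk the emitted stream, assuming only the framing described in the comments above: a 16-byte NUL-padded name, an int32 length plus 12 zero bytes, then data padded to a 16-byte multiple, with the empty "vela_bin_stream" and "vela_end_stream" blocks bracketing the payload.

import struct

def read_vela_bin_stream(buf):
    # Walk [16-byte name][int32 length + 12 zero bytes][padded data] blocks.
    blocks, pos = {}, 0
    while pos < len(buf):
        name = buf[pos:pos + 16].rstrip(b"\x00").decode("utf-8")
        (length,) = struct.unpack_from("<i", buf, pos + 16)
        data_start = pos + 32
        blocks[name] = buf[data_start:data_start + length]
        # Skip the data rounded up to the next 16-byte boundary; this matches
        # the b"\x00" * (15 - (len - 1) % 16) padding used on the Python side.
        pos = data_start + (length + 15) // 16 * 16
    assert "vela_bin_stream" in blocks and "vela_end_stream" in blocks
    return blocks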