
Demo backend #1

Closed
wants to merge 8 commits into from
3 changes: 0 additions & 3 deletions .ci/scripts/gather_test_models.py
@@ -93,7 +93,6 @@ def export_models_for_ci() -> dict[str, dict]:
# This is the JSON syntax for configuration matrix used by GitHub
# https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
models = {"include": []}
xnnpack_options = {}
backends = ["portable", "xnnpack"]
for (name, build_tool, backend) in itertools.product(
MODEL_NAME_TO_MODEL.keys(), BUILD_TOOLS.keys(), backends
@@ -119,8 +118,6 @@ def export_models_for_ci() -> dict[str, dict]:
"model": name,
"backend": backend,
"runner": DEFAULT_RUNNERS.get(target_os, "linux.2xlarge"),
# demo_backend_delegation test only supports add_mul model
"demo_backend_delegation": name == "add_mul",
}

# NB: Some model requires much bigger Linux runner to avoid
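For orientation, here is a minimal sketch of one entry this script appends to the {"include": []} matrix after the change. The "model", "backend", and "runner" keys appear in the diff above; the "build-tool" key name and all concrete values are assumptions based on how trunk.yml reads the matrix.

# Illustrative matrix entry, not the output of a real run. Note that the
# "demo_backend_delegation" key no longer exists after this PR.
example_entry = {
    "build-tool": "cmake",      # assumed key name; a key of BUILD_TOOLS
    "model": "add_mul",         # a key of MODEL_NAME_TO_MODEL
    "backend": "portable",      # one of ["portable", "xnnpack"]
    "runner": "linux.2xlarge",  # the DEFAULT_RUNNERS fallback seen above
}
print(example_entry)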
34 changes: 0 additions & 34 deletions .ci/scripts/test.sh
@@ -28,11 +28,6 @@ if [[ -z "${BACKEND:-}" ]]; then
exit 1
fi

DEMO_BACKEND_DELEGATION=$4
if [[ -z "${DEMO_BACKEND_DELEGATION:-}" ]]; then
DEMO_BACKEND_DELEGATION=false
fi

which "${PYTHON_EXECUTABLE}"
# Just set this variable here, it's cheap even if we use buck2
CMAKE_OUTPUT_DIR=cmake-out
@@ -123,30 +118,6 @@ test_model_with_xnnpack() {
fi
}

test_demo_backend_delegation() {
echo "Testing demo backend delegation on AddMul"
"${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "composite"
"${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "partition"
"${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export_and_delegate --option "whole"

# Run test model
if [[ "${BUILD_TOOL}" == "buck2" ]]; then
buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./composite_model.pte"
buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./partition_lowered_model.pte"
buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./whole.pte"
elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
if [[ ! -f ${CMAKE_OUTPUT_DIR}/executor_runner ]]; then
build_cmake_executor_runner
fi
./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./composite_model.pte"
./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./partition_lowered_model.pte"
./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./whole.pte"
else
echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm"
exit 1
fi
}

if [[ "${BACKEND}" == "portable" ]]; then
echo "Testing ${MODEL_NAME} with portable kernels..."
test_model
@@ -167,8 +138,3 @@ else
echo "::endgroup::"
fi
fi

# Test demo backend delegation
if [[ "${DEMO_BACKEND_DELEGATION}" == true ]]; then
test_demo_backend_delegation
fi
38 changes: 27 additions & 11 deletions .github/workflows/trunk.yml
@@ -21,22 +21,15 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: 'true'
submodules: 'false'
- uses: actions/setup-python@v4
with:
python-version: '3.10'
cache: pip
- name: Extract the list of models to test
id: gather-models
run: |
set -eux

source .ci/scripts/utils.sh
# This is a simple Python script but as it tries to import executorch.examples.models,
# it requires a whole bunch of ExecuTorch dependencies on the Docker image
install_pip_dependencies
install_executorch

PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --target-os macos --event "${GITHUB_EVENT_NAME}"

test-models-macos:
@@ -57,14 +50,13 @@

MODEL_NAME=${{ matrix.model }}
BUILD_TOOL=${{ matrix.build-tool }}
XNNPACK_QUANTIZATION=${{ matrix.xnnpack_quantization }}
XNNPACK_DELEGATION=${{ matrix.delegation }}
BACKEND=${{ matrix.backend }}
DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}

# Setup MacOS dependencies as there is no Docker support on MacOS atm
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
# Build and test ExecuTorch
PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${XNNPACK_QUANTIZATION}" "${XNNPACK_DELEGATION}" "${DEMO_BACKEND_DELEGATION}"
PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
popd

test-custom-ops-macos:
@@ -114,3 +106,27 @@ jobs:
# Build and test selective build
PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}"
popd

test-demo-backend-delegation:
name: test-demo-backend-delegation
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
matrix:
include:
- build-tool: buck2
- build-tool: cmake
fail-fast: false
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-clang12
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"

BUILD_TOOL=${{ matrix.build-tool }}
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
# Test demo backend delegation
PYTHON_EXECUTABLE=python bash examples/portable/scripts/test_demo_backend_delegation.sh "${BUILD_TOOL}"
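The new test_demo_backend_delegation.sh script is not included in this diff. Judging from the test_demo_backend_delegation function removed from .ci/scripts/test.sh above, it plausibly performs the following sequence, sketched here in Python; the export module, .pte names, and runner paths are carried over from the deleted shell code, while the wrapper itself is hypothetical.

# Hypothetical rendering of the demo-backend-delegation smoke test.
import subprocess
import sys

def run(cmd):
    # Fail fast on any non-zero exit, like `set -e` in the shell scripts.
    print("+", " ".join(cmd))
    subprocess.run(cmd, check=True)

def main(build_tool="cmake"):
    # Export AddMul three ways, as the deleted shell function did.
    for option in ("composite", "partition", "whole"):
        run([sys.executable, "-m", "examples.portable.scripts.export_and_delegate",
             "--option", option])

    models = ["./composite_model.pte", "./partition_lowered_model.pte", "./whole.pte"]
    if build_tool == "buck2":
        runner = ["buck2", "run", "//examples/portable/executor_runner:executor_runner", "--"]
    elif build_tool == "cmake":
        runner = ["./cmake-out/executor_runner"]  # assumes the runner was built already
    else:
        raise SystemExit(f"Invalid build tool {build_tool}. Only buck2 and cmake are supported")

    for model in models:
        run(runner + ["--model_path", model])

if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else "cmake")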
5 changes: 4 additions & 1 deletion backends/arm/CMakeLists.txt
@@ -20,7 +20,10 @@ set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} )

set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
set(_arm_baremetal_sources
backends/arm/runtime/ArmBackendEthosU.cpp
backends/arm/runtime/VelaBinStream.cpp
)
list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")

add_library(
98 changes: 52 additions & 46 deletions backends/arm/arm_backend.py
@@ -145,8 +145,27 @@ def dbg_tosa_dump(tosa_fb, path):
f.write(js)


# Output to Vela with current file-based compilation
# WARNING: if this changes, the runtime reader also needs to change
# Pack either input or output tensor block, compose the related arrays into
# per-io structs to simplify runtime use.
def vela_bin_pack_io(prefix, data):
ios = struct.pack("<i", len(data[prefix + "_shape"]))
for i in range(len(data[prefix + "_shape"])):
io_shape = data[prefix + "_shape"][i]
io_elem_size = data[prefix + "_elem_size"][i]
io_offset = data[prefix + "_offset"][i]
io_region = data[prefix + "_region"][i]
assert len(io_shape) <= 4
inp_pad = io_shape.tolist() + [0] * (4 - len(io_shape))
io_struct = struct.pack(
"<iiiiiii", *inp_pad, io_elem_size, io_offset, io_region
)
ios += io_struct
return ios


# Output via Vela to binary stream for ArmBackendEthosU
# WARNING: Do not change this without changing VelaBinStream.cpp as that
# function consumes this format and the two need to align.
def vela_compile(tosa_fb):
with tempfile.TemporaryDirectory() as tmpdir:
tosaname = "out.tosa"
@@ -162,65 +181,52 @@ def vela_compile(tosa_fb):

np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
blocks = b""

with np.load(np_path, allow_pickle=False) as data:
# Construct our modified output_blocks with data in a form easily
# digested on the device side
bin_blocks = {"vela_bin_stream": b""}

# copy command data through unmodified
bin_blocks["cmd_data"] = data["cmd_data"].tobytes()

# copy weight data through unmodified
bin_blocks["weight_data"] = data["weight_data"].tobytes()

# Add a block for scratch, inputs and outputs; scratch shape is a 1 element
# array giving us size in bytes so extract this and add a block of 0's.
# Currently we preallocated this on the host to provide SRAM for computation.
if len(data["scratch_shape"][0]) != 1:
raise RuntimeError("Expected scratch to be single array")
block_length = data["scratch_shape"][0].item()
bin_blocks["scratch_data"] = b"\x00" * block_length

# Capture inputs and outputs
bin_blocks["inputs"] = vela_bin_pack_io("input", data)
bin_blocks["outputs"] = vela_bin_pack_io("output", data)

bin_blocks["vela_end_stream"] = b""

# Emit the NPZ regions as:
# - 16 byte block name null terminated string (padded to 16 if name shorter)
# - 4 bytes of int32 block length and 12 bytes of 0's
# - block data (padded to 16 byte alignment at end)
# Repeat for all blocks
for key in data.keys():
for key in bin_blocks.keys():
block_name = bytes(key, "utf8")[:15]
block_name = block_name + b"\x00" * (16 - len(block_name))

block_data = b""
if key in ("input_shape", "output_shape"):
inputs = data[key]
# Encode a struct of int len; and one or more int x,y,z,w shape;
input_struct = struct.pack("<i", len(inputs))
for inp in inputs:
assert len(inp) <= 4
inp_pad = inp.tolist() + [0] * (4 - len(inp))
input_struct = input_struct + struct.pack("<iiii", *inp_pad)
block_data = input_struct
elif key in ("input_offset", "output_offset"):
inputs = data[key]
if key == "output_offset" and len(inputs) > 1:
raise RuntimeError(
"Currently only support one output in Vela ArmBackend"
)
offset_struct = struct.pack("<i", len(inputs))
for inp in inputs:
offset_struct = offset_struct + struct.pack("<i", inp)
block_data = offset_struct
else:
block_data = data[key].tobytes()
# We need the actual unpadded block lengths for hw setup
block_length = len(block_data).to_bytes(16, "little")
# pad block data to multiple of 16 bytes
block_length = struct.pack("<iiii", len(bin_blocks[key]), 0, 0, 0)

# Pad block data to multiple of 16 bytes
block_data = bin_blocks[key]
block_data = block_data + b"\x00" * (15 - (len(block_data) - 1) % 16)

block = block_name + block_length + block_data
blocks = blocks + block

# Add a block for scratch, inputs and outputs
# scratch shape is a 1 element array giving us size in bytes
block_name = bytes("scratch_data", "utf8")[:15]
block_name = block_name + b"\x00" * (16 - len(block_name))
block_length = data["scratch_shape"][0].item()
block_length = block_length + (15 - (block_length - 1) % 16)
block_data = b"\x00" * block_length
block_length = block_length.to_bytes(16, "little")
block = block_name + block_length + block_data
blocks = blocks + block
# TODO are these already in scratch shape? look to be
# input_shape * input_elem_size
# output_shape * output_elem_size
# input_offset and output_offset specify the location these arrays are written from base of scratch

# return 16 byte VELA bin header + blocks + footer
header = bytes("vela_bin_stream", "utf-8") + b"\x00"
footer = bytes("vela_end_stream", "utf-8") + b"\x00"
return header + blocks + footer
return blocks


def dbg_fail(node, tosa_fb, path):
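To make the per-io layout concrete, here is a minimal round-trip sketch of the record format that vela_bin_pack_io produces and the runtime reader must agree with: a little-endian int32 count, then seven int32s per io (four zero-padded shape dims, element size, offset, region). The sample values are made up.

import struct

def unpack_io(buf):
    # Mirror of vela_bin_pack_io: "<i" count, then "<iiiiiii" per io.
    (count,) = struct.unpack_from("<i", buf)
    ios, pos = [], 4
    for _ in range(count):
        *shape, elem_size, offset, region = struct.unpack_from("<iiiiiii", buf, pos)
        pos += 7 * 4
        ios.append({"shape": shape, "elem_size": elem_size,
                    "offset": offset, "region": region})
    return ios

# One made-up 1x4x4x1 int8 tensor at offset 0 in region 1.
packed = struct.pack("<i", 1) + struct.pack("<iiiiiii", 1, 4, 4, 1, 1, 0, 1)
print(unpack_io(packed))
# -> [{'shape': [1, 4, 4, 1], 'elem_size': 1, 'offset': 0, 'region': 1}]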
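Likewise, a sketch of how a consumer such as VelaBinStream.cpp might walk the emitted stream, assuming only the framing described in the comments above: a 16-byte NUL-padded name, an int32 length plus 12 zero bytes, then data padded to a 16-byte multiple, with the empty "vela_bin_stream" and "vela_end_stream" blocks bracketing the payload.

import struct

def read_vela_bin_stream(buf):
    # Walk [16-byte name][int32 length + 12 zero bytes][padded data] blocks.
    blocks, pos = {}, 0
    while pos < len(buf):
        name = buf[pos:pos + 16].rstrip(b"\x00").decode("utf-8")
        (length,) = struct.unpack_from("<i", buf, pos + 16)
        data_start = pos + 32
        blocks[name] = buf[data_start:data_start + length]
        # Skip the data rounded up to the next 16-byte boundary; this matches
        # the b"\x00" * (15 - (len - 1) % 16) padding used on the Python side.
        pos = data_start + (length + 15) // 16 * 16
    assert "vela_bin_stream" in blocks and "vela_end_stream" in blocks
    return blocks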