Merge remote-tracking branch 'origin/main' into demo-backend
kirklandsign committed Oct 24, 2023
2 parents 2ac0a2d + 1998177 commit 97a2250
Showing 12 changed files with 309 additions and 253 deletions.
1 change: 0 additions & 1 deletion .ci/scripts/gather_test_models.py
@@ -93,7 +93,6 @@ def export_models_for_ci() -> dict[str, dict]:
# This is the JSON syntax for configuration matrix used by GitHub
# https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
models = {"include": []}
xnnpack_options = {}
backends = ["portable", "xnnpack"]
for (name, build_tool, backend) in itertools.product(
MODEL_NAME_TO_MODEL.keys(), BUILD_TOOLS.keys(), backends
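The hunk above is truncated by the diff view, but it shows the pattern used to build the CI job matrix: the script takes the cross product of model names, build tools and backends and emits a JSON "include" list for GitHub Actions. A minimal standalone sketch of that pattern follows; the table contents are placeholder assumptions rather than the script's real entries (the script imports its model list via executorch.examples.models), and the record keys simply mirror the matrix.model / matrix.build-tool / matrix.backend references in trunk.yml below.

import itertools
import json

# Placeholder tables -- the entries here are assumptions for illustration only.
MODEL_NAME_TO_MODEL = {"mv2": "MobileNetV2", "ic3": "InceptionV3"}
BUILD_TOOLS = {"buck2": "all", "cmake": "all"}
backends = ["portable", "xnnpack"]

models = {"include": []}
for name, build_tool, backend in itertools.product(
    MODEL_NAME_TO_MODEL.keys(), BUILD_TOOLS.keys(), backends
):
    models["include"].append(
        {"model": name, "build-tool": build_tool, "backend": backend}
    )

# GitHub Actions reads this JSON as the matrix strategy for the test jobs.
print(json.dumps(models))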
14 changes: 3 additions & 11 deletions .github/workflows/trunk.yml
@@ -21,22 +21,15 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: 'true'
submodules: 'false'
- uses: actions/setup-python@v4
with:
python-version: '3.10'
cache: pip
- name: Extract the list of models to test
id: gather-models
run: |
set -eux
source .ci/scripts/utils.sh
# This is a simple Python script but as it tries to import executorch.examples.models,
# it requires a whole bunch of ExecuTorch dependencies on the Docker image
install_pip_dependencies
install_executorch
PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --target-os macos --event "${GITHUB_EVENT_NAME}"
test-models-macos:
@@ -57,14 +50,13 @@ jobs:
MODEL_NAME=${{ matrix.model }}
BUILD_TOOL=${{ matrix.build-tool }}
XNNPACK_QUANTIZATION=${{ matrix.xnnpack_quantization }}
XNNPACK_DELEGATION=${{ matrix.delegation }}
BACKEND=${{ matrix.backend }}
DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}
# Setup MacOS dependencies as there is no Docker support on MacOS atm
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
# Build and test executorch
PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${XNNPACK_QUANTIZATION}" "${XNNPACK_DELEGATION}" "${DEMO_BACKEND_DELEGATION}"
PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
popd
test-custom-ops-macos:
5 changes: 4 additions & 1 deletion backends/arm/CMakeLists.txt
@@ -20,7 +20,10 @@ set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} )

set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
set(_arm_baremetal_sources
backends/arm/runtime/ArmBackendEthosU.cpp
backends/arm/runtime/VelaBinStream.cpp
)
list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")

add_library(
98 changes: 52 additions & 46 deletions backends/arm/arm_backend.py
@@ -145,8 +145,27 @@ def dbg_tosa_dump(tosa_fb, path):
f.write(js)


# Output to Vela with current file-based compilation
# WARNING: if this changes, the runtime reader also needs to change
# Pack either input or output tensor block, compose the related arrays into
# per-io structs to simplify runtime use.
def vela_bin_pack_io(prefix, data):
ios = struct.pack("<i", len(data[prefix + "_shape"]))
for i in range(len(data[prefix + "_shape"])):
io_shape = data[prefix + "_shape"][i]
io_elem_size = data[prefix + "_elem_size"][i]
io_offset = data[prefix + "_offset"][i]
io_region = data[prefix + "_region"][i]
assert len(io_shape) <= 4
inp_pad = io_shape.tolist() + [0] * (4 - len(io_shape))
io_struct = struct.pack(
"<iiiiiii", *inp_pad, io_elem_size, io_offset, io_region
)
ios += io_struct
return ios


# Output via Vela to binary stream for ArmBackendEthosU
# WARNING: Do not change this without changing VelaBinStream.cpp as that
# function consumes this format and the two need to align.
def vela_compile(tosa_fb):
with tempfile.TemporaryDirectory() as tmpdir:
tosaname = "out.tosa"
@@ -162,65 +181,52 @@ def vela_compile(tosa_fb):

np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
blocks = b""

with np.load(np_path, allow_pickle=False) as data:
# Construct our modified output_blocks with data in a form easily
# digested on the device side
bin_blocks = {"vela_bin_stream": b""}

# copy command data through unmodified
bin_blocks["cmd_data"] = data["cmd_data"].tobytes()

# copy weight data through unmodified
bin_blocks["weight_data"] = data["weight_data"].tobytes()

# Add a block for scratch, inputs and outputs; scratch shape is a 1 element
# array giving us size in bytes so extract this and add a block of 0's.
# Currently we preallocate this on the host to provide SRAM for computation.
if len(data["scratch_shape"][0]) != 1:
raise RuntimeError("Expected scratch to be single array")
block_length = data["scratch_shape"][0].item()
bin_blocks["scratch_data"] = b"\x00" * block_length

# Capture inputs and outputs
bin_blocks["inputs"] = vela_bin_pack_io("input", data)
bin_blocks["outputs"] = vela_bin_pack_io("output", data)

bin_blocks["vela_end_stream"] = b""

# Emit the NPZ regions as:
# - 16 byte block name null terminated string (padded to 16 if name shorter)
# - 4 bytes of int32 block length and 12 bytes of 0's
# - block data (padded to 16 byte alignment at end)
# Repeat for all blocks
for key in data.keys():
for key in bin_blocks.keys():
block_name = bytes(key, "utf8")[:15]
block_name = block_name + b"\x00" * (16 - len(block_name))

block_data = b""
if key in ("input_shape", "output_shape"):
inputs = data[key]
# Encode a struct of int len; and one or more int x,y,z,w shape;
input_struct = struct.pack("<i", len(inputs))
for inp in inputs:
assert len(inp) <= 4
inp_pad = inp.tolist() + [0] * (4 - len(inp))
input_struct = input_struct + struct.pack("<iiii", *inp_pad)
block_data = input_struct
elif key in ("input_offset", "output_offset"):
inputs = data[key]
if key == "output_offset" and len(inputs) > 1:
raise RuntimeError(
"Currently only support one output in Vela ArmBackend"
)
offset_struct = struct.pack("<i", len(inputs))
for inp in inputs:
offset_struct = offset_struct + struct.pack("<i", inp)
block_data = offset_struct
else:
block_data = data[key].tobytes()
# We need the actual unpadded block lengths for hw setup
block_length = len(block_data).to_bytes(16, "little")
# pad block data to multiple of 16 bytes
block_length = struct.pack("<iiii", len(bin_blocks[key]), 0, 0, 0)

# Pad block data to multiple of 16 bytes
block_data = bin_blocks[key]
block_data = block_data + b"\x00" * (15 - (len(block_data) - 1) % 16)

block = block_name + block_length + block_data
blocks = blocks + block

# Add a block for scratch, inputs and outputs
# scratch shape is a 1 element array giving us size in bytes
block_name = bytes("scratch_data", "utf8")[:15]
block_name = block_name + b"\x00" * (16 - len(block_name))
block_length = data["scratch_shape"][0].item()
block_length = block_length + (15 - (block_length - 1) % 16)
block_data = b"\x00" * block_length
block_length = block_length.to_bytes(16, "little")
block = block_name + block_length + block_data
blocks = blocks + block
# TODO are these already in scratch shape? look to be
# input_shape * input_elem_size
# output_shape * output_elem_size
# input_offset and output_offset specify the location these arrays are written from base of scratch

# return 16 byte VELA bin header + blocks + footer
header = bytes("vela_bin_stream", "utf-8") + b"\x00"
footer = bytes("vela_end_stream", "utf-8") + b"\x00"
return header + blocks + footer
return blocks


def dbg_fail(node, tosa_fb, path):
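The rewritten vela_compile above now emits every section through the same bin_blocks loop: a 16-byte null-padded block name, a 16-byte length field (one little-endian int32 followed by twelve zero bytes), then the block data padded to a 16-byte boundary, with empty "vela_bin_stream" / "vela_end_stream" blocks acting as header and footer. As a rough illustration of that layout (an assumed sketch only, with hypothetical helper names; the real consumer is the C++ reader in backends/arm/runtime/VelaBinStream.cpp), a Python reader mirroring the writer might look like this:

import struct

def parse_vela_bin_stream(stream: bytes) -> dict:
    # Walk the stream block by block, mirroring the writer in vela_compile:
    # 16-byte name, 16-byte length field, data padded to 16 bytes.
    blocks, pos = {}, 0
    while pos < len(stream):
        name = stream[pos : pos + 16].rstrip(b"\x00").decode("utf8")
        (length,) = struct.unpack("<i", stream[pos + 16 : pos + 20])
        blocks[name] = stream[pos + 32 : pos + 32 + length]
        padded = length + (15 - (length - 1) % 16)  # same padding rule as the writer
        pos += 32 + padded
    return blocks

def unpack_io_block(block: bytes) -> list:
    # Decode a block produced by vela_bin_pack_io: an int32 count, then seven
    # int32s per io (four padded shape dims, elem_size, offset, region).
    (count,) = struct.unpack("<i", block[:4])
    ios = []
    for i in range(count):
        fields = struct.unpack("<iiiiiii", block[4 + i * 28 : 4 + (i + 1) * 28])
        ios.append(
            {
                "shape": fields[0:4],
                "elem_size": fields[4],
                "offset": fields[5],
                "region": fields[6],
            }
        )
    return ios

Feeding the bytes returned by vela_compile into parse_vela_bin_stream would yield the cmd_data, weight_data, scratch_data, inputs and outputs blocks (plus the empty stream markers), and unpack_io_block would recover the per-tensor shape, element size, offset and region that the runtime needs.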