diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 6e686b7f24..0e9181ac55 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -b556d31586845fb1e296a975d2b85d9d325205c9 +aec9b2ab77389967ef39bb9c10662fd0fe3e185a diff --git a/.ci/docker/ci_commit_pins/torchao.txt b/.ci/docker/ci_commit_pins/torchao.txt new file mode 100644 index 0000000000..768110b82f --- /dev/null +++ b/.ci/docker/ci_commit_pins/torchao.txt @@ -0,0 +1 @@ +0916b5b29b092afcbf2b898caae49abe80662bac diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh index 4a796a72d5..d262176e49 100755 --- a/.ci/docker/common/install_linter.sh +++ b/.ci/docker/common/install_linter.sh @@ -13,3 +13,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" # NB: Install all linter dependencies, the caching of lintrunner init could be # done after Executorch becomes public pip_install -r requirements-lintrunner.txt + +# Install google-java-format +curl -L --retry 3 https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64 > /opt/google-java-format +chmod +x /opt/google-java-format diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh new file mode 100644 index 0000000000..deeaed34ac --- /dev/null +++ b/.ci/scripts/build-qnn-sdk.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -eux +set -o xtrace + +build_qnn_backend() { + echo "Start building qnn backend." + export ANDROID_NDK_ROOT=/opt/ndk + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 + export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)" + + bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release +} + +set_up_aot() { + cd $EXECUTORCH_ROOT + if [ ! -d "cmake-out" ]; then + mkdir cmake-out + fi + pushd cmake-out + cmake .. \ + -DCMAKE_INSTALL_PREFIX=$PWD \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF + cmake --build $PWD --target "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j$(nproc) + # install Python APIs to correct import path + # The filename might vary depending on your Python and host version. 
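# A sketch, not part of the patch: since the extension suffix depends on the host Python,
# the copy below could also be expressed with a glob instead of pinning cpython-310
# (assumes the build produced exactly the two expected adaptor modules):
for so in backends/qualcomm/PyQnn*Adaptor.cpython-*.so; do
  cp -f "$so" "$EXECUTORCH_ROOT/backends/qualcomm/python/"
done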
+ cp -f backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python + cp -f backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python + popd + + # Workaround for fbs files in exir/_serialize + cp schema/program.fbs exir/_serialize/program.fbs + cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs +} + +build_qnn_backend +set_up_aot diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index 644fc4c2bb..7d3370ee56 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -22,8 +22,9 @@ install_executorch_and_backend_lib() { -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py index 55289140c4..36a64e4241 100755 --- a/.ci/scripts/gather_test_models.py +++ b/.ci/scripts/gather_test_models.py @@ -27,6 +27,7 @@ # This one causes timeout on smaller runner, the root cause is unclear (T161064121) "dl3": "linux.12xlarge", "emformer_join": "linux.12xlarge", + "emformer_predict": "linux.12xlarge", } } @@ -35,9 +36,11 @@ # Just some examples on how custom timeout can be set "linux": { "mobilebert": 90, + "emformer_predict": 360, }, "macos": { "mobilebert": 90, + "emformer_predict": 360, }, } @@ -84,7 +87,11 @@ def model_should_run_on_event(model: str, event: str) -> bool: """ if event == "pull_request": return model in ["mv3", "vit"] - return True + elif event == "push": + # 'emformer_predict' is running super slow. Only run it periodically + return model not in ["emformer_predict"] + else: + return True def model_should_run_on_target_os(model: str, target_os: str) -> bool: diff --git a/.ci/scripts/setup-ios.sh b/.ci/scripts/setup-ios.sh new file mode 100755 index 0000000000..519cd2581e --- /dev/null +++ b/.ci/scripts/setup-ios.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -exu + +# This script follows the instructions from GitHub to install an Apple certificate +# https://docs.github.com/en/actions/use-cases-and-examples/deploying/installing-an-apple-certificate-on-macos-runners-for-xcode-development + +CERTIFICATE_PATH="${RUNNER_TEMP}"/build_certificate.p12 +PP_PATH="${RUNNER_TEMP}"/build_pp.mobileprovision +KEYCHAIN_PATH="${RUNNER_TEMP}"/app-signing.keychain-db + +# Import certificate and provisioning profile from secrets +echo -n "$BUILD_CERTIFICATE_BASE64" | base64 --decode -o $CERTIFICATE_PATH +echo -n "$BUILD_PROVISION_PROFILE_BASE64" | base64 --decode -o $PP_PATH + +# Create a temporary keychain +security create-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH +security set-keychain-settings -lut 21600 $KEYCHAIN_PATH +security unlock-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH + +# Import certificate to the keychain +security import $CERTIFICATE_PATH -P "" -A -t cert -f pkcs12 -k $KEYCHAIN_PATH +security set-key-partition-list -S apple-tool:,apple: -k "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH +security list-keychain -d user -s $KEYCHAIN_PATH + +# Apply provisioning profile +mkdir -p ~/Library/MobileDevice/Provisioning\ Profiles +cp $PP_PATH ~/Library/MobileDevice/Provisioning\ Profiles diff --git a/.ci/scripts/setup-linux.sh b/.ci/scripts/setup-linux.sh index 4bccabad5c..5df4668f65 100755 --- a/.ci/scripts/setup-linux.sh +++ b/.ci/scripts/setup-linux.sh @@ -20,6 +20,5 @@ fi # As Linux job is running inside a Docker container, all of its dependencies # have already been installed -install_flatc_from_source install_executorch build_executorch_runner "${BUILD_TOOL}" diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh index 2be7d9efe8..833ba0aafe 100755 --- a/.ci/scripts/setup-macos.sh +++ b/.ci/scripts/setup-macos.sh @@ -128,7 +128,5 @@ if [[ -z "${GITHUB_RUNNER:-}" ]]; then fi print_cmake_info -install_pytorch_and_domains -install_flatc_from_source install_executorch build_executorch_runner "${BUILD_TOOL}" diff --git a/.ci/scripts/setup-qnn-deps.sh b/.ci/scripts/setup-qnn-deps.sh new file mode 100644 index 0000000000..92ffd07bcc --- /dev/null +++ b/.ci/scripts/setup-qnn-deps.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -ex + +verify_pkg_installed() { + echo $(dpkg-query -W --showformat='${Status}\n' $1|grep "install ok installed") +} + +install_qnn() { + echo "Start installing qnn." + QNN_INSTALLATION_DIR=/tmp/qnn + mkdir -p "${QNN_INSTALLATION_DIR}" + + curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip" + echo "Finishing downloading qnn sdk." + unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp + echo "Finishing unzip qnn sdk." + + + # Print the content for manual verification + ls -lah "/tmp/qairt" + mv "/tmp/qairt"/* "${QNN_INSTALLATION_DIR}" + echo "Finishing installing qnn '${QNN_INSTALLATION_DIR}' ." + + ls -lah "${QNN_INSTALLATION_DIR}" +} + +setup_libc++() { + sudo apt-get update + pkgs_to_check=('libc++-dev') + j=0 + while [ $j -lt ${#pkgs_to_check[*]} ]; do + install_status=$(verify_pkg_installed ${pkgs_to_check[$j]}) + if [ "$install_status" == "" ]; then + sudo apt-get install -y ${pkgs_to_check[$j]} + if [[ $? 
-ne 0 ]]; then + echo "ERROR: Failed to install required packages for libc++" + exit 1 + fi + fi + j=$(( $j +1)); + done +} + +setup_libc++ +install_qnn diff --git a/.ci/scripts/test.sh b/.ci/scripts/test.sh deleted file mode 100755 index ad02fdc79d..0000000000 --- a/.ci/scripts/test.sh +++ /dev/null @@ -1,157 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -exu - -# shellcheck source=/dev/null -source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" - -MODEL_NAME=$1 -if [[ -z "${MODEL_NAME:-}" ]]; then - echo "Missing model name, exiting..." - exit 1 -fi - -BUILD_TOOL=$2 -if [[ -z "${BUILD_TOOL:-}" ]]; then - echo "Missing build tool (require buck2 or cmake), exiting..." - exit 1 -fi - -BACKEND=$3 -if [[ -z "${BACKEND:-}" ]]; then - echo "Missing backend (require portable or xnnpack), exiting..." - exit 1 -fi - -which "${PYTHON_EXECUTABLE}" -# Just set this variable here, it's cheap even if we use buck2 -CMAKE_OUTPUT_DIR=cmake-out - -build_cmake_executor_runner() { - echo "Building executor_runner" - (rm -rf ${CMAKE_OUTPUT_DIR} \ - && mkdir ${CMAKE_OUTPUT_DIR} \ - && cd ${CMAKE_OUTPUT_DIR} \ - && retry cmake -DCMAKE_BUILD_TYPE=Release \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) - - cmake --build ${CMAKE_OUTPUT_DIR} -j4 -} - -run_portable_executor_runner() { - # Run test model - if [[ "${BUILD_TOOL}" == "buck2" ]]; then - buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./${MODEL_NAME}.pte" - elif [[ "${BUILD_TOOL}" == "cmake" ]]; then - if [[ ! -f ${CMAKE_OUTPUT_DIR}/executor_runner ]]; then - build_cmake_executor_runner - fi - ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" - else - echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm" - exit 1 - fi -} - -test_model() { - if [[ "${MODEL_NAME}" == "llama2" ]]; then - # Install requirements for export_llama - bash examples/models/llama2/install_requirements.sh - # Test export_llama script: python3 -m examples.models.llama2.export_llama - "${PYTHON_EXECUTABLE}" -m examples.models.llama2.export_llama -c examples/models/llama2/params/demo_rand_params.pth -p examples/models/llama2/params/demo_config.json - run_portable_executor_runner - rm "./${MODEL_NAME}.pte" - fi - STRICT="--strict" - if [[ "${MODEL_NAME}" == "llava" ]]; then - # Install requirements for llava - bash examples/models/llava/install_requirements.sh - STRICT="--no-strict" - fi - # python3 -m examples.portable.scripts.export --model_name="llama2" should works too - "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export --model_name="${MODEL_NAME}" "${STRICT}" - run_portable_executor_runner -} - -build_cmake_xnn_executor_runner() { - echo "Building xnn_executor_runner" - SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" - - (rm -rf ${CMAKE_OUTPUT_DIR} \ - && mkdir ${CMAKE_OUTPUT_DIR} \ - && cd ${CMAKE_OUTPUT_DIR} \ - && retry cmake -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) 
- - cmake --build ${CMAKE_OUTPUT_DIR} -j4 -} - -test_model_with_xnnpack() { - WITH_QUANTIZATION=$1 - WITH_DELEGATION=$2 - - # Quantization-only - if [[ ${WITH_QUANTIZATION} == true ]] && [[ ${WITH_DELEGATION} == false ]]; then - bash examples/xnnpack/quantization/test_quantize.sh "${BUILD_TOOL}" "${MODEL_NAME}" - return 0 - fi - - # Delegation - if [[ ${WITH_QUANTIZATION} == true ]]; then - SUFFIX="q8" - "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize - else - SUFFIX="fp32" - "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate - fi - - OUTPUT_MODEL_PATH="${MODEL_NAME}_xnnpack_${SUFFIX}.pte" - - # Run test model - if [[ "${BUILD_TOOL}" == "buck2" ]]; then - buck2 run //examples/xnnpack:xnn_executor_runner -- --model_path "${OUTPUT_MODEL_PATH}" - elif [[ "${BUILD_TOOL}" == "cmake" ]]; then - if [[ ! -f ${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner ]]; then - build_cmake_xnn_executor_runner - fi - ./${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner --model_path "${OUTPUT_MODEL_PATH}" - else - echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm" - exit 1 - fi -} - -if [[ "${BACKEND}" == "portable" ]]; then - echo "Testing ${MODEL_NAME} with portable kernels..." - test_model -else - set +e - if [[ "${BACKEND}" == *"quantization"* ]]; then - echo "::group::Testing ${MODEL_NAME} with XNNPACK quantization only..." - test_model_with_xnnpack true false || Q_ERROR="error" - echo "::endgroup::" - fi - if [[ "${BACKEND}" == *"delegation"* ]]; then - echo "::group::Testing ${MODEL_NAME} with XNNPACK delegation only..." - test_model_with_xnnpack false true || D_ERROR="error" - echo "::endgroup::" - fi - if [[ "${BACKEND}" == *"quantization"* ]] && [[ "${BACKEND}" == *"delegation"* ]]; then - echo "::group::Testing ${MODEL_NAME} with XNNPACK quantization and delegation..." - test_model_with_xnnpack true true || Q_D_ERROR="error" - echo "::endgroup::" - fi - set -e - if [[ -n "${Q_ERROR:-}" ]] || [[ -n "${D_ERROR:-}" ]] || [[ -n "${Q_D_ERROR:-}" ]]; then - echo "Portable q8 ${Q_ERROR:-ok}," "Delegation fp32 ${D_ERROR:-ok}," "Delegation q8 ${Q_D_ERROR:-ok}" - exit 1 - fi -fi diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 30b77ee38f..2e51866d90 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -9,10 +9,11 @@ set -exu # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" -MODEL_NAME=$1 # stories110M.pt +MODEL_NAME=$1 # stories110M BUILD_TOOL=$2 # buck2 or cmake -DTYPE=$3 # fp16 or fp32 +DTYPE=$3 # fp16, bf16, or fp32 MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe +UPLOAD_DIR=${5:-} if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args echo "Expecting atleast 4 positional arguments" echo "Usage: [...]" @@ -28,7 +29,7 @@ if [[ -z "${BUILD_TOOL:-}" ]]; then fi if [[ -z "${DTYPE:-}" ]]; then - echo "Missing dtype, choose fp16 or fp32, exiting..." + echo "Missing dtype, choose fp16, bf16, or fp32, exiting..." exit 1 fi @@ -71,6 +72,25 @@ fi echo "COREML option ${COREML}" +if [[ "${MODE}" =~ .*qnn.* ]]; then + QNN=ON + export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 + export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" + export PYTHONPATH=".." 
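# Sketch of a full invocation with the new optional upload argument, mirroring how the
# android-perf workflow later in this patch calls it (the upload directory is illustrative):
#   bash .ci/scripts/test_llama.sh stories110M cmake fp32 xnnpack+custom+qe artifacts-to-be-uploaded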
+ cp schema/program.fbs exir/_serialize/program.fbs + cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs + cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python + cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python + +else + QNN=OFF + QNN_SDK_ROOT="" +fi + +echo "QNN option ${QNN}" +echo "QNN_SDK_ROOT: ${QNN_SDK_ROOT}" + if [[ -z "${BUCK:-}" ]]; then BUCK=buck2 fi @@ -87,14 +107,17 @@ cmake_install_executorch_libraries() { retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DEXECUTORCH_BUILD_MPS="$MPS" \ -DEXECUTORCH_BUILD_COREML="$COREML" \ + -DEXECUTORCH_BUILD_QNN="$QNN" \ + -DQNN_SDK_ROOT="$QNN_SDK_ROOT" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . cmake --build cmake-out -j9 --target install --config Debug @@ -118,7 +141,7 @@ cmake_build_llama_runner() { cleanup_files() { echo "Deleting downloaded and generated files" - rm "${MODEL_NAME}" + rm "${CHECKPOINT_FILE_NAME}" rm tokenizer.model rm tokenizer.bin rm "${EXPORTED_MODEL_NAME}" @@ -126,10 +149,21 @@ cleanup_files() { rm params.json } +prepare_artifacts_upload() { + if [ -n "$UPLOAD_DIR" ]; then + echo "Preparing for uploading generated artifacs" + zip -j model.zip "${EXPORTED_MODEL_NAME}" tokenizer.bin + mkdir -p "${UPLOAD_DIR}" + mv model.zip "${UPLOAD_DIR}" + fi +} + # Download and create artifacts. PARAMS="params.json" +CHECKPOINT_FILE_NAME="" touch "${PARAMS}" -if [[ "${MODEL_NAME}" == "stories110M.pt" ]]; then +if [[ "${MODEL_NAME}" == "stories110M" ]]; then + CHECKPOINT_FILE_NAME="stories110M.pt" download_stories_model_artifacts else echo "Unsupported model name ${MODEL_NAME}" @@ -140,6 +174,8 @@ fi EXPORTED_MODEL_NAME="llama2" if [[ "${DTYPE}" == "fp16" ]]; then EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_h" +elif [[ "${DTYPE}" == "bf16" ]]; then + EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_bf" elif [[ "${DTYPE}" == "fp32" ]]; then : else @@ -150,7 +186,7 @@ fi # Export model. EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte" echo "Exporting ${EXPORTED_MODEL_NAME}" -EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv" +EXPORT_ARGS="-c ${CHECKPOINT_FILE_NAME} -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv" if [[ "${XNNPACK}" == "ON" ]]; then EXPORT_ARGS="${EXPORT_ARGS} -X -qmode 8da4w -G 128" fi @@ -166,6 +202,9 @@ fi if [[ "${COREML}" == "ON" ]]; then EXPORT_ARGS="${EXPORT_ARGS} -kv -v --coreml --disable_dynamic_shape" fi +if [[ "${QNN}" == "ON" ]]; then + EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape" +fi # Add dynamically linked library location $PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS} @@ -205,6 +244,7 @@ if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then echo "Actual result: ${RESULT}" echo "Success" + prepare_artifacts_upload cleanup_files else echo "Expected result prefix: ${EXPECTED_PREFIX}" diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh new file mode 100644 index 0000000000..8ac87b2302 --- /dev/null +++ b/.ci/scripts/test_llava.sh @@ -0,0 +1,190 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -exu +# shellcheck source=/dev/null + +BUILD_TYPE=${1:-Debug} +TARGET_OS=${2:-Native} +BUILD_DIR=${3:-cmake-out} + +echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi + +TARGET_OS_lower="$(echo "${TARGET_OS}" | awk '{print tolower($0)}')" +if [[ "${TARGET_OS_lower}" == "android" ]]; then + if [[ -z "${ANDROID_NDK}" ]]; then + echo "Set ANDROID_NDK environment variable to build for Android." + exit 1 + fi +fi + +# Number of processes for a parallel build +NPROC=8 +if hash nproc &> /dev/null; then NPROC=$(nproc); fi + +EXECUTORCH_COMMON_CMAKE_ARGS=" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON" + +cmake_install_executorch_libraries() { + cmake \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} +} + +cmake_install_executorch_libraries_for_android() { + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} +} + + +LLAVA_COMMON_CMAKE_ARGS=" \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON" + +cmake_build_llava_runner() { + dir=examples/models/llava + python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') + + cmake \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -B${BUILD_DIR}/${dir} \ + ${dir} + + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} +} + + +cmake_build_llava_runner_for_android() { + dir=examples/models/llava + python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') + + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \ + -B${BUILD_DIR}/${dir} \ + ${dir} + + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} +} + +# only export the one without custom op for now since it's +export_llava() { + echo "Starting to export Llava. 
This will take about 6 mins" + $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts +} + +# Download a new image with different size, to test if the model can handle different image sizes +prepare_image_tensor() { + echo "Downloading image" + curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg + $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt +} + +run_and_verify() { + NOW=$(date +"%H:%M:%S") + echo "Starting to run llava runner at ${NOW}" + if [[ ! -f "llava.pte" ]]; then + echo "Export failed. Abort" + exit 1 + fi + if [[ ! -f "image.pt" ]]; then + echo "image.pt is missing." + exit 1 + fi + if [[ ! -f "tokenizer.bin" ]]; then + echo "tokenizer.bin is missing." + exit 1 + fi + + + + RUNTIME_ARGS="--model_path=llava.pte \ + --tokenizer_path=tokenizer.bin \ + --image_path=image.pt \ + --prompt=ASSISTANT: \ + --temperature=0 \ + --seq_len=650" + + if [[ "${TARGET_OS_lower}" == "android" ]]; then + echo "Transfer relevant files to the phone via ADB and run llava_main with following args," + echo "$ llava_main ${RUNTIME_ARGS} " + exit 0; + fi + + ${BUILD_DIR}/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + + # verify result.txt + RESULT=$(cat result.txt) + # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. + if [[ "$(uname)" == "Darwin" ]]; then + EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various" + else + # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. + EXPECTED_PREFIX="ASSISTANT:" + fi + if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then + echo "Expected result prefix: ${EXPECTED_PREFIX}" + echo "Actual result: ${RESULT}" + echo "Success" + exit 0 + else + echo "Expected result prefix: ${EXPECTED_PREFIX}" + echo "Actual result: ${RESULT}" + echo "Failure; results not the same" + exit 1 + fi +} + +# Step1. Build stuff +if [[ "${TARGET_OS_lower}" == "android" ]]; then + cmake_install_executorch_libraries_for_android + cmake_build_llava_runner_for_android +elif [[ "${TARGET_OS_lower}" == "native" ]]; then + cmake_install_executorch_libraries + cmake_build_llava_runner +else + echo "Invalid TARGET_OS ($2): ${TARGET_OS}" +fi + +# Step2. Generate the PTE +export_llava + +# Step3. Run +prepare_image_tensor +run_and_verify diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh new file mode 100755 index 0000000000..f558a508c9 --- /dev/null +++ b/.ci/scripts/test_model.sh @@ -0,0 +1,245 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -exu + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +MODEL_NAME=$1 +if [[ -z "${MODEL_NAME:-}" ]]; then + echo "Missing model name, exiting..." + exit 1 +fi + +BUILD_TOOL=$2 +if [[ -z "${BUILD_TOOL:-}" ]]; then + echo "Missing build tool (require buck2 or cmake), exiting..." + exit 1 +fi + +BACKEND=$3 +if [[ -z "${BACKEND:-}" ]]; then + echo "Missing backend (require portable or xnnpack), exiting..." 
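# (Refers to the Llava script above.) Sketch of typical invocations using its defaults;
# the NDK path and the Android build directory here are illustrative assumptions:
#   bash .ci/scripts/test_llava.sh Debug Native cmake-out
#   ANDROID_NDK=/opt/ndk bash .ci/scripts/test_llava.sh Release Android cmake-out-android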
+ exit 1 +fi + +UPLOAD_DIR=${4:-} + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi +which "${PYTHON_EXECUTABLE}" + +# Just set this variable here, it's cheap even if we use buck2 +CMAKE_OUTPUT_DIR=cmake-out +EXPORTED_MODEL=${MODEL_NAME} + +prepare_artifacts_upload() { + if [ -n "$UPLOAD_DIR" ]; then + echo "Preparing for uploading generated artifacs" + zip -j model.zip "${EXPORTED_MODEL}" + mkdir -p "${UPLOAD_DIR}" + mv model.zip "${UPLOAD_DIR}" + fi +} + +build_cmake_executor_runner() { + echo "Building executor_runner" + rm -rf ${CMAKE_OUTPUT_DIR} + cmake -DCMAKE_BUILD_TYPE=Debug \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -B${CMAKE_OUTPUT_DIR} . + + cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug +} + +run_portable_executor_runner() { + # Run test model + if [[ "${BUILD_TOOL}" == "buck2" ]]; then + buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./${MODEL_NAME}.pte" + elif [[ "${BUILD_TOOL}" == "cmake" ]]; then + build_cmake_executor_runner + ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" + else + echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm" + exit 1 + fi +} + +test_model() { + if [[ "${MODEL_NAME}" == "llama2" ]]; then + # Install requirements for export_llama + bash examples/models/llama2/install_requirements.sh + # Test export_llama script: python3 -m examples.models.llama2.export_llama + "${PYTHON_EXECUTABLE}" -m examples.models.llama2.export_llama -c examples/models/llama2/params/demo_rand_params.pth -p examples/models/llama2/params/demo_config.json + run_portable_executor_runner + rm "./${MODEL_NAME}.pte" + fi + STRICT="--strict" + if [[ "${MODEL_NAME}" == "llava" ]]; then + # Install requirements for llava + bash examples/models/llava/install_requirements.sh + STRICT="--no-strict" + fi + # python3 -m examples.portable.scripts.export --model_name="llama2" should works too + "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export --model_name="${MODEL_NAME}" "${STRICT}" + run_portable_executor_runner +} + +build_cmake_xnn_executor_runner() { + echo "Building xnn_executor_runner" + SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" + + (rm -rf ${CMAKE_OUTPUT_DIR} \ + && mkdir ${CMAKE_OUTPUT_DIR} \ + && cd ${CMAKE_OUTPUT_DIR} \ + && retry cmake -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) 
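# Sketch of how this script is driven from the new android-perf workflow below, with the
# optional upload directory as the fourth argument (directory name is illustrative):
#   bash .ci/scripts/test_model.sh mv2 cmake qnn artifacts-to-be-uploaded/mv2_qnn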
+ + cmake --build ${CMAKE_OUTPUT_DIR} -j4 +} + +test_model_with_xnnpack() { + WITH_QUANTIZATION=$1 + WITH_DELEGATION=$2 + + # Quantization-only + if [[ ${WITH_QUANTIZATION} == true ]] && [[ ${WITH_DELEGATION} == false ]]; then + bash examples/xnnpack/quantization/test_quantize.sh "${BUILD_TOOL}" "${MODEL_NAME}" + return 0 + fi + + # Delegation + if [[ ${WITH_QUANTIZATION} == true ]]; then + SUFFIX="q8" + "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize + else + SUFFIX="fp32" + "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate + fi + + OUTPUT_MODEL_PATH="${MODEL_NAME}_xnnpack_${SUFFIX}.pte" + EXPORTED_MODEL=${OUTPUT_MODEL_PATH} + + # Run test model + if [[ "${BUILD_TOOL}" == "buck2" ]]; then + buck2 run //examples/xnnpack:xnn_executor_runner -- --model_path "${OUTPUT_MODEL_PATH}" + elif [[ "${BUILD_TOOL}" == "cmake" ]]; then + if [[ ! -f ${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner ]]; then + build_cmake_xnn_executor_runner + fi + ./${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner --model_path "${OUTPUT_MODEL_PATH}" + else + echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm" + exit 1 + fi +} + +test_model_with_qnn() { + source "$(dirname "${BASH_SOURCE[0]}")/build-qnn-sdk.sh" + echo "ANDROID_NDK_ROOT: $ANDROID_NDK_ROOT" + echo "QNN_SDK_ROOT: $QNN_SDK_ROOT" + echo "EXECUTORCH_ROOT: $EXECUTORCH_ROOT" + + export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/ + export PYTHONPATH=$EXECUTORCH_ROOT/.. + + if [[ "${MODEL_NAME}" == "dl3" ]]; then + EXPORT_SCRIPT=deeplab_v3 + EXPORTED_MODEL_NAME=dlv3_qnn.pte + elif [[ "${MODEL_NAME}" == "mv3" ]]; then + EXPORT_SCRIPT=mobilenet_v3 + EXPORTED_MODEL_NAME=mv3_qnn.pte + elif [[ "${MODEL_NAME}" == "mv2" ]]; then + EXPORT_SCRIPT=mobilenet_v2 + EXPORTED_MODEL_NAME=mv2_qnn.pte + elif [[ "${MODEL_NAME}" == "ic4" ]]; then + EXPORT_SCRIPT=inception_v4 + EXPORTED_MODEL_NAME=ic4_qnn.pte + elif [[ "${MODEL_NAME}" == "ic3" ]]; then + EXPORT_SCRIPT=inception_v3 + EXPORTED_MODEL_NAME=ic3_qnn.pte + elif [[ "${MODEL_NAME}" == "vit" ]]; then + EXPORT_SCRIPT=torchvision_vit + EXPORTED_MODEL_NAME=vit_qnn.pte + fi + + # Use SM8450 for S22, SM8550 for S23, and SM8560 for S24 + # TODO(guangyang): Make QNN chipset matches the target device + QNN_CHIPSET=SM8450 + + "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only + EXPORTED_MODEL=./${EXPORT_SCRIPT}/${EXPORTED_MODEL_NAME} +} + +test_model_with_coreml() { + if [[ "${BUILD_TOOL}" == "buck2" ]]; then + echo "coreml doesn't support buck2." + exit 1 + fi + + "${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" + EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit) +} + +if [[ "${BACKEND}" == "portable" ]]; then + echo "Testing ${MODEL_NAME} with portable kernels..." + test_model +elif [[ "${BACKEND}" == "qnn" ]]; then + echo "Testing ${MODEL_NAME} with qnn..." + test_model_with_qnn + if [[ $? -eq 0 ]]; then + prepare_artifacts_upload + fi +elif [[ "${BACKEND}" == "coreml" ]]; then + echo "Testing ${MODEL_NAME} with coreml..." + test_model_with_coreml + if [[ $? -eq 0 ]]; then + prepare_artifacts_upload + fi +elif [[ "${BACKEND}" == "xnnpack" ]]; then + echo "Testing ${MODEL_NAME} with xnnpack..." 
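# For reference, a sketch of the ahead-of-time command test_model_with_qnn above runs for
# the "dl3" case; other supported models only swap the export module and output name:
#   python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out -m SM8450 --compile_only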
+ WITH_QUANTIZATION=true + WITH_DELEGATION=true + if [[ "$MODEL_NAME" == "mobilebert" ]]; then + # TODO(T197452682) + WITH_QUANTIZATION=false + fi + test_model_with_xnnpack "${WITH_QUANTIZATION}" "${WITH_DELEGATION}" + if [[ $? -eq 0 ]]; then + prepare_artifacts_upload + fi +else + set +e + if [[ "${BACKEND}" == *"quantization"* ]]; then + echo "::group::Testing ${MODEL_NAME} with XNNPACK quantization only..." + test_model_with_xnnpack true false || Q_ERROR="error" + echo "::endgroup::" + fi + if [[ "${BACKEND}" == *"delegation"* ]]; then + echo "::group::Testing ${MODEL_NAME} with XNNPACK delegation only..." + test_model_with_xnnpack false true || D_ERROR="error" + echo "::endgroup::" + fi + if [[ "${BACKEND}" == *"quantization"* ]] && [[ "${BACKEND}" == *"delegation"* ]]; then + echo "::group::Testing ${MODEL_NAME} with XNNPACK quantization and delegation..." + test_model_with_xnnpack true true || Q_D_ERROR="error" + echo "::endgroup::" + fi + set -e + if [[ -n "${Q_ERROR:-}" ]] || [[ -n "${D_ERROR:-}" ]] || [[ -n "${Q_D_ERROR:-}" ]]; then + echo "Portable q8 ${Q_ERROR:-ok}," "Delegation fp32 ${D_ERROR:-ok}," "Delegation q8 ${Q_D_ERROR:-ok}" + exit 1 + else + prepare_artifacts_upload + fi +fi diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index e293a49f37..64c512cdcc 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -19,11 +19,9 @@ retry () { install_executorch() { which pip # Install executorch, this assumes that Executorch is checked out in the - # current directory. The --extra-index-url options tell pip to look on the - # pytorch servers for nightly and pre-release versions of torch packages. - pip install . --no-build-isolation -v \ - --extra-index-url https://download.pytorch.org/whl/test/cpu \ - --extra-index-url https://download.pytorch.org/whl/nightly/cpu + # current directory. 
+ # TODO(T199538337): clean up install scripts to use install_requirements.sh + ./install_requirements.sh --pybind xnnpack # Just print out the list of packages for debugging pip list } @@ -35,42 +33,6 @@ install_pip_dependencies() { popd || return } -install_domains() { - echo "Install torchvision and torchaudio" - pip install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${TORCHAUDIO_VERSION}" - pip install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${TORCHVISION_VERSION}" -} - -install_pytorch_and_domains() { - pushd .ci/docker || return - TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt) - popd || return - - git clone https://github.com/pytorch/pytorch.git - - # Fetch the target commit - pushd pytorch || return - git checkout "${TORCH_VERSION}" - git submodule update --init --recursive - - export _GLIBCXX_USE_CXX11_ABI=0 - # Then build and install PyTorch - python setup.py bdist_wheel - pip install "$(echo dist/*.whl)" - - # Grab the pinned audio and vision commits from PyTorch - TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt) - export TORCHAUDIO_VERSION - TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt) - export TORCHVISION_VERSION - - install_domains - - popd || return - # Print sccache stats for debugging - sccache --show-stats || true -} - install_flatc_from_source() { # NB: This function could be used to install flatbuffer from source pushd third-party/flatbuffers || return diff --git a/examples/models/llama2/custom_ops/__init__.py b/.github/ghstack_direct similarity index 100% rename from examples/models/llama2/custom_ops/__init__.py rename to .github/ghstack_direct diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index f684d83fa5..2b66829ed0 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,5 +1,7 @@ # The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml ciflow_push_tags: +- ciflow/android +- ciflow/apple - ciflow/nightly - ciflow/trunk - ciflow/binaries diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml new file mode 100644 index 0000000000..78c1a2dd09 --- /dev/null +++ b/.github/workflows/android-perf.yml @@ -0,0 +1,240 @@ +name: android-perf + +on: + schedule: + - cron: 0 0 * * * + # Note: GitHub has an upper limit of 10 inputs + workflow_dispatch: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: samsung_galaxy_s22 + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + threadpool: + description: Run with threadpool? + required: false + type: boolean + default: false + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + workflow_call: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: samsung_galaxy_s22 + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + threadpool: + description: Run with threadpool? 
+ required: false + type: boolean + default: false + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + set-parameters: + runs-on: linux.2xlarge + outputs: + models: ${{ steps.set-parameters.outputs.models }} + devices: ${{ steps.set-parameters.outputs.devices }} + delegates: ${{ steps.set-parameters.outputs.delegates }} + steps: + - name: Set parameters + id: set-parameters + shell: bash + env: + # Separate default values from the workflow dispatch. To ensure defaults are accessible + # during scheduled runs and to provide flexibility for different defaults between + # on-demand and periodic benchmarking. + CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3,vit" + CRON_DEFAULT_DEVICES: "samsung_galaxy_s22" + CRON_DEFAULT_DELEGATES: "xnnpack,qnn" + run: | + set -ex + MODELS="${{ inputs.models }}" + if [ -z "$MODELS" ]; then + MODELS="$CRON_DEFAULT_MODELS" + fi + DEVICES="${{ inputs.devices }}" + if [ -z "$DEVICES" ]; then + DEVICES="$CRON_DEFAULT_DEVICES" + fi + DELEGATES="${{ inputs.delegates }}" + if [ -z "$DELEGATES" ]; then + DELEGATES="$CRON_DEFAULT_DELEGATES" + fi + + # Mapping devices to their corresponding device-pool-arn + declare -A DEVICE_POOL_ARNS + DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa" + DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db" + + # Resolve device names with their corresponding ARNs + if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then + DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")') + fi + declare -a MAPPED_ARNS=() + for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do + if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then + echo "Error: No ARN found for device '$DEVICE'. Abort." >&2 + exit 1 + fi + MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}") + done + + echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .) 
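          # Worked example of the jq transform used above (a sketch): for MODELS="stories110M,dl3",
          #   echo "stories110M,dl3" | jq -Rc 'split(",")'   # -> ["stories110M","dl3"]
          # Delegates take the same shape, and devices become a JSON array of device-pool ARNs.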
+ echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT + echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + + export-models: + name: export-models + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + needs: set-parameters + strategy: + matrix: + model: ${{ fromJson(needs.set-parameters.outputs.models) }} + delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + fail-fast: false + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12-android + submodules: 'true' + timeout: 60 + upload-artifact: android-models + upload-artifact-to-s3: true + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + echo "::group::Setting up dev environment" + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + if [[ ${{ matrix.delegate }} == "qnn" ]]; then + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + fi + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} + echo "::endgroup::" + + echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" + BUILD_MODE="cmake" + DTYPE="fp32" + + if [[ ${{ matrix.model }} =~ ^stories* ]]; then + # Install requirements for export_llama + PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh + # Test llama2 + if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then + DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.delegate }} == "qnn" ]]; then + DELEGATE_CONFIG="qnn" + else + echo "Unsupported delegate ${{ matrix.delegate }}" + exit 1 + fi + PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" + else + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" + fi + echo "::endgroup::" + + build-llm-demo: + name: build-llm-demo + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + needs: set-parameters + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12-android + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + upload-artifact: android-apps + upload-artifact-to-s3: true + script: | + set -eux + + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + + export ANDROID_ABIS="arm64-v8a" + PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} + + # Let's see how expensive this job is, we might want to tone it down by running it periodically + benchmark-on-device: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main + needs: + - set-parameters + - build-llm-demo + - export-models + strategy: + matrix: + model: ${{ fromJson(needs.set-parameters.outputs.models) }} + delegate: ${{ 
fromJson(needs.set-parameters.outputs.delegates) }} + device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false + with: + # Due to scheduling a job may be pushed beyond the default 60m threshold + timeout: 120 + device-type: android + runner: linux.2xlarge + test-infra-ref: '' + # This is the ARN of ExecuTorch project on AWS + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 + device-pool-arn: ${{ matrix.device }} + # Uploaded to S3 from the previous job, the name of the app comes from the project itself. + # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer. + # It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only + # one app+flavor that could load and run the model. + android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk + android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk + # NB: Need to set the default spec here so that it works for periodic too + test-spec: ${{ inputs.test_spec || 'https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml' }} + # Uploaded to S3 from the previous job + extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 7b3d8ab9a8..54e9dbb761 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -5,6 +5,8 @@ on: branches: - main - release/* + tags: + - ciflow/android/* pull_request: paths: - .ci/docker/** @@ -24,9 +26,6 @@ jobs: build-llm-demo: name: build-llm-demo uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - matrix: - tokenizer: [bpe, tiktoken] with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android @@ -34,6 +33,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps + upload-artifact-to-s3: true script: | set -eux @@ -44,44 +44,13 @@ jobs: export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded # Build LLM Demo for Android - bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME} - - # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat - upload-artifacts: - needs: build-llm-demo - runs-on: linux.2xlarge - steps: - - name: Download the artifacts from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: android-apps - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the artifacts - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ - - - name: Upload the artifacts to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - # NOTE: Consume stale artifacts won't make sense for benchmarking as the goal is always to - # benchmark models as fresh as possible. 
I'm okay to keep the 14 retention-days for now - # for TorchChat until we have a periodic job can publish it more often. Ideally I want to - # reduce it to <= 2 day, meaning the benchmark job will run daily. - retention-days: 14 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ + bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Running Android emulator directly on the runner and not using Docker run-emulator: needs: build-llm-demo - runs-on: amz2023.linux.4xlarge + # NB: Use metal install for KVM support to run the emulator faster + runs-on: linux.24xl.spr-metal env: ANDROID_NDK_VERSION: r26c API_LEVEL: 34 @@ -129,9 +98,6 @@ jobs: uses: reactivecircus/android-emulator-runner@v2 with: api-level: ${{ env.API_LEVEL }} - # NB: x86_64 emulator is slow because the lack of KVM support on AWS, it - # seems that we can use metal instance for that but it hasn't been tried - # out yet. Also arm64-v8a arch requires an ARM runner arch: x86_64 script: ./build/run_android_emulator.sh # NB: This is to boot the emulator faster following the instructions on @@ -144,34 +110,3 @@ jobs: emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none # This is to make sure that the job doesn't fail flakily emulator-boot-timeout: 900 - - # Let's see how expensive this job is, we might want to tone it down by running it periodically - test-llama-app: - needs: upload-artifacts - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main - strategy: - matrix: - # https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/README.md#alternative-2-build-from-local-machine - # mentions that tiktoken is only for Llama3. So, we can export it later in another archive - # like https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip when this is - # updated to run Llama3 - tokenizer: [bpe] - with: - device-type: android - runner: linux.2xlarge - test-infra-ref: '' - # This is the ARN of ExecuTorch project on AWS - project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 - # This is the custom Android device pool that only includes Samsung Galaxy S2x - device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa - # Uploaded to S3 from the previous job, the name of the app comes from the project itself - android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug-androidTest.apk - # The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml - test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/abd86868-fa63-467e-a5c7-218194665a77 - # Among the input, this is the biggest file, so it is cached on AWS to make the test faster. Note that the file is deleted by AWS after 30 - # days and the job will automatically re-upload the file when that happens. 
- extra-data: https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml new file mode 100644 index 0000000000..b4b1d3aef5 --- /dev/null +++ b/.github/workflows/apple-perf.yml @@ -0,0 +1,308 @@ +name: apple-perf + +on: + schedule: + - cron: 0 1 * * * + # Note: GitHub has an upper limit of 10 inputs + workflow_dispatch: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: apple_iphone_15 + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + workflow_call: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: apple_iphone_15 + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + set-parameters: + runs-on: linux.2xlarge + outputs: + models: ${{ steps.set-parameters.outputs.models }} + devices: ${{ steps.set-parameters.outputs.devices }} + delegates: ${{ steps.set-parameters.outputs.delegates }} + steps: + - name: Set parameters + id: set-parameters + shell: bash + env: + # Separate default values from the workflow dispatch. To ensure defaults are accessible + # during scheduled runs and to provide flexibility for different defaults between + # on-demand and periodic benchmarking. + CRON_DEFAULT_MODELS: "stories110M,mv3,ic4,resnet50,edsr,mobilebert,w2l" + CRON_DEFAULT_DEVICES: "apple_iphone_15" + CRON_DEFAULT_DELEGATES: "xnnpack,coreml" + run: | + set -ex + MODELS="${{ inputs.models }}" + if [ -z "$MODELS" ]; then + MODELS="$CRON_DEFAULT_MODELS" + fi + DEVICES="${{ inputs.devices }}" + if [ -z "$DEVICES" ]; then + DEVICES="$CRON_DEFAULT_DEVICES" + fi + DELEGATES="${{ inputs.delegates }}" + if [ -z "$DELEGATES" ]; then + DELEGATES="$CRON_DEFAULT_DELEGATES" + fi + + # Mapping devices to their corresponding device-pool-arn + declare -A DEVICE_POOL_ARNS + DEVICE_POOL_ARNS[apple_iphone_15]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d" + + # Resolve device names with their corresponding ARNs + if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then + DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")') + fi + declare -a MAPPED_ARNS=() + for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do + if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then + echo "Error: No ARN found for device '$DEVICE'. Abort." 
>&2 + exit 1 + fi + MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}") + done + + echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .) + echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT + echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + + export-models: + name: export-models + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + needs: set-parameters + strategy: + matrix: + model: ${{ fromJson(needs.set-parameters.outputs.models) }} + delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + fail-fast: false + with: + # NB: Need to use our AWS MacOS runner to upload large models to S3 + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + timeout: 60 + upload-artifact: ios-models + upload-artifact-to-s3: true + script: | + set -eux + + echo "::group::Setting up CI environment" + .ci/scripts/setup-conda.sh + + BUILD_TOOL=cmake + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + if [[ ${{ matrix.delegate }} == "coreml" ]]; then + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/coreml/scripts/install_requirements.sh + fi + + if [[ ${{ matrix.delegate }} == "mps" ]]; then + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/mps/install_requirements.sh + fi + + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} + echo "::endgroup::" + + echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" + BUILD_MODE="cmake" + DTYPE="fp32" + + if [[ ${{ matrix.model }} =~ ^stories* ]]; then + # Install requirements for export_llama + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash examples/models/llama2/install_requirements.sh + + # Test llama2 + if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then + DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.delegate }} == "coreml" ]]; then + DELEGATE_CONFIG="coreml" + fi + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" + else + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" + fi + echo "::endgroup::" + + build-benchmark-app: + name: build-benchmark-app + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + needs: + - set-parameters + secrets: inherit + with: + runner: macos-latest-xlarge + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + upload-artifact: ios-apps + secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD + timeout: 90 + script: | + set -eux + + echo "::group::Setting up CI environment" + .ci/scripts/setup-conda.sh + + BUILD_TOOL=cmake + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + + # Setup Apple certificate for iOS development + 
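      # setup-ios.sh (added earlier in this patch) reads these three variables plus RUNNER_TEMP
      # and imports the certificate into a temporary keychain; sketch of the expected environment,
      # with placeholder values only (all three come from GitHub secrets):
      #   BUILD_CERTIFICATE_BASE64=<base64-encoded .p12>
      #   BUILD_PROVISION_PROFILE_BASE64=<base64-encoded .mobileprovision>
      #   KEYCHAIN_PASSWORD=<throwaway keychain password>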
BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64}" \ + BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ + KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ + .ci/scripts/setup-ios.sh + + # Install CoreML Backend Requirements + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/coreml/scripts/install_requirements.sh + + # Install MPS Backend Requirements + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/mps/install_requirements.sh + echo "::endgroup::" + + echo "::group::Build ExecuTorch iOS frameworks" + FRAMEWORKS=( + "executorch" + "backend_coreml" + "backend_mps" + "backend_xnnpack" + "kernels_custom" + "kernels_optimized" + "kernels_portable" + "kernels_quantized" + ) + + # Build Release iOS Frameworks + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack + + mkdir -p extension/apple/Benchmark/Frameworks + for FRAMEWORK in "${FRAMEWORKS[@]}"; do ( + cp -r "cmake-out/${FRAMEWORK}.xcframework" extension/apple/Benchmark/Frameworks/ + ) done + echo "::endgroup::" + + # NB: Although exported models can be copied to this directory and bundled together with the + # app, we don't use this in CI and rely on AWS extra data parameter to make the model and the + # tokenizer available to the benchmark. This decouples the app and the model. We just need to + # create the directory here to pass the build + mkdir -p extension/apple/Benchmark/Models + ${CONDA_RUN} --no-capture-output \ + build/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME} + + upload-benchmark-app: + needs: build-benchmark-app + runs-on: linux.2xlarge + steps: + - name: Download the apps from GitHub + uses: actions/download-artifact@v3 + with: + # The name here needs to match the name of the upload-artifact parameter + name: ios-apps + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the apps + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the apps to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts + retention-days: 14 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ + + benchmark-on-device: + needs: + - set-parameters + - upload-benchmark-app + - export-models + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main + strategy: + matrix: + model: ${{ fromJson(needs.set-parameters.outputs.models) }} + delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false + with: + # Due to scheduling a job may be pushed beyond the default 60m threshold + timeout: 120 + device-type: ios + # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS + runner: linux.2xlarge + test-infra-ref: '' + # This is the ARN of ExecuTorch project on AWS + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 + device-pool-arn: ${{ matrix.device }} + # Uploaded to S3 from the previous job + ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa + ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id 
}}/artifacts/Benchmark.xctestrun.zip + test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }} + extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 60022b81f9..229d8e5abf 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -8,13 +8,14 @@ on: pull_request: paths: - .ci/docker/** + - .ci/scripts/setup-ios.sh - .github/workflows/apple.yml - install_requirements.sh - backends/apple/** - build/build_apple_frameworks.sh - build/create_frameworks.sh - build/test_ios_ci.sh - - examples/demo-apps/** + - examples/demo-apps/apple_ios/** - extension/apple/** - extension/module/** workflow_dispatch: @@ -24,27 +25,89 @@ concurrency: cancel-in-progress: true jobs: - test-demo-ios: - name: test-demo-ios + build-demo-ios: + name: build-demo-ios uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + secrets: inherit with: runner: macos-latest-xlarge python-version: '3.11' submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 + secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_DEMO_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD + upload-artifact: ios-apps script: | BUILD_TOOL=cmake .ci/scripts/setup-conda.sh + # Setup Apple certificate for iOS development + BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_DEMO_BUILD_PROVISION_PROFILE_BASE64}" \ + BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ + KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ + .ci/scripts/setup-ios.sh + # Setup MacOS dependencies as there is no Docker support on MacOS atm GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + # Build and test iOS Demo App PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - build/test_ios_ci.sh + build/test_ios_ci.sh ${ARTIFACTS_DIR_NAME} + + # Upload the test demo app to S3 + upload-demo-ios: + needs: build-demo-ios + runs-on: linux.2xlarge + steps: + - name: Download the artifacts from GitHub + uses: actions/download-artifact@v3 + with: + # The name here needs to match the name of the upload-artifact parameter + name: ios-apps + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the artifacts + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the artifacts to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifact + retention-days: 14 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ + + test-demo-ios: + # Only PR from ExecuTorch itself has permission to access AWS, forked PRs will fail to + # authenticate with the cloud service. 
So, this job will be skipped on the latter + if: ${{ !github.event.pull_request.head.repo.fork }} + needs: upload-demo-ios + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main + with: + device-type: ios + # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS + runner: linux.2xlarge + test-infra-ref: '' + # This is the ARN of ExecuTorch project on AWS + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 + # This is the custom device pool that only includes iOS devices + device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d + # Uploaded to S3 from the previous job + ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.ipa + ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.xctestrun.zip + test-spec: https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml build-frameworks-ios: name: build-frameworks-ios diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7cb2cf69b8..56b70409d7 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -54,3 +54,25 @@ jobs: lint.json || true exit $RC + + android-java-format: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-linter + fetch-depth: 0 + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \ + examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java \ + extension/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java) + if [ -n "$FILES_NEEDS_FORMAT" ]; then + echo "Warning: The following files need formatting. Please use google-java-format." + echo "Use a binary from https://github.com/google/google-java-format/releases/" + echo "For example:" + echo "wget https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64" + echo "chmod +x google-java-format_linux-x86-64" + echo "./google-java-format_linux-x86-64 -i $FILES_NEEDS_FORMAT" + exit 1 + fi diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 4cc57b0c7f..df13140ca9 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -62,4 +62,4 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 591a0328b7..d4b81a2334 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -54,7 +54,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch with the add model on portable backend. 
- PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "add" "${BUILD_TOOL}" "portable" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "add" "${BUILD_TOOL}" "portable" test-models-linux: name: test-models-linux @@ -81,7 +81,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" test-llama-runner-linux: name: test-llama-runner-linux @@ -91,6 +91,13 @@ jobs: dtype: [fp32] build-tool: [buck2, cmake] mode: [portable, xnnpack+custom, xnnpack+custom+qe] + include: + - dtype: bf16 + build-tool: cmake + mode: portable + - dtype: bf16 + build-tool: buck2 + mode: portable fail-fast: false with: runner: linux.2xlarge @@ -112,7 +119,7 @@ jobs: # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh # Test llama2 - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M.pt "${BUILD_TOOL}" "${DTYPE}" "${MODE}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" test-llama-runner-linux-android: name: test-llama-runner-linux-android @@ -187,8 +194,8 @@ jobs: # Test selective build PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" - test-export-llava-linux: - name: test-export-llava-linux + test-llava-runner-linux: + name: test-llava-runner-linux uses: pytorch/test-infra/.github/workflows/linux_job.yml@main strategy: fail-fast: false @@ -205,30 +212,19 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + # install pybind + bash install_requirements.sh --pybind xnnpack + # install Llava requirements bash examples/models/llama2/install_requirements.sh bash examples/models/llava/install_requirements.sh - # run export_llava.sh - python examples/models/llava/export_llava.py --use-sdpa-with-kv-cache --pte-name llava_custom_sdpa.pte - - # verify file exists - if [ ! -f "llava_custom_sdpa.pte" ]; then - echo "llava_custom_sdpa.pte not found!" - exit 1 - fi - - python examples/models/llava/export_llava.py --no-use-sdpa-with-kv-cache --pte-name llava.pte - - # verify file exists - if [ ! -f "llava.pte" ]; then - echo "llava.pte not found!" - exit 1 - fi - # run python unittest python -m unittest examples.models.llava.test.test_llava + # run e2e (export, tokenizer and runner) + PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh + test-quantized-aot-lib-linux: name: test-quantized-aot-lib-linux uses: pytorch/test-infra/.github/workflows/linux_job.yml@main @@ -337,7 +333,7 @@ jobs: size=${arr[4]} # threshold=48120 on devserver with gcc11.4 # todo(lfq): update once binary size is below 50kb. 
- threshold="51768" + threshold="51784" if [[ "$size" -le "$threshold" ]]; then echo "Success $size <= $threshold" else @@ -383,3 +379,38 @@ jobs: # Run pytest with coverage pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test + + + test-llama-runner-qnn-linux: + name: test-llama-runner-qnn-linux + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + strategy: + matrix: + dtype: [fp32] + build-tool: [cmake] + mode: [qnn] + fail-fast: false + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12-android + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 900 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + DTYPE=${{ matrix.dtype }} + BUILD_TOOL=${{ matrix.build-tool }} + MODE=${{ matrix.mode }} + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + + # Setup executorch + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2 + # Install requirements for export_llama + PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh + # Test llama2 + PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 9b28d26048..d7130561fa 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -9,6 +9,7 @@ on: - ciflow/trunk/* pull_request: paths: + - .ci/docker/ci_commit_pins/pytorch.txt - .ci/scripts/** workflow_dispatch: @@ -58,7 +59,7 @@ jobs: # Setup MacOS dependencies as there is no Docker support on MacOS atm PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # Build and test xecutorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" test-custom-ops-macos: name: test-custom-ops-macos @@ -142,7 +143,6 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_flatc_from_source install_executorch install_arm @@ -168,7 +168,6 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_flatc_from_source install_executorch install_arm @@ -224,8 +223,10 @@ jobs: strategy: matrix: dtype: [fp32] - build-tool: [buck2, cmake] mode: [portable, xnnpack+kv+custom, mps, coreml] + include: + - dtype: bf16 + mode: portable fail-fast: false with: runner: macos-m1-stable @@ -236,25 +237,12 @@ jobs: script: | DTYPE=${{ matrix.dtype }} - BUILD_TOOL=${{ matrix.build-tool }} MODE=${{ matrix.mode }} - if [[ "${BUILD_TOOL}" == "buck2" ]]; then - # TODO: Will add more modes that don't support buck2 - if [[ "${MODE}" == "mps" ]]; then - echo "mps doesn't support buck2." - exit 0 - fi - if [[ "${MODE}" == "coreml" ]]; then - echo "coreml doesn't support buck2." 
- exit 0 - fi - fi - bash .ci/scripts/setup-conda.sh # Setup executorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh cmake if [[ "${MODE}" == "mps" ]]; then # Install mps delegate @@ -269,4 +257,180 @@ jobs: # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh # Test llama2 - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M.pt "${BUILD_TOOL}" "${DTYPE}" "${MODE}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}" + + # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. + # test-llava-runner-macos: + # name: test-llava-runner-macos + # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: macos-14-xlarge + # python-version: '3.11' + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 900 + # script: | + # BUILD_TOOL=cmake + + # bash .ci/scripts/setup-conda.sh + # # Setup MacOS dependencies as there is no Docker support on MacOS atm + # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + # # install Llava requirements + # ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh + # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh + + # # run python unittest + # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava + + # # run e2e (export, tokenizer and runner) + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release + + test-qnn-model: + name: test-qnn-model + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + strategy: + matrix: + dtype: [fp32] + model: [dl3, mv3, mv2, ic4, ic3, vit] + fail-fast: false + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12-android + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 900 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn" + + test-coreml-model: + name: test-coreml-model + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + strategy: + fail-fast: false + with: + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + BUILD_TOOL=cmake + BACKEND=coreml + + bash .ci/scripts/setup-conda.sh + + # Setup MacOS dependencies as there is no Docker support on MacOS atm + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh + echo "Finishing installing coreml." 
+ + # Build and test coreml model + MODELS=(mv3 ic4 resnet50 edsr mobilebert w2l) + for MODEL_NAME in "${MODELS[@]}"; do + echo "::group::Exporting coreml model: $MODEL_NAME" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" + echo "::endgroup::" + done + + test-huggingface-transformers: + name: test-huggingface-transformers + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + secrets: inherit + strategy: + matrix: + hf_model_repo: [google/gemma-2b] + fail-fast: false + with: + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.12xlarge + docker-image: executorch-ubuntu-22.04-clang12 + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + echo "::group::Set up ExecuTorch" + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake + + echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a" + rm -rf cmake-out + cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config Release + + echo "Build llama runner" + dir="examples/models/llama2" + cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out/${dir} \ + ${dir} + cmake --build cmake-out/${dir} -j9 --config Release + echo "::endgroup::" + + echo "::group::Set up HuggingFace Dependencies" + if [ -z "$SECRET_EXECUTORCH_HF_TOKEN" ]; then + echo "::error::SECRET_EXECUTORCH_HF_TOKEN is empty. For security reason secrets won't be accessible on forked PRs. Please make sure you submit a non-forked PR." 
+ exit 1 + fi + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install accelerate sentencepiece + # TODO(guangyang): Switch to use released transformers library after all required patches are included + pip install "git+https://github.com/huggingface/transformers.git@6cc4dfe3f1e8d421c6d6351388e06e9b123cbfe1" + pip list + echo "::endgroup::" + + echo "::group::Export to ExecuTorch" + TOKENIZER_FILE=tokenizer.model + TOKENIZER_BIN_FILE=tokenizer.bin + ET_MODEL_NAME=et_model + # Fetch the file using a Python one-liner + DOWNLOADED_TOKENIZER_FILE_PATH=$(python -c " + from huggingface_hub import hf_hub_download + # Download the file from the Hugging Face Hub + downloaded_path = hf_hub_download( + repo_id='${{ matrix.hf_model_repo }}', + filename='${TOKENIZER_FILE}' + ) + print(downloaded_path) + ") + if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH" ]; then + echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH" + python -m extension.llm.tokenizer.tokenizer -t $DOWNLOADED_TOKENIZER_FILE_PATH -o ./${TOKENIZER_BIN_FILE} + ls ./tokenizer.bin + else + echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}." + exit 1 + fi + + python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME} + + cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is" + echo "::endgroup::" diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 189a5cf3aa..dd5fb7bc2e 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -20,6 +20,6 @@ jobs: with: repository: pytorch/executorch stable-branch: viable/strict - requires: '[\"pull\", \"lint\", \"trunk\", \"Build documentation\", \"Android\", \"Apple\"]' + requires: '[\"pull\", \"lint\", \"trunk\", \"Build documentation\", \"^Android$\", \"^Apple$\"]' secret-bot-token: ${{ secrets.UPDATEBOT_TOKEN }} rockset-api-key: ${{ secrets.ROCKSET_API_KEY }} diff --git a/.github/workflows/upload-android-test-specs.yml b/.github/workflows/upload-android-test-specs.yml new file mode 100644 index 0000000000..e9b1054080 --- /dev/null +++ b/.github/workflows/upload-android-test-specs.yml @@ -0,0 +1,94 @@ +name: Upload AWS Device Farm Android test specs + +on: + pull_request: + paths: + - .github/workflows/upload-android-test-specs.yml + - extension/android/benchmark/android-llm-device-farm-test-spec.yml + push: + branches: + - main + paths: + - .github/workflows/upload-android-test-specs.yml + - extension/android/benchmark/android-llm-device-farm-test-spec.yml + +concurrency: + # NB: This concurency group needs to be different than the one used in android-perf, otherwise + # GH complains about concurrency deadlock + group: android-spec-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + upload-android-test-spec-for-validation: + runs-on: linux.2xlarge + steps: + - uses: actions/checkout@v3 + + - name: Upload the spec as a GitHub artifact for validation + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts + retention-days: 1 + if-no-files-found: error + path: 
extension/android/benchmark/android-llm-device-farm-test-spec.yml + + validate-android-test-spec: + needs: upload-android-test-spec-for-validation + uses: ./.github/workflows/android-perf.yml + permissions: + id-token: write + contents: read + with: + # Just use a small model here with a minimal amount of configuration to test the spec + models: stories110M + devices: samsung_galaxy_s22 + delegates: xnnpack + test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/android-llm-device-farm-test-spec.yml + + upload-android-test-spec: + needs: validate-android-test-spec + runs-on: ubuntu-22.04 + timeout-minutes: 15 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-android + aws-region: us-east-1 + + - name: Only push to S3 when running the workflow manually from main branch + if: ${{ github.ref == 'refs/heads/main' }} + shell: bash + run: | + set -eux + echo "UPLOAD_ON_MAIN=1" >> "${GITHUB_ENV}" + + - name: Upload the spec to S3 ossci-android bucket + shell: bash + working-directory: extension/android/benchmark/ + env: + SPEC_FILE: android-llm-device-farm-test-spec.yml + run: | + set -eux + + pip install awscli==1.32.18 + + AWS_CMD="aws s3 cp --dryrun" + if [[ "${UPLOAD_ON_MAIN:-0}" == "1" ]]; then + AWS_CMD="aws s3 cp" + fi + + shasum -a 256 "${SPEC_FILE}" + ${AWS_CMD} "${SPEC_FILE}" s3://ossci-android/executorch/ --acl public-read diff --git a/.github/workflows/upload-apple-test-specs.yml b/.github/workflows/upload-apple-test-specs.yml new file mode 100644 index 0000000000..06d20ef2be --- /dev/null +++ b/.github/workflows/upload-apple-test-specs.yml @@ -0,0 +1,95 @@ +name: Upload AWS Device Farm Apple iOS test specs + +on: + pull_request: + paths: + - .github/workflows/upload-apple-test-specs.yml + - examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + push: + branches: + - main + paths: + - .github/workflows/upload-apple-test-specs.yml + - examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + +concurrency: + # NB: This concurency group needs to be different than the one used in apple-perf, otherwise + # GH complains about concurrency deadlock + group: apple-spec-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + upload-apple-test-spec-for-validation: + runs-on: linux.2xlarge + steps: + - uses: actions/checkout@v3 + + - name: Upload the spec as a GitHub artifact for validation + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts + retention-days: 1 + if-no-files-found: error + path: examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + + validate-apple-test-spec: + needs: upload-apple-test-spec-for-validation + uses: ./.github/workflows/apple-perf.yml + secrets: inherit + permissions: + id-token: write + contents: read + with: + # Just use a small model here with a minimal amount of configuration to test the spec + models: stories110M + devices: apple_iphone_15 + delegates: xnnpack + test_spec: 
https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/default-ios-device-farm-appium-test-spec.yml + + upload-apple-test-spec: + needs: validate-apple-test-spec + runs-on: ubuntu-22.04 + timeout-minutes: 15 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-ios + aws-region: us-east-1 + + - name: Only push to S3 when running the workflow manually from main branch + if: ${{ github.ref == 'refs/heads/main' }} + shell: bash + run: | + set -eux + echo "UPLOAD_ON_MAIN=1" >> "${GITHUB_ENV}" + + - name: Upload the spec to S3 ossci-ios bucket + shell: bash + working-directory: examples/demo-apps/apple_ios + env: + SPEC_FILE: default-ios-device-farm-appium-test-spec.yml + run: | + set -eux + + pip install awscli==1.32.18 + + AWS_CMD="aws s3 cp --dryrun" + if [[ "${UPLOAD_ON_MAIN:-0}" == "1" ]]; then + AWS_CMD="aws s3 cp" + fi + + shasum -a 256 "${SPEC_FILE}" + ${AWS_CMD} "${SPEC_FILE}" s3://ossci-ios/executorch/ --acl public-read diff --git a/.gitignore b/.gitignore index d766479b11..bd3528a4c4 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,4 @@ __pycache__/ *.swp *~ .~lock.* +*.idea diff --git a/.gitmodules b/.gitmodules index 33324b17e2..71ff854bb0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -21,16 +21,13 @@ url = https://github.com/Maratyszcza/FXdiv.git [submodule "backends/xnnpack/third-party/XNNPACK"] path = backends/xnnpack/third-party/XNNPACK - url = https://github.com/digantdesai/XNNPACK.git + url = https://github.com/google/XNNPACK.git [submodule "backends/xnnpack/third-party/cpuinfo"] path = backends/xnnpack/third-party/cpuinfo url = https://github.com/pytorch/cpuinfo.git [submodule "backends/xnnpack/third-party/pthreadpool"] path = backends/xnnpack/third-party/pthreadpool url = https://github.com/Maratyszcza/pthreadpool.git -[submodule "examples/third-party/LLaVA"] - path = examples/third-party/LLaVA - url = https://github.com/haotian-liu/LLaVA.git [submodule "examples/third-party/fbjni"] path = examples/third-party/fbjni url = https://github.com/facebookincubator/fbjni.git diff --git a/.lintrunner.toml b/.lintrunner.toml index c28512c598..7aa15d6563 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -74,6 +74,9 @@ exclude_patterns = [ # NB: Objective-C is not supported 'examples/apple/**', 'examples/demo-apps/apple_ios/**', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', + 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', ] command = [ 'python', @@ -177,6 +180,9 @@ exclude_patterns = [ '**/*.bat', '**/*.jpg', '**/*.jar', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', + 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', ] command = [ 'python', diff --git a/CMakeLists.txt b/CMakeLists.txt index 897fdefd14..1fd1ce7e6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,6 +130,12 @@ if(EXECUTORCH_ENABLE_EVENT_TRACER) add_definitions(-DET_EVENT_TRACER_ENABLED) endif() +option(EXECUTORCH_DO_NOT_USE_CXX11_ABI "Define _GLIBCXX_USE_CXX11_ABI=0 if ON" + OFF +) +if(EXECUTORCH_DO_NOT_USE_CXX11_ABI) + add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +endif() # 
-ffunction-sections -fdata-sections: breaks function and data into sections so # they can be properly gc'd. -s: strip symbol. -fno-exceptions -fno-rtti: # disables exceptions and runtime type. @@ -175,10 +181,16 @@ option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension" OFF ) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "Build the Tensor extension" OFF) + +option(EXECUTORCH_BUILD_EXTENSION_TRAINING "Build the training extension" OFF) + option(EXECUTORCH_BUILD_GTESTS "Build googletest based test binaries" OFF) option(EXECUTORCH_BUILD_MPS "Build the MPS backend" OFF) +option(EXECUTORCH_BUILD_NEURON "Build the backends/mediatek directory" OFF) + option(EXECUTORCH_BUILD_PYBIND "Build the Python Bindings" OFF) option(EXECUTORCH_BUILD_QNN "Build the Qualcomm backend" OFF) @@ -187,7 +199,7 @@ option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF) option(EXECUTORCH_BUILD_KERNELS_QUANTIZED "Build the quantized kernels" OFF) -option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK") +option(EXECUTORCH_BUILD_DEVTOOLS "Build the ExecuTorch Developer Tools") option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF) @@ -218,6 +230,7 @@ cmake_dependent_option( ) if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) + set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON) set(EXECUTORCH_BUILD_KERNELS_CUSTOM ON) endif() @@ -497,7 +510,8 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE) ) target_link_libraries(executorch_no_prim_ops_shared PRIVATE program_schema) if(DL_LIBRARY_EXISTS) - target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) # For dladdr() + # For dladdr() + target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) endif() target_include_directories( executorch_no_prim_ops_shared PUBLIC ${_common_include_directories} @@ -533,17 +547,13 @@ target_link_options_shared_lib(executorch) # operators necessary for the models that will run. # if(BUILD_EXECUTORCH_PORTABLE_OPS) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable) endif() if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized) endif() -if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized) -endif() - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations) # @@ -574,42 +584,46 @@ cmake_dependent_option( EXECUTORCH_BUILD_EXECUTOR_RUNNER "Build the executor_runner executable" ON EXECUTORCH_BUILD_HOST_TARGETS OFF ) -if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) - # Baseline libraries that executor_runner will link against. 
- set(_executor_runner_libs executorch gflags) - if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) - list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib) - elseif(EXECUTORCH_BUILD_CADENCE) - list(APPEND _executor_runner_libs cadence_ops_lib) - else() - list(APPEND _executor_runner_libs portable_ops_lib) - endif() +# Add googletest if any test targets should be built +if(EXECUTORCH_BUILD_GTESTS) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest) +endif() - # Generate lib to register quantized ops - if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - list(APPEND _executor_runner_libs quantized_ops_lib) - endif() +if(EXECUTORCH_BUILD_ARM_BAREMETAL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +endif() - add_executable(executor_runner ${_executor_runner__srcs}) - if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE) - target_link_options(executor_runner PRIVATE "LINKER:--gc-sections") - endif() - target_link_libraries(executor_runner ${_executor_runner_libs}) - target_compile_options(executor_runner PUBLIC ${_common_compile_options}) +if(EXECUTORCH_BUILD_CADENCE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence) endif() -# Add googletest if any test targets should be built -if(EXECUTORCH_BUILD_GTESTS) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest) +if(EXECUTORCH_BUILD_COREML) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml) +endif() + +if(EXECUTORCH_BUILD_MPS) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps) endif() -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_NEURON) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/mediatek) +endif() + +if(EXECUTORCH_BUILD_QNN) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm) +endif() + +if(EXECUTORCH_BUILD_XNNPACK) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) +endif() + +if(EXECUTORCH_BUILD_DEVTOOLS) set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE ) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() if(EXECUTORCH_BUILD_EXTENSION_APPLE) @@ -624,36 +638,23 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) endif() -if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) -endif() - -if(EXECUTORCH_BUILD_XNNPACK) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) -endif() - -if(EXECUTORCH_BUILD_VULKAN) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan) +if(EXECUTORCH_BUILD_EXTENSION_TRAINING) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training) endif() -if(EXECUTORCH_BUILD_QNN) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm) -endif() - -if(EXECUTORCH_BUILD_ARM_BAREMETAL) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) -endif() - -if(EXECUTORCH_BUILD_MPS) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps) +if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) endif() -if(EXECUTORCH_BUILD_COREML) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml) +if(EXECUTORCH_BUILD_EXTENSION_TENSOR) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/tensor) endif() -if(EXECUTORCH_BUILD_CADENCE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence) +if(EXECUTORCH_BUILD_PTHREADPOOL + AND EXECUTORCH_BUILD_CPUINFO + AND 
CMAKE_CXX_STANDARD GREATER_EQUAL 14 +) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) endif() if(EXECUTORCH_BUILD_PYBIND) @@ -663,8 +664,8 @@ if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) endif() - if(NOT EXECUTORCH_BUILD_SDK) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) + if(NOT EXECUTORCH_BUILD_DEVTOOLS) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() # find pytorch lib, to allow pybind to take at::Tensor as input/output @@ -680,7 +681,6 @@ if(EXECUTORCH_BUILD_PYBIND) executorch extension_data_loader portable_ops_lib - quantized_ops_aot_lib util torch ) @@ -699,11 +699,6 @@ if(EXECUTORCH_BUILD_PYBIND) list(APPEND _dep_libs xnnpack_backend XNNPACK) endif() - if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - target_link_options_shared_lib(quantized_ops_lib) - list(APPEND _dep_libs quantized_kernels quantized_ops_lib) - endif() - # compile options for pybind set(_pybind_compile_options -Wno-deprecated-declarations @@ -716,10 +711,8 @@ if(EXECUTORCH_BUILD_PYBIND) ) # util lib add_library( - util - ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/util/read_file.cpp + util ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp ) target_include_directories( util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS} @@ -755,6 +748,27 @@ if(EXECUTORCH_BUILD_PYBIND) # the torch libs are in `site-packages/torch/lib`. BUILD_RPATH "@loader_path/../../../torch/lib" INSTALL_RPATH "@loader_path/../../../torch/lib" + # Assume is the root `site-packages/executorch` + # Need to add /extension/llm/custom_ops for + # libcustom_ops_aot_lib.dylib + BUILD_RPATH "@loader_path/../../extension/llm/custom_ops" + INSTALL_RPATH "@loader_path/../../extension/llm/custom_ops" + # Need to add /kernels/quantized for + # libquantized_ops_aot_lib.dylib + BUILD_RPATH "@loader_path/../../kernels/quantized" + INSTALL_RPATH "@loader_path/../../kernels/quantized" + ) + else() + set_target_properties( + portable_lib + PROPERTIES + # Assume is the root `site-packages/executorch` + # Need to add /extension/llm/custom_ops for + # libcustom_ops_aot_lib + # Need to add /kernels/quantized for + # libquantized_ops_aot_lib + BUILD_RPATH + "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized" ) endif() @@ -765,9 +779,45 @@ endif() if(EXECUTORCH_BUILD_KERNELS_CUSTOM) # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/examples/models/llama2/custom_ops - ) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops) +endif() + +if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized) + target_link_options_shared_lib(quantized_ops_lib) +endif() + +if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) + # Baseline libraries that executor_runner will link against. 
+ set(_executor_runner_libs executorch gflags) + + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib) + elseif(EXECUTORCH_BUILD_CADENCE) + list(APPEND _executor_runner_libs cadence_ops_lib) + else() + list(APPEND _executor_runner_libs portable_ops_lib) + endif() + + # Generate lib to register quantized ops + if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) + list(APPEND _executor_runner_libs quantized_ops_lib) + endif() + + add_executable(executor_runner ${_executor_runner__srcs}) + if(CMAKE_BUILD_TYPE STREQUAL "Release") + if(APPLE) + target_link_options(executor_runner PRIVATE "LINKER:-dead_strip") + else() + target_link_options(executor_runner PRIVATE "LINKER:--gc-sections") + endif() + endif() + target_link_libraries(executor_runner ${_executor_runner_libs}) + target_compile_options(executor_runner PUBLIC ${_common_compile_options}) +endif() + +if(EXECUTORCH_BUILD_VULKAN) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan) endif() # Print all summary diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a0c37e3b3c..d434c1fe19 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,6 +23,8 @@ We actively welcome your pull requests (PRs). See the [testing section](#testing) for more information. 1. If you've changed APIs or added a new tool or feature, [update the documentation](#updating-documentation). +1. If you added an experimental API or deprecated an existing API, follow the + [API Life Cycle and Deprecation Policy](/docs/source/api-life-cycle.md). 1. Make sure your code follows the [style guides](#coding-style) and passes the [lint checks](#lintrunner). 1. If you haven't already, complete the [Contributor License Agreement ("CLA")](#contributor-license-agreement-cla). @@ -129,9 +131,7 @@ for detailed advice. #### C++ language version -**C++11.** - -NOTE: The code does not yet fully conform to this, and some files require C++17. +**C++17.** Rationale: This is a compromise between being compatible with older, proprietary toolchains, and having access to relatively modern C++ features. diff --git a/LICENSE b/LICENSE index 10b13cbfe7..6c1f7a760a 100644 --- a/LICENSE +++ b/LICENSE @@ -6,6 +6,7 @@ Copyright (c) Meta Platforms, Inc. and affiliates. Copyright 2023 Arm Limited and/or its affiliates. Copyright (c) Qualcomm Innovation Center, Inc. Copyright (c) 2023 Apple Inc. +Copyright (c) 2024 MediaTek Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index c4e6e0caf7..0e78f4da35 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ Key value propositions of ExecuTorch are: - **Portability:** Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and SDK from - PyTorch model authoring and conversion, to debugging and deployment to a wide - variety of platforms. +- **Productivity:** Enabling developers to use the same toolchains and Developer + Tools from PyTorch model authoring and conversion, to debugging and deployment + to a wide variety of platforms. - **Performance:** Providing end users with a seamless and high-performance experience due to a lightweight runtime and utilizing full hardware capabilities such as CPUs, NPUs, and DSPs. @@ -22,6 +22,8 @@ please visit our documentation website [for the latest release](https://pytorch. 
Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin. +Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch. + ## Feedback We welcome any feedback, suggestions, and bug reports from the community to help @@ -93,7 +95,7 @@ tools. ├── schema # ExecuTorch PTE file format flatbuffer schemas. ├── scripts # Utility scripts for size management, dependency management, etc. -├── sdk # Model profiling, debugging, and introspection. +├── devtools # Model profiling, debugging, and introspection. ├── shim # Compatibility layer between OSS and Internal builds ├── test # Broad scoped end-to-end tests. ├── third-party # Third-party dependencies. diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index d670b60e6c..27e09b3f58 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -13,6 +13,11 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) endif() +if(EXECUTORCH_BUILD_DEVTOOLS) + # protobuf requires frtti + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti") +endif() + option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF) # inmemoryfs sources @@ -59,6 +64,7 @@ set(SDK_SOURCES runtime/sdk/ETCoreMLModelAnalyzer.mm runtime/sdk/ETCoreMLModelStructurePath.mm runtime/sdk/ETCoreMLOperationProfilingInfo.mm + runtime/sdk/ETCoreMLModelDebugInfo.mm runtime/sdk/ETCoreMLModelDebugger.mm runtime/sdk/ETCoreMLModelProfiler.mm runtime/sdk/ETCoreMLPair.mm @@ -130,7 +136,7 @@ target_include_directories( target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..) target_link_libraries(coremldelegate PRIVATE executorch_no_prim_ops) -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_DEVTOOLS) target_sources(coremldelegate PRIVATE ${SDK_SOURCES} ${PROTOBUF_SOURCES}) target_include_directories( coremldelegate @@ -141,6 +147,8 @@ if(EXECUTORCH_BUILD_SDK) add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/third-party/coremltools/deps/protobuf/cmake ) + + target_link_options_shared_lib(libprotobuf-lite) target_link_libraries(coremldelegate PRIVATE libprotobuf-lite) endif() @@ -166,7 +174,7 @@ endif() target_compile_options(coremldelegate PRIVATE "-fobjc-arc") target_compile_options(coremldelegate PRIVATE "-fno-exceptions") -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_DEVTOOLS) target_compile_options( executorch_no_prim_ops PUBLIC -DET_EVENT_TRACER_ENABLED ) diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py index 375fdf406b..5084405c46 100644 --- a/backends/apple/coreml/compiler/coreml_preprocess.py +++ b/backends/apple/coreml/compiler/coreml_preprocess.py @@ -3,6 +3,7 @@ # CoreML backend for delegating a EdgeProgram to CoreML. 
import json +import logging import shutil import uuid @@ -14,6 +15,7 @@ from typing import Any, Dict, final, List, Optional, Tuple import coremltools as ct +import coremltools.optimize as cto import executorchcoreml from executorch.exir.backend.backend_details import ( @@ -23,12 +25,16 @@ ) from executorch.exir.backend.compile_spec_schema import CompileSpec +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + class COMPILE_SPEC_KEYS(Enum): COMPUTE_UNITS = "compute_units" MODEL_TYPE = "model_type" MIN_DEPLOYMENT_TARGET = "min_deployment_target" MODEL_COMPUTE_PRECISION = "model_compute_precision" + OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config" class MODEL_PATHS(Enum): @@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec( compute_unit.name.lower().encode("utf-8"), ) + @staticmethod + def generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config: Dict, + ) -> CompileSpec: + """ + Returns the compile spec representing the model post conversion quantization, + which is a dict that will construct cto.coreml.OpLinearQuantizerConfig + """ + str_representation = json.dumps(op_linear_quantizer_config) + byte_representation = str_representation.encode("utf-8") + return CompileSpec( + COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value, + byte_representation, + ) + + @staticmethod + def op_linear_quantizer_config_from_compile_specs( + compile_specs: List[CompileSpec], + ) -> cto.coreml.OpLinearQuantizerConfig: + """ + Returns the model's post conversion quantization by parsing the list of compile specs. + """ + for compile_spec in compile_specs: + if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value: + config_dict_str = compile_spec.value.decode("utf-8") + config_dict = json.loads(config_dict_str) + config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict) + return config + + return None + @staticmethod def generate_compile_specs( compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL, minimum_deployment_target: ct.target = ct.target.iOS15, compute_precision: ct.precision = ct.precision.FLOAT16, model_type: MODEL_TYPE = MODEL_TYPE.MODEL, + op_linear_quantizer_config: Optional[Dict] = None, ) -> List[CompileSpec]: """ Returns the list of compile specs that's used by CoreMLBackend to lower the module. 
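# --- Illustrative usage sketch (not part of this patch) ---
# The new `op_linear_quantizer_config` compile spec added above lets callers request
# post-conversion weight quantization when lowering to Core ML. The sketch below shows
# how it might be wired together with the existing `generate_compile_specs` helper.
# The import path and the dict keys accepted by `cto.coreml.OpLinearQuantizerConfig._from_dict`
# depend on the installed executorch/coremltools versions, so treat them as assumptions.
import coremltools as ct

from executorch.backends.apple.coreml.compiler import CoreMLBackend

compile_specs = CoreMLBackend.generate_compile_specs(
    compute_unit=ct.ComputeUnit.ALL,
    minimum_deployment_target=ct.target.iOS17,
    compute_precision=ct.precision.FLOAT16,
    # Assumed keys: 4-bit symmetric per-block weight quantization.
    op_linear_quantizer_config={
        "mode": "linear_symmetric",
        "dtype": "int4",
        "granularity": "per_block",
        "block_size": 32,
    },
)
# The resulting `compile_specs` list would then be handed to the Core ML
# partitioner/delegate in the usual way; embeddings are skipped by the
# backend itself via the `{"gather": None}` op_type_configs shown below.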
@@ -192,6 +230,12 @@ def generate_compile_specs( CoreMLBackend.generate_compute_precision_compile_spec(compute_precision) ) compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type)) + if op_linear_quantizer_config is not None: + compile_specs.append( + CoreMLBackend.generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config + ) + ) return compile_specs @@ -368,18 +412,18 @@ def preprocess( compile_specs, ) ) - model_compute_precision: ct.precision = ( CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs) ) - minimum_deployment_target: ct.target = ( CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs) ) - compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs( compile_specs ) + op_linear_quantizer_config = ( + CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs) + ) mlmodel = ct.convert( model=edge_program, @@ -392,4 +436,15 @@ def preprocess( compute_units=compute_units, ) + if op_linear_quantizer_config is not None: + logger.warning( + "Core ML Backend op_linear_quantizer_config API is experimental" + ) + config = cto.coreml.OptimizationConfig( + global_config=op_linear_quantizer_config, + # skip embedding + op_type_configs={"gather": None}, + ) + mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config) + return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type) diff --git a/backends/apple/coreml/partition/coreml_partitioner.py b/backends/apple/coreml/partition/coreml_partitioner.py index ecf6d44b19..c0b6663f72 100644 --- a/backends/apple/coreml/partition/coreml_partitioner.py +++ b/backends/apple/coreml/partition/coreml_partitioner.py @@ -17,7 +17,7 @@ Partitioner, PartitionResult, ) -from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer from torch.export.exported_program import ExportedProgram from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner from torch.fx.passes.operator_support import OperatorSupportBase @@ -61,6 +61,7 @@ def __init__( self, skip_ops_for_coreml_delegation: Optional[List[str]] = None, compile_specs: Optional[List[CompileSpec]] = None, + take_over_mutable_buffer: Optional[bool] = True, ) -> None: if skip_ops_for_coreml_delegation is None: skip_ops_for_coreml_delegation = [] @@ -69,6 +70,7 @@ def __init__( backend_id=CoreMLBackend.__name__, compile_specs=compile_specs if compile_specs is not None else [], ) + self.take_over_mutable_buffer = take_over_mutable_buffer def partition(self, exported_program: ExportedProgram) -> PartitionResult: # Run the CapabilityBasedPartitioner to return the largest possible @@ -89,6 +91,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: partition_tags[tag] = self.delegation_spec tag_constant_data(exported_program) + if self.take_over_mutable_buffer: + logger.info( + "Core ML partitioner will take over torch mutable buffer as Core ML state, " + "so if your model contains mutable buffer, " + "then you will need MacOS15+/iOS18+ to execute. 
" + "If you want your mutable buffer model to be compatible with older OS, " + "then please set `take_over_mutable_buffer=False`" + ) + tag_mutated_buffer(exported_program) return PartitionResult( tagged_exported_program=exported_program, partition_tags=partition_tags diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm index 57316e2801..226307f3c8 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm @@ -29,9 +29,10 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { if (self.ignoreOutputBackings) { predictionOptions.outputBackings = @{}; } - id outputs = [self.model.mlModel predictionFromFeatures:inputs - options:predictionOptions - error:error]; + + id outputs = [self.model predictionFromFeatures:inputs + options:predictionOptions + error:error]; if (!outputs) { return nil; } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h index 9bf3183e65..5802659346 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h @@ -37,15 +37,12 @@ __attribute__((objc_subclassing_restricted)) orderedOutputNames:(NSOrderedSet*)orderedOutputNames error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; -- (nullable NSArray*)prepareInputs:(const std::vector&)inputs - error:(NSError* __autoreleasing*)error; - -- (nullable NSArray*)prepareOutputBackings:(const std::vector&)outputs - error:(NSError* __autoreleasing*)error; - /// The underlying MLModel. @property (strong, readonly, nonatomic) MLModel* mlModel; +/// The model state. +@property (strong, readonly, nonatomic, nullable) id state; + /// The asset from which the model is loaded. @property (strong, readonly, nonatomic) ETCoreMLAsset* asset; @@ -58,6 +55,19 @@ __attribute__((objc_subclassing_restricted)) /// The ordered output names of the model. 
@property (copy, readonly, nonatomic) NSOrderedSet* orderedOutputNames; + +- (nullable id)predictionFromFeatures:(id)input + options:(MLPredictionOptions*)options + error:(NSError* __autoreleasing*)error; + +- (nullable NSArray*)prepareInputs:(const std::vector&)inputs + error:(NSError* __autoreleasing*)error; + +- (nullable NSArray*)prepareOutputBackings:(const std::vector&)outputs + error:(NSError* __autoreleasing*)error; + +- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error; + @end NS_ASSUME_NONNULL_END diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm index ee7218bd27..6b39ae5f92 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm @@ -7,10 +7,12 @@ #import -#import +#import "ETCoreMLAsset.h" +#import "ETCoreMLLogging.h" +#import "multiarray.h" +#import "objc_array_util.h" +#import "MLModel_Prewarm.h" #import -#import -#import #import #pragma mark - ETCoreMLMultiArrayDescriptor @@ -155,6 +157,19 @@ size_t get_number_of_bytes(MLMultiArrayDataType data_type) { return get_multi_array_constraints_by_name(description.outputDescriptionsByName); } +#if MODEL_STATE_IS_SUPPORTED +API_AVAILABLE(macos(15.0), ios(18.0), tvos(18.0), watchos(11.0)) +void reset_state_for_feature_name(NSString *feature_name, MLState *state) { + [state getMultiArrayForStateNamed:feature_name handler:^(MLMultiArray *buffer) { + [buffer getMutableBytesWithHandler:^(void *mutableBytes, NSInteger size, NSArray * __unused strides) { + uint8_t *start = reinterpret_cast(mutableBytes); + uint8_t *end = start + size; + std::fill(start, end, uint8_t(0)); + }]; + }]; +} +#endif + } #pragma mark - ETCoreMLModel @@ -194,6 +209,11 @@ - (nullable instancetype)initWithAsset:(ETCoreMLAsset *)asset _cache = [[NSCache alloc] init]; _inputConstraintsByName = get_multi_array_input_constraints_by_name(mlModel.modelDescription); _outputConstraintsByName = get_multi_array_output_constraints_by_name(mlModel.modelDescription); +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + _state = mlModel.modelDescription.stateDescriptionsByName.count > 0 ? 
[_mlModel newState] : nil; + } +#endif } return self; @@ -272,4 +292,52 @@ MultiArray buffer(mutableBytes, MultiArray::MemoryLayout(to_multiarray_data_type } +- (nullable id)predictionFromFeatures:(id)input + options:(MLPredictionOptions *)options + error:(NSError **)error { +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + if (self.state != nil) { + return [self.mlModel predictionFromFeatures:input + usingState:(MLState *)self.state + options:options + error:error]; + } + } +#endif + + id result = [self.mlModel predictionFromFeatures:input + options:options + error:error]; + + return result; +} + +- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error { + NSError *localError = nil; + BOOL result = [self.mlModel prewarmUsingState:self.state error:error]; + if (!result) { + ETCoreMLLogError(localError, + "%@: Failed to prewarm model with identifier = %@", + NSStringFromClass(self.class), + self.identifier); + } + +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + NSDictionary *stateDescriptions = self.mlModel.modelDescription.stateDescriptionsByName; + [stateDescriptions enumerateKeysAndObjectsUsingBlock:^(NSString *featureName, MLFeatureDescription * __unused obj, BOOL * __unused stop) { + reset_state_for_feature_name(featureName, (MLState *) self.state); + }]; + } +#endif + + + if (error) { + *error = localError; + } + + return result; +} + @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index e7846256e6..cd0fbc86f9 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -5,36 +5,37 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import -#import -#import -#import -#import -#import -#import -#import -#import -#import -#import -#import +#import "ETCoreMLAsset.h" +#import "ETCoreMLAssetManager.h" +#import "ETCoreMLDefaultModelExecutor.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModel.h" +#import "ETCoreMLModelCompiler.h" +#import "ETCoreMLModelExecutor.h" +#import "ETCoreMLModelLoader.h" +#import "ETCoreMLModelManager.h" +#import "ETCoreMLStrings.h" +#import "MLModel_Prewarm.h" +#import "MLMultiArray_Copy.h" #import -#import +#import "inmemory_filesystem_utils.hpp" #import #import -#import -#import -#import +#import "model_metadata.h" +#import "multiarray.h" +#import "objc_array_util.h" #import #import -#import +#import "serde_json.h" #import #import #import #if ET_EVENT_TRACER_ENABLED -#import -#import -#import +#import "ETCoreMLModelAnalyzer.h" +#import "ETCoreMLModelDebugInfo.h" +#import "ETCoreMLModelStructurePath.h" +#import "objc_safe_cast.h" #endif namespace { @@ -317,31 +318,14 @@ void add_compute_unit(std::string& identifier, MLComputeUnits compute_units) { return [[ETCoreMLAsset alloc] initWithBackingAsset:std::move(backingAsset.value())]; } -NSDictionary * _Nullable get_operation_path_to_symbol_name_map(const inmemoryfs::InMemoryFileSystem *inMemoryFS, - NSError * __autoreleasing *error) { +ETCoreMLModelDebugInfo * _Nullable get_model_debug_info(const inmemoryfs::InMemoryFileSystem *inMemoryFS, + NSError * __autoreleasing *error) { NSData *file_data = get_file_data(inMemoryFS, ETCoreMLStrings.debugInfoFileRelativePath); if (!file_data) { return nil; } - - id object = [NSJSONSerialization JSONObjectWithData:file_data options:(NSJSONReadingOptions)0 error:error]; - if (!object) { - return nil; - } - - NSDictionary *json_dict = SAFE_CAST(object, NSDictionary); - NSMutableDictionary *result = [NSMutableDictionary dictionaryWithCapacity:json_dict.count]; - NSDictionary *> *debug_symbol_to_operation_path_map = SAFE_CAST(json_dict[ETCoreMLStrings.debugSymbolToOperationPathKeyName], NSDictionary); - for (NSString *symbol_name in debug_symbol_to_operation_path_map) { - NSArray *> *components = SAFE_CAST(debug_symbol_to_operation_path_map[symbol_name], NSArray); - if (components.count == 0) { - continue; - } - ETCoreMLModelStructurePath *path = [[ETCoreMLModelStructurePath alloc] initWithComponents:components]; - result[path] = symbol_name; - } - - return result; + + return [ETCoreMLModelDebugInfo modelDebugInfoFromData:file_data error:error]; } #endif @@ -490,16 +474,16 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier } NSError *localError = nil; - NSDictionary *operation_path_to_symbol_name_map = get_operation_path_to_symbol_name_map(inMemoryFS, - &localError); + ETCoreMLModelDebugInfo *debug_info = get_model_debug_info(inMemoryFS, &localError); if (localError) { ETCoreMLLogError(localError, "Failed to parse debug info file"); } + return [[ETCoreMLModelAnalyzer alloc] initWithCompiledModelAsset:compiledModelAsset modelAsset:modelAsset + modelDebugInfo:debug_info metadata:metadata - operationPathToDebugSymbolMap:operation_path_to_symbol_name_map configuration:configuration assetManager:self.assetManager error:error]; @@ -614,21 +598,8 @@ - (BOOL)prewarmModelWithHandle:(ModelHandle *)handle if (!model) { return NO; } - - NSError *localError = nil; - BOOL result = [model.mlModel prewarmAndReturnError:&localError]; - if (!result) { - ETCoreMLLogError(localError, - "%@: Failed to prewarm model with identifier = %@", - NSStringFromClass(self.assetManager.class), - model.identifier); - 
} - - if (error) { - *error = localError; - } - - return result; + + return [model prewarmAndReturnError:error]; } - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount { @@ -655,7 +626,7 @@ - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount { NSError *prewarmError = nil; if (![asset prewarmAndReturnError:&prewarmError]) { - ETCoreMLLogError(localError, + ETCoreMLLogError(prewarmError, "%@: Failed to prewarm asset with identifier = %@", NSStringFromClass(strongSelf.assetManager.class), asset.identifier); @@ -698,16 +669,15 @@ - (void)addPrewarmedAsset:(ETCoreMLAsset *)asset { error:&localError]; // Try without output backings. if (!modelOutputs && predictionOptions.outputBackings.count > 0) { - localError = nil; executor.ignoreOutputBackings = YES; + localError = nil; + modelOutputs = [executor executeModelWithInputs:inputFeatures + predictionOptions:predictionOptions + loggingOptions:loggingOptions + eventLogger:eventLogger + error:&localError]; } - - modelOutputs = [executor executeModelWithInputs:inputFeatures - predictionOptions:predictionOptions - loggingOptions:loggingOptions - eventLogger:eventLogger - error:&localError]; - + if (error) { *error = localError; } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLStrings.h b/backends/apple/coreml/runtime/delegate/ETCoreMLStrings.h index 74383769c4..7d0e010180 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLStrings.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLStrings.h @@ -66,6 +66,8 @@ NS_ASSUME_NONNULL_BEGIN @property (class, copy, readonly, nonatomic, nullable) NSString* debugInfoFileRelativePath; /// The debug symbol to operation path key name. @property (class, copy, readonly, nonatomic, nullable) NSString* debugSymbolToOperationPathKeyName; +/// The debug symbol to handles key name. +@property (class, copy, readonly, nonatomic, nullable) NSString* debugSymbolToHandlesKeyName; @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLStrings.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLStrings.mm index e8eb2d3cff..fb66f7b7c0 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLStrings.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLStrings.mm @@ -95,6 +95,11 @@ + (NSString *)debugSymbolToOperationPathKeyName { return ETCoreMLDebugSymbolToOperationPathKeyName; } ++ (NSString *)debugSymbolToHandlesKeyName { + static NSString * const ETCoreMLDebugSymbolToHandlesKeyName = @"debugSymbolToHandles"; + return ETCoreMLDebugSymbolToHandlesKeyName; +} + + (nullable NSString *)assetsDirectoryPath { static dispatch_once_t onceToken; static NSString *result = nil; diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h index c066608b89..6caf99507d 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h @@ -8,6 +8,9 @@ #import +#if !defined(MODEL_STATE_IS_SUPPORTED) && __has_include() +#define MODEL_STATE_IS_SUPPORTED 1 +#endif NS_ASSUME_NONNULL_BEGIN @@ -15,9 +18,10 @@ NS_ASSUME_NONNULL_BEGIN /// Pre-warms the model by running a prediction with zeroed-out inputs. /// +/// @param state The model state. /// @param error On failure, error is filled with the failure information. /// @retval `YES` if the prediction succeeded otherwise `NO`. 
-- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error; +- (BOOL)prewarmUsingState:(nullable id)state error:(NSError* __autoreleasing*)error; @end diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm index 71ce967ac3..d6f59666cf 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm @@ -71,16 +71,28 @@ + (MLMultiArray *)zeroedMultiArrayWithShape:(NSArray *)shape @implementation MLModel (Prewarm) -- (BOOL)prewarmAndReturnError:(NSError * __autoreleasing *)error { +- (BOOL)prewarmUsingState:(nullable id)state error:(NSError * __autoreleasing *)error { @autoreleasepool { id inputs = ::get_zeroed_inputs(self, error); if (!inputs) { return NO; } - - id outputs = [self predictionFromFeatures:inputs error:error]; + + + id outputs = nil; + if (state != nil) { +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + outputs = [self predictionFromFeatures:inputs usingState:(MLState *)state error:error]; + return outputs != nil; + } +#endif + } + + outputs = [self predictionFromFeatures:inputs error:error]; return outputs != nil; } } + @end diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.mm b/backends/apple/coreml/runtime/delegate/backend_delegate.mm index f6eb7a83fd..efa3dd2472 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.mm @@ -157,7 +157,7 @@ - (BOOL)_loadAndReturnError:(NSError * _Nullable __autoreleasing *)error { if (self.config.should_prewarm_asset) { [modelManager prewarmRecentlyUsedAssetsWithMaxCount:1]; } - + return YES; } @@ -188,9 +188,14 @@ - (ModelHandle*)loadModelFromAOTData:(NSData*)data return nil; } - return [self.impl loadModelFromAOTData:data - configuration:configuration - error:error]; + auto handle = [self.impl loadModelFromAOTData:data + configuration:configuration + error:error]; + if ((handle != NULL) && self.config.should_prewarm_model) { + [self.impl prewarmModelWithHandle:handle error:nil]; + } + + return handle; } - (BOOL)executeModelWithHandle:(ModelHandle*)handle diff --git a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm index ab596575a2..fc3e5a47f9 100644 --- a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm @@ -124,7 +124,8 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) { auto event_tracer = context.event_tracer(); if (event_tracer) { options.log_profiling_info = true; - options.log_intermediate_tensors = event_tracer->intermediate_outputs_logging_status(); + auto debug_level = event_tracer->event_tracer_debug_level(); + options.log_intermediate_tensors = (debug_level >= EventTracerDebugLogLevel::kIntermediateOutputs); } return options; diff --git a/backends/apple/coreml/runtime/delegate/model_event_logger.h b/backends/apple/coreml/runtime/delegate/model_event_logger.h index c78ebcaac1..91fe3b18cf 100644 --- a/backends/apple/coreml/runtime/delegate/model_event_logger.h +++ b/backends/apple/coreml/runtime/delegate/model_event_logger.h @@ -34,8 +34,8 @@ class ModelEventLogger { /// /// @param op_path_to_value_map A dictionary with the operation path as the key and the operation's value as the /// value. 
- /// @param op_path_to_debug_symbol_name_map A dictionary with the operation path as the key and the symbol name as - /// the value. The symbol name is the delegate handle. + /// @param op_path_to_debug_symbol_name_map A dictionary with the operation path as the key and the debug symbol + /// name as the value. virtual void log_intermediate_tensors( NSDictionary* op_path_to_value_map, NSDictionary* op_path_to_debug_symbol_name_map) const noexcept = 0; diff --git a/backends/apple/coreml/runtime/include/coreml_backend/delegate.h b/backends/apple/coreml/runtime/include/coreml_backend/delegate.h index a11d41bf7f..1943e0f05b 100644 --- a/backends/apple/coreml/runtime/include/coreml_backend/delegate.h +++ b/backends/apple/coreml/runtime/include/coreml_backend/delegate.h @@ -20,7 +20,7 @@ class BackendDelegate; namespace torch { namespace executor { -class CoreMLBackendDelegate final : public PyTorchBackendInterface { +class CoreMLBackendDelegate final : public ::executorch::runtime::BackendInterface { public: CoreMLBackendDelegate() noexcept; ~CoreMLBackendDelegate() = default; diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h index 4048dae5fe..a8efe2171c 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.h @@ -7,7 +7,7 @@ #import -#import +#import "ETCoreMLModelExecutor.h" namespace executorchcoreml { struct ModelMetadata; @@ -15,6 +15,7 @@ struct ModelMetadata; @class ETCoreMLAsset; @class ETCoreMLAssetManager; +@class ETCoreMLModelDebugInfo; @class ETCoreMLModelStructurePath; @protocol ETCoreMLModelEventLogger; @@ -32,16 +33,15 @@ __attribute__((objc_subclassing_restricted)) /// /// @param compiledModelAsset The compiled model asset (mlmodelc). /// @param modelAsset The model asset (mlpackage). +/// @param modelDebugInfo The model debug info. /// @param metadata The model metadata. -/// @param operationPathToDebugSymbolMap The operation path to debug symbol map. /// @param configuration The model configuration. /// @param assetManager The asset manager used to manage storage of compiled models. /// @param error On failure, error is filled with the failure information. - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset*)compiledModelAsset modelAsset:(nullable ETCoreMLAsset*)modelAsset + modelDebugInfo:(nullable ETCoreMLModelDebugInfo*)modelDebugInfo metadata:(const executorchcoreml::ModelMetadata&)metadata - operationPathToDebugSymbolMap: - (nullable NSDictionary*)operationPathToDebugSymbolMap configuration:(MLModelConfiguration*)configuration assetManager:(ETCoreMLAssetManager*)assetManager error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm index 57212445e5..988b5d808a 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm @@ -5,22 +5,24 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. 
-#import -#import -#import -#import -#import -#import -#import -#import -#import -#import -#import -#import -#import -#import -#import -#import +#import "ETCoreMLModelAnalyzer.h" + +#import "ETCoreMLAsset.h" +#import "ETCoreMLAssetManager.h" +#import "ETCoreMLDefaultModelExecutor.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModel.h" +#import "ETCoreMLModelLoader.h" +#import "ETCoreMLModelStructurePath.h" +#import "ETCoreMLModelDebugInfo.h" +#import "ETCoreMLModelDebugger.h" +#import "ETCoreMLModelProfiler.h" +#import "ETCoreMLStrings.h" +#import "model_logging_options.h" +#import "model_event_logger.h" +#import "model_metadata.h" +#import "model_package_info.h" +#import "objc_safe_cast.h" namespace { using namespace executorchcoreml; @@ -34,7 +36,7 @@ @interface ETCoreMLModelAnalyzer () @property (strong, nonatomic, nullable) ETCoreMLModelProfiler *profiler; @property (strong, nonatomic, nullable) ETCoreMLModelDebugger *debugger; @property (strong, nonatomic, nullable) id executor; -@property (readonly, copy, nonatomic, nullable) NSDictionary *operationPathToDebugSymbolMap; +@property (readonly, copy, nonatomic, nullable) ETCoreMLModelDebugInfo *modelDebugInfo; @property (readonly, strong, nonatomic) MLModelConfiguration *configuration; @end @@ -43,8 +45,8 @@ @implementation ETCoreMLModelAnalyzer - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledModelAsset modelAsset:(nullable ETCoreMLAsset *)modelAsset + modelDebugInfo:(nullable ETCoreMLModelDebugInfo *)modelDebugInfo metadata:(const executorchcoreml::ModelMetadata&)metadata - operationPathToDebugSymbolMap:(nullable NSDictionary *)operationPathToDebugSymbolMap configuration:(MLModelConfiguration *)configuration assetManager:(ETCoreMLAssetManager *)assetManager error:(NSError * __autoreleasing *)error { @@ -72,9 +74,9 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod if (self) { _model = model; _modelAsset = modelAsset; + _modelDebugInfo = modelDebugInfo; _assetManager = assetManager; _configuration = configuration; - _operationPathToDebugSymbolMap = operationPathToDebugSymbolMap; _executor = [[ETCoreMLDefaultModelExecutor alloc] initWithModel:model]; } @@ -86,10 +88,9 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod eventLogger:(const executorchcoreml::ModelEventLogger *)eventLogger error:(NSError * __autoreleasing *)error { if (self.profiler == nil) { - ETCoreMLModelProfiler *profiler = [[ETCoreMLModelProfiler alloc] initWithCompiledModelAsset:self.model.asset - outputNames:self.model.orderedOutputNames - configuration:self.configuration - error:error]; + ETCoreMLModelProfiler *profiler = [[ETCoreMLModelProfiler alloc] initWithModel:self.model + configuration:self.configuration + error:error]; self.profiler = profiler; } @@ -113,7 +114,7 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod return nil; } - eventLogger->log_profiling_infos(profilingInfos, self.operationPathToDebugSymbolMap); + eventLogger->log_profiling_infos(profilingInfos, self.modelDebugInfo.pathToDebugSymbolMap); return modelOutputs; } @@ -131,6 +132,7 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod if (!self.debugger) { self.debugger = [[ETCoreMLModelDebugger alloc] initWithModelAsset:self.modelAsset + modelDebugInfo:self.modelDebugInfo outputNames:self.model.orderedOutputNames configuration:self.configuration assetManager:self.assetManager @@ -143,6 +145,7 @@ - (nullable 
instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod NSArray *modelOutputs = nil; NSArray *operationPaths = self.debugger.operationPaths; + NSDictionary *operationPathToDebugSymbolMap = self.debugger.operationPathToDebugSymbolMap; NSInteger n = operationPaths.count/MAX_MODEL_OUTPUTS_COUNT + (operationPaths.count % MAX_MODEL_OUTPUTS_COUNT == 0 ? 0 : 1); for (NSInteger i = 0; i < n; i++) { @autoreleasepool { @@ -157,7 +160,7 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod } if (outputs.count > 0) { - eventLogger->log_intermediate_tensors(outputs, self.operationPathToDebugSymbolMap); + eventLogger->log_intermediate_tensors(outputs, operationPathToDebugSymbolMap); } } } diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugInfo.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugInfo.h new file mode 100644 index 0000000000..ab8fdaaedd --- /dev/null +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugInfo.h @@ -0,0 +1,47 @@ +// +// ETCoreMLModelDebugInfo.h +// +// Copyright © 2024 Apple Inc. All rights reserved. +// +// Please refer to the license found in the LICENSE file in the root directory of the source tree. + +#import + +@class ETCoreMLModelStructurePath; + +NS_ASSUME_NONNULL_BEGIN + +/// A class representing the profiling info of an operation. +__attribute__((objc_subclassing_restricted)) +@interface ETCoreMLModelDebugInfo : NSObject + +- (instancetype)init NS_UNAVAILABLE; + ++ (instancetype)new NS_UNAVAILABLE; + + +/// Constructs an `ETCoreMLModelDebugInfo` instance. +/// +/// @param pathToDebugSymbolMap Operation path to debug symbol map. +/// @param pathToDebugHandlesMap Operation path to debug handles map. +- (instancetype)initWithPathToDebugSymbolMap:(NSDictionary*)pathToDebugSymbolMap + pathToDebugHandlesMap: + (NSDictionary*>*)pathToDebugHandlesMap + NS_DESIGNATED_INITIALIZER; + +/// Constructs an `ETCoreMLModelDebugInfo` instance. +/// +/// @param data The json data. +/// @param error On failure, error is filled with the failure information. ++ (nullable instancetype)modelDebugInfoFromData:(NSData*)data error:(NSError* __autoreleasing*)error; + +/// Operation path to debug symbol map. +@property (readonly, strong, nonatomic) NSDictionary* pathToDebugSymbolMap; + +/// Operation path to debug handles map. +@property (readonly, strong, nonatomic) + NSDictionary*>* pathToDebugHandlesMap; + +@end + +NS_ASSUME_NONNULL_END diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugInfo.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugInfo.mm new file mode 100644 index 0000000000..4e14d2502d --- /dev/null +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugInfo.mm @@ -0,0 +1,71 @@ +// +// ETCoreMLModelDebugInfo.mm +// +// Copyright © 2024 Apple Inc. All rights reserved. +// +// Please refer to the license found in the LICENSE file in the root directory of the source tree. 
+ + +#import "ETCoreMLModelDebugInfo.h" + +#import "ETCoreMLStrings.h" +#import "ETCoreMLModelStructurePath.h" +#import "objc_safe_cast.h" + + +@implementation ETCoreMLModelDebugInfo + +- (instancetype)initWithPathToDebugSymbolMap:(NSDictionary *)pathToDebugSymbolMap + pathToDebugHandlesMap:(NSDictionary *> *)pathToDebugHandlesMap { + self = [super init]; + if (self) { + _pathToDebugSymbolMap = [pathToDebugSymbolMap copy]; + _pathToDebugHandlesMap = [pathToDebugHandlesMap copy]; + } + + return self; +} + ++ (nullable instancetype)modelDebugInfoFromData:(NSData *)data error:(NSError * __autoreleasing *)error { + id object = [NSJSONSerialization JSONObjectWithData:data options:(NSJSONReadingOptions)0 error:error]; + if (!object) { + return nil; + } + + NSDictionary *jsonDict = SAFE_CAST(object, NSDictionary); + // Construct operation path to debug symbol map. + NSDictionary *> *debugSymbolToPathMap = SAFE_CAST(jsonDict[ETCoreMLStrings.debugSymbolToOperationPathKeyName], NSDictionary); + NSMutableDictionary *pathToDebugSymbolMap = [NSMutableDictionary dictionaryWithCapacity:debugSymbolToPathMap.count]; + for (NSString *symbolName in debugSymbolToPathMap) { + NSArray *> *components = SAFE_CAST(debugSymbolToPathMap[symbolName], NSArray); + if (components.count == 0) { + continue; + } + ETCoreMLModelStructurePath *path = [[ETCoreMLModelStructurePath alloc] initWithComponents:components]; + pathToDebugSymbolMap[path] = symbolName; + + } + // Construct operation path to debug handles map. + NSDictionary *> *debugSymbolToHandles = SAFE_CAST(jsonDict[ETCoreMLStrings.debugSymbolToHandlesKeyName], NSDictionary); + NSMutableDictionary *> *pathToDebugHandlesMap = [NSMutableDictionary dictionaryWithCapacity:debugSymbolToHandles.count]; + for (NSString *debugSymbol in debugSymbolToHandles) { + NSArray *components = debugSymbolToPathMap[debugSymbol]; + if (components.count == 0) { + continue; + } + + NSArray *debugHandles = debugSymbolToHandles[debugSymbol]; + if (debugHandles.count == 0) { + continue; + } + + ETCoreMLModelStructurePath *path = [[ETCoreMLModelStructurePath alloc] initWithComponents:components]; + pathToDebugHandlesMap[path] = debugHandles; + } + + return [[ETCoreMLModelDebugInfo alloc] initWithPathToDebugSymbolMap:pathToDebugSymbolMap + pathToDebugHandlesMap:pathToDebugHandlesMap]; + +} + +@end diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.h index 7221086318..40b9e32394 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.h @@ -9,6 +9,7 @@ @class ETCoreMLAsset; @class ETCoreMLAssetManager; +@class ETCoreMLModelDebugInfo; @class ETCoreMLModelStructurePath; typedef NSDictionary ETCoreMLModelOutputs; @@ -25,11 +26,13 @@ __attribute__((objc_subclassing_restricted)) /// Constructs an `ETCoreMLModelDebugger` instance. /// /// @param modelAsset The model asset (mlpackage). +/// @param modelDebugInfo The model debug info. /// @param outputNames The model output names. /// @param configuration The model configuration. /// @param assetManager The asset manager used to manage storage of compiled models. /// @param error On failure, error is filled with the failure information. 
- (nullable instancetype)initWithModelAsset:(ETCoreMLAsset*)modelAsset + modelDebugInfo:(nullable ETCoreMLModelDebugInfo*)modelDebugInfo outputNames:(NSOrderedSet*)outputNames configuration:(MLModelConfiguration*)configuration assetManager:(ETCoreMLAssetManager*)assetManager @@ -55,6 +58,10 @@ __attribute__((objc_subclassing_restricted)) /// The paths to all the operations for which we can get the outputs. @property (readonly, copy, nonatomic) NSArray* operationPaths; +/// Operation path to debug symbol map. +@property (readonly, copy, nonatomic) + NSDictionary* operationPathToDebugSymbolMap; + @end NS_ASSUME_NONNULL_END diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm index 32a629c6f3..3be28b56d6 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelDebugger.mm @@ -5,21 +5,23 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. +#import "ETCoreMLModelDebugger.h" + #import -#import -#import -#import -#import -#import -#import -#import -#import +#import "ETCoreMLAsset.h" +#import "ETCoreMLAssetManager.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModelCompiler.h" +#import "ETCoreMLModelDebugInfo.h" +#import "ETCoreMLModelStructurePath.h" +#import "ETCoreMLPair.h" +#import "ETCoreMLStrings.h" #import #import #import #import -#import -#import +#import "model_package_info.h" +#import "objc_json_serde.h" #import #import @@ -68,10 +70,6 @@ BOOL is_const_operation(const MILSpec::Operation& operation) { return operation.type() == "const"; } -BOOL is_cast_operation(const MILSpec::Operation& operation) { - return operation.type() == "cast"; -} - BOOL is_datatype_supported_as_model_output(MILSpec::DataType datatype) { switch (datatype) { case MILSpec::DataType::INT32: @@ -95,11 +93,7 @@ BOOL is_operation_output_supported_as_model_output(const MILSpec::Operation& ope if (is_const_operation(operation)) { return NO; } - - if (is_cast_operation(operation)) { - return NO; - } - + return YES; } @@ -316,7 +310,6 @@ void update_model_spec_version_to_include_fp16_output(Model& model_spec) { return nil; } - // Compile the model. 
return [ETCoreMLModelCompiler compileModelAtURL:model_url maxWaitTimeInSeconds:(5 * 60) error:error]; @@ -383,6 +376,108 @@ void set_intermediate_outputs(id output_features, result[path] = multi_array; } } + +NSArray *get_operation_dependencies(const MILSpec::Operation &operation, + ETCoreMLModelStructurePath *path, + NSSet *paths) { + const auto& inputs = operation.inputs(); + const auto cppPath = path.underlyingValue; + NSMutableArray *deps = [NSMutableArray arrayWithCapacity:inputs.size()]; + for (const auto& [_, arg] : inputs) { + const auto& bindings = arg.arguments(); + for (const auto& binding : bindings) { + if (binding.has_value()) { + continue; + } + + const auto& name = binding.name(); + auto dep = cppPath; + dep.remove_last_component(); + dep.append_component(Path::Program::Operation(name)); + ETCoreMLModelStructurePath *path = [[ETCoreMLModelStructurePath alloc] initWithUnderlyingValue:dep]; + if ([paths containsObject:path]) { + [deps addObject:path]; + } + } + } + + return deps; +} + +NSDictionary *> *get_debug_handle_to_operation_paths_map(ETCoreMLModelDebugInfo *debug_info) { + NSUInteger capacity = debug_info.pathToDebugHandlesMap.count; + NSMutableDictionary *> *result = [NSMutableDictionary dictionaryWithCapacity:capacity]; + [debug_info.pathToDebugHandlesMap enumerateKeysAndObjectsUsingBlock:^(ETCoreMLModelStructurePath *path, + NSArray *debug_handles, + BOOL * _Nonnull __unused stop) { + for (NSString *debug_handle in debug_handles) { + NSMutableArray *paths = result[debug_handle]; + if (!paths) { + paths = [NSMutableArray array]; + result[debug_handle] = paths; + } + + [paths addObject:path]; + } + + }]; + + return result; +} + +BOOL is_node_terminal_node(ETCoreMLModelStructurePath *node, + NSArray *nodes, + NSDictionary *> *dependencies) { + NSMutableSet *nodes_dependencies = [NSMutableSet set]; + for (ETCoreMLModelStructurePath *current_node in nodes) { + if ([current_node isEqual:node]) { + continue; + } + NSArray *node_dependencies = dependencies[current_node]; + if (node_dependencies.count > 0) { + [nodes_dependencies addObjectsFromArray:node_dependencies]; + } + } + + return ![nodes_dependencies containsObject:node]; +} + +ETCoreMLModelStructurePath *_Nullable find_terminal_node_from_nodes(NSArray *nodes, + NSDictionary *> *dependencies) { + if (nodes.count < 2) { + return nodes.firstObject; + } + + for (ETCoreMLModelStructurePath *node in nodes) { + if (is_node_terminal_node(node, nodes, dependencies)) { + return node; + } + } + + return nil; +} + +NSDictionary *get_operation_path_to_debug_symbol_map(ETCoreMLModelDebugInfo *model_debug_info, + NSDictionary *> *debug_handle_to_operation_paths_map, + NSDictionary *> *dependencies) { + // When decomposing an EXIR operation into a MIL graph, it is essential to report the output of the terminal node of the MIL graph. + // This output corresponds directly to the output of the original EXIR operation. + NSUInteger capacity = debug_handle_to_operation_paths_map.count; + NSMutableDictionary *operation_path_to_debug_symbol_map = [NSMutableDictionary dictionaryWithCapacity:capacity]; + [debug_handle_to_operation_paths_map enumerateKeysAndObjectsUsingBlock:^(NSString *debug_handle, + NSArray *operation_paths, + BOOL * __unused stop) { + ETCoreMLModelStructurePath *operation_path = find_terminal_node_from_nodes(operation_paths, dependencies); + NSString *debug_symbol = (operation_path != nil) ? 
model_debug_info.pathToDebugSymbolMap[operation_path] : nil; + if (debug_symbol) { + operation_path_to_debug_symbol_map[operation_path] = debug_symbol; + } + + }]; + + return operation_path_to_debug_symbol_map; +} + } @interface ETCoreMLModelDebugger () @@ -390,6 +485,8 @@ @interface ETCoreMLModelDebugger () @property (readonly, copy, nonatomic) NSOrderedSet *outputNames; /// The model asset. @property (readonly, copy, nonatomic) ETCoreMLAsset *modelAsset; +/// The model debug info. +@property (readonly, copy, nonatomic, nullable) ETCoreMLModelDebugInfo *modelDebugInfo; /// The asset manager. @property (readonly, copy, nonatomic) ETCoreMLAssetManager *assetManager; /// The model configuration. @@ -404,6 +501,7 @@ @implementation ETCoreMLModelDebugger { } - (nullable instancetype)initWithModelAsset:(ETCoreMLAsset *)modelAsset + modelDebugInfo:(nullable ETCoreMLModelDebugInfo *)modelDebugInfo outputNames:(NSOrderedSet *)outputNames configuration:(MLModelConfiguration *)configuration assetManager:(ETCoreMLAssetManager *)assetManager @@ -422,15 +520,27 @@ - (nullable instancetype)initWithModelAsset:(ETCoreMLAsset *)modelAsset if (!modelSpec) { return nil; } - + + __block NSMutableDictionary *> *dependencies = [NSMutableDictionary dictionary]; __block NSMutableArray *operationPaths = [NSMutableArray array]; + __block NSMutableSet *allOperationPaths = [NSMutableSet set]; visit_program_operation(*modelSpec, ^BOOL(const MILSpec::Operation &operation, ETCoreMLModelStructurePath *operationPath) { + dependencies[operationPath] = get_operation_dependencies(operation, operationPath, allOperationPaths); + [allOperationPaths addObject:operationPath]; if (is_operation_output_supported_as_model_output(operation)) { [operationPaths addObject:operationPath]; } + return YES; }); - + + + NSDictionary *> *debugHandleToOperationPathsMap = get_debug_handle_to_operation_paths_map(modelDebugInfo); + + NSDictionary *operationPathToDebugSymbolMap = get_operation_path_to_debug_symbol_map(modelDebugInfo, + debugHandleToOperationPathsMap, + dependencies); + self = [super init]; if (self) { _modelAsset = modelAsset; @@ -440,6 +550,8 @@ - (nullable instancetype)initWithModelAsset:(ETCoreMLAsset *)modelAsset _modelSpec = std::move(modelSpec); _modelSpecURL = modelSpecURL; _operationPaths = operationPaths; + _operationPathToDebugSymbolMap = operationPathToDebugSymbolMap; + _modelDebugInfo = modelDebugInfo; } return self; diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h index a2fbb98582..7a43a30d75 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h @@ -5,10 +5,11 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. +#import "ETCoreMLPair.h" #import -#import #import +@class ETCoreMLAsset; @class ETCoreMLModel; @class ETCoreMLModelStructurePath; @class ETCoreMLOperationProfilingInfo; @@ -30,14 +31,12 @@ __attribute__((objc_subclassing_restricted)) /// Constructs an `ETCoreMLModelProfiler` instance. /// -/// @param compiledModelAsset The compiled model asset (mlmodelc). -/// @param outputNames The model output names. +/// @param model The model. /// @param configuration The model configuration. /// @param error On failure, error is filled with the failure information. 
-- (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset*)compiledModelAsset - outputNames:(NSOrderedSet*)outputNames - configuration:(MLModelConfiguration*)configuration - error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; +- (nullable instancetype)initWithModel:(ETCoreMLModel*)model + configuration:(MLModelConfiguration*)configuration + error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; /// Returns profiling info of operations at the specified paths. /// diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm index 927fb7700e..5998701eb0 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm @@ -5,16 +5,18 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import -#import -#import -#import -#import -#import -#import +#import "ETCoreMLModelProfiler.h" + +#import "ETCoreMLAsset.h" +#import "ETCoreMLModel.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModelStructurePath.h" +#import "ETCoreMLOperationProfilingInfo.h" +#import "ETCoreMLPair.h" +#import "ETCoreMLStrings.h" #import #import -#import +#import "program_path.h" namespace { using namespace executorchcoreml::modelstructure; @@ -220,8 +222,8 @@ void set_model_outputs(id output_features, } @interface ETCoreMLModelProfiler () -/// The CoreML model. -@property (readonly, strong, nonatomic) MLModel *model; +/// The model. +@property (readonly, strong, nonatomic) ETCoreMLModel *model; /// The model output names. @property (readonly, copy, nonatomic) NSOrderedSet *outputNames; #if MODEL_PROFILING_IS_AVAILABLE @@ -239,25 +241,19 @@ @interface ETCoreMLModelProfiler () @implementation ETCoreMLModelProfiler -- (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledModelAsset - outputNames:(NSOrderedSet *)outputNames - configuration:(MLModelConfiguration *)configuration - error:(NSError * __autoreleasing *)error { +- (nullable instancetype)initWithModel:(ETCoreMLModel *)model + configuration:(MLModelConfiguration *)configuration + error:(NSError * __autoreleasing *)error { #if MODEL_PROFILING_IS_AVAILABLE if (@available(macOS 14.4, iOS 17.4, tvOS 17.4, watchOS 10.4, *)) { - NSURL *compiledModelURL = compiledModelAsset.contentURL; + NSURL *compiledModelURL = model.asset.contentURL; MLComputePlan *computePlan = get_compute_plan_of_model_at_url(compiledModelURL, configuration, error); if (!computePlan) { return nil; } - - MLModel *model = [MLModel modelWithContentsOfURL:compiledModelURL error:error]; - if (!model) { - return nil; - } - + __block NSMutableArray *operationPaths = [NSMutableArray array]; __block NSMutableDictionary *operationToPathMap = [NSMutableDictionary dictionary]; __block NSMutableArray *topologicallySortedOperations = [NSMutableArray new]; @@ -279,7 +275,6 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod self = [super init]; if (self) { - _outputNames = [outputNames copy]; _model = model; _computePlan = computePlan; _operationToPathMap = operationToPathMap; @@ -331,10 +326,10 @@ - (nullable ETCoreMLModelProfilingResult *)profilingInfoForOperationsAtPaths:(NS return nil; } -- (nullable ETCoreMLModelProfilingResult *)profilingInfoForAllOperationsWithOptions:(MLPredictionOptions *)options - inputs:(id)inputs - modelOutputs:(NSArray *_Nullable __autoreleasing *_Nonnull)modelOutputs - error:(NSError* __autoreleasing *)error 
{ +- (nullable ETCoreMLModelProfilingResult *)profilingInfoForOperationsAtPaths:(MLPredictionOptions *)options + inputs:(id)inputs + modelOutputs:(NSArray *_Nullable __autoreleasing *_Nonnull)modelOutputs + error:(NSError* __autoreleasing *)error { #if MODEL_PROFILING_IS_AVAILABLE if (@available(macOS 14.4, iOS 17.4, tvOS 17.4, watchOS 10.4, *)) { __block NSMutableArray *paths = [NSMutableArray array]; @@ -344,7 +339,7 @@ - (nullable ETCoreMLModelProfilingResult *)profilingInfoForAllOperationsWithOpti } return YES; }); - + return [self profilingInfoForOperationsAtPaths:paths options:options inputs:inputs diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelStructurePath.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelStructurePath.mm index ad09e10244..419618ac5e 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelStructurePath.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelStructurePath.mm @@ -7,7 +7,7 @@ #import "ETCoreMLModelStructurePath.h" -#import +#import "objc_safe_cast.h" namespace { using namespace executorchcoreml::modelstructure; diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.mm index 4ede54dd1e..c687f47ca2 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLOperationProfilingInfo.mm @@ -5,9 +5,10 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import -#import -#import +#import "ETCoreMLOperationProfilingInfo.h" + +#import "hash_util.h" +#import "model_event_logger_impl.h" namespace { NSString *const kPreferredComputeUnitKey = @"preferredComputeUnit"; diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLPair.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLPair.mm index 078be4aeb1..087144a24c 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLPair.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLPair.mm @@ -5,7 +5,7 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import +#import "ETCoreMLPair.h" @implementation ETCoreMLPair diff --git a/backends/apple/coreml/runtime/sdk/model_event_logger_impl.h b/backends/apple/coreml/runtime/sdk/model_event_logger_impl.h index e88d9754ee..49cb16a2ca 100644 --- a/backends/apple/coreml/runtime/sdk/model_event_logger_impl.h +++ b/backends/apple/coreml/runtime/sdk/model_event_logger_impl.h @@ -7,10 +7,10 @@ #pragma once +#import "model_event_logger.h" #import -#import -namespace torch::executor { +namespace executorch::runtime { class EventTracer; } @@ -21,14 +21,14 @@ namespace executorchcoreml { class ModelEventLoggerImpl final : public ModelEventLogger { public: /// Construct a `ModelEventLoggerImpl` from the `EventTracer`. - explicit ModelEventLoggerImpl(torch::executor::EventTracer* tracer) : tracer_(tracer) { } + explicit ModelEventLoggerImpl(::executorch::runtime::EventTracer* tracer) : tracer_(tracer) { } /// Logs profiling infos. /// /// @param op_path_to_profiling_info_map A dictionary with the operation path as the key and operation's profiling /// info as the value. - /// @param op_path_to_debug_symbol_name_map A dictionary with the operation path as the key and the symbol name as - /// the value. The symbol name is the delegate handle. + /// @param op_path_to_debug_symbol_name_map A dictionary with the operation path as the key and the debug symbol + /// name as the value. 
void log_profiling_infos( NSDictionary* op_path_to_profiling_info_map, NSDictionary* op_path_to_debug_symbol_name_map) const noexcept override; @@ -37,13 +37,13 @@ class ModelEventLoggerImpl final : public ModelEventLogger { /// /// @param op_path_to_value_map A dictionary with the operation path as the key and the operation's value as the /// value. - /// @param op_path_to_debug_symbol_name_map A dictionary with the operation path as the key and the symbol name as - /// the value. The symbol name is the delegate handle. + /// @param op_path_to_debug_symbol_name_map A dictionary with the operation path as the key and the debug symbol + /// name as the value. void log_intermediate_tensors( NSDictionary* op_path_to_value_map, - NSDictionary* op_path_to_debug_symbol_name_map) const noexcept override; + NSDictionary* op_path_to_debug_symbol_map) const noexcept override; private: - torch::executor::EventTracer* tracer_; + ::executorch::runtime::EventTracer* tracer_; }; } // namespace executorchcoreml diff --git a/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm b/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm index f90a8f1a41..1d358583a8 100644 --- a/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm +++ b/backends/apple/coreml/runtime/sdk/model_event_logger_impl.mm @@ -5,13 +5,20 @@ // // Please refer to the license found in the LICENSE file in the root directory of the source tree. -#import -#import +#import "model_event_logger_impl.h" + +#import "ETCoreMLModelStructurePath.h" +#import "ETCoreMLOperationProfilingInfo.h" #import +#import "objc_array_util.h" #import -#import +#import +#import "MLMultiArray_Copy.h" namespace { + +using namespace torch::executor; + uint64_t time_units_to_nano_seconds(uint64_t time_units) { static mach_timebase_info_data_t info; static dispatch_once_t onceToken; @@ -22,6 +29,51 @@ uint64_t time_units_to_nano_seconds(uint64_t time_units) { return time_units * info.numer / info.denom; } +std::optional to_scalar_type(MLMultiArrayDataType data_type) { + switch (data_type) { + case MLMultiArrayDataTypeFloat16: { + return ScalarType::Half; + } + case MLMultiArrayDataTypeFloat32: { + return ScalarType::Float; + } + case MLMultiArrayDataTypeDouble: { + return ScalarType::Double; + } + case MLMultiArrayDataTypeInt32: { + return ScalarType::Int; + } + default: { + return std::nullopt; + } + } +} + +MLMultiArrayDataType get_supported_data_type(MLMultiArrayDataType data_type) { + switch (data_type) { + case MLMultiArrayDataTypeFloat16: { + return MLMultiArrayDataTypeFloat32; + } + default: { + return data_type; + } + } +} + +bool is_packed(NSArray *shape, NSArray *strides) { + if (shape.count == 0) { + return true; + } + size_t product = 1; + for (size_t i = shape.count; i > 0; i--) { + if (![strides[i - 1] isEqual:@(product)]) { + return false; + } + product *= shape[i - 1].unsignedLongValue; + } + + return true; +} } namespace executorchcoreml { @@ -53,7 +105,60 @@ uint64_t time_units_to_nano_seconds(uint64_t time_units) { } void ModelEventLoggerImpl::log_intermediate_tensors(NSDictionary *op_path_to_value_map, - NSDictionary *op_path_to_debug_symbol_name_map) const noexcept { - //TODO: Implement logging for intermediate tensors once ExecuTorch has support. 
+ NSDictionary *op_path_to_debug_symbol_name_map) const noexcept { + [op_path_to_value_map enumerateKeysAndObjectsUsingBlock:^(ETCoreMLModelStructurePath *path, + MLMultiArray *intermediate_value, + BOOL * _Nonnull __unused stop) { + using namespace torch::executor; + + @autoreleasepool { + NSString *debug_symbol = op_path_to_debug_symbol_name_map[path]; + if (debug_symbol == nil) { + return; + } + + MLMultiArray *value = op_path_to_value_map[path]; + if (value == nil || value.count == 0) { + return; + } + + MLMultiArray *supported_value = value; + NSArray *shape = supported_value.shape; + NSError *local_error = nil; + MLMultiArrayDataType data_type = get_supported_data_type(value.dataType); + + if (!is_packed(shape, value.strides) || (supported_value.dataType != data_type)) { + supported_value = [[MLMultiArray alloc] initWithShape:shape + dataType:data_type + error:&local_error]; + NSCAssert(supported_value != nil, + @"ModelEventLoggerImpl: Failed to create packed multiarray with shape=%@, dataType=%ld, error=%@.", + shape, + static_cast(value.dataType), + local_error); + [value copyInto:supported_value]; + } + + + [supported_value getBytesWithHandler:^(const void * _Nonnull bytes, NSInteger size) { + auto sizes = to_vector(shape); + auto strides = to_vector(supported_value.strides); + auto scalar_type = to_scalar_type(data_type); + auto dim_order = std::vector(shape.count); + std::iota(std::begin(dim_order), std::end(dim_order), 0); + + NSCAssert(scalar_type.has_value(), @"ModelEventLoggerImpl: MultiArray dataType=%ld is not supported.", static_cast(data_type)); + auto tensor_impl = TensorImpl( + scalar_type.value(), + static_cast(sizes.size()), + sizes.data(), + const_cast(bytes), + dim_order.data(), + strides.data()); + auto tensor = Tensor(&tensor_impl); + tracer_->log_intermediate_output_delegate(debug_symbol.UTF8String, -1, tensor); + }]; + } + }]; } } // namespace executorchcoreml diff --git a/backends/apple/coreml/runtime/sdk/model_package_info.h b/backends/apple/coreml/runtime/sdk/model_package_info.h index f694a62305..51978002a4 100644 --- a/backends/apple/coreml/runtime/sdk/model_package_info.h +++ b/backends/apple/coreml/runtime/sdk/model_package_info.h @@ -9,7 +9,7 @@ #import -#import +#import "serde_json.h" #import #import diff --git a/backends/apple/coreml/runtime/sdk/model_package_info.mm b/backends/apple/coreml/runtime/sdk/model_package_info.mm index 5e52974b41..b7b26178fd 100644 --- a/backends/apple/coreml/runtime/sdk/model_package_info.mm +++ b/backends/apple/coreml/runtime/sdk/model_package_info.mm @@ -7,9 +7,9 @@ #import "model_package_info.h" -#import -#import -#import +#import "ETCoreMLLogging.h" +#import "objc_json_serde.h" +#import "serde_json.h" namespace { struct ModelPackageInfoKeys { diff --git a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm index 04458e6502..691d4d726e 100644 --- a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm +++ b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm @@ -13,6 +13,8 @@ #import #import +#import "MLModel_Prewarm.h" + static constexpr size_t kRuntimeMemorySize = 50 * 1024U * 1024U; // 50 MB using namespace torch::executor; @@ -25,14 +27,14 @@ return [NSData dataWithContentsOfURL:url]; } -class DataLoaderImpl: public DataLoader { +class DataLoaderImpl final : public DataLoader { public: DataLoaderImpl(std::string filePath) :data_(read_data(filePath)) {} Result load( - size_t offset, size_t size, __ET_UNUSED const 
DataLoader::SegmentInfo& segment_info) override { + size_t offset, size_t size, ET_UNUSED const DataLoader::SegmentInfo& segment_info) const override { NSData *subdata = [data_ subdataWithRange:NSMakeRange(offset, size)]; return FreeableBuffer(subdata.bytes, size, nullptr); } @@ -42,7 +44,7 @@ } private: - NSData *data_; + NSData * const data_; }; using Buffer = std::vector; @@ -184,20 +186,28 @@ - (void)executeModelAtURL:(NSURL *)modelURL nLoads:(NSUInteger)nLoads nExecution - (void)testAddProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"add_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } - (void)testMulProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mul_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } - (void)testMV3ProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; +} + +#if MODEL_STATE_IS_SUPPORTED +- (void)testStateProgramExecute { + NSURL *modelURL = [[self class] bundledResourceWithName:@"state_coreml_all" extension:@"pte"]; + XCTAssertNotNil(modelURL); + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } +#endif - (void)executeMultipleModelsConcurrently:(NSArray *)modelURLs nLoads:(NSUInteger)nLoads diff --git a/backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm b/backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm index 95c84ab674..2464ec8dbb 100644 --- a/backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm +++ b/backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm @@ -16,41 +16,53 @@ #import namespace { - using namespace executorchcoreml::modelstructure; - - using NotifyFn = std::function *op_path_to_value_map, - NSDictionary *op_path_to_debug_symbol_name_map)>; - - class ModelProfilingEventLoggerImpl: public executorchcoreml::ModelEventLogger { - public: - explicit ModelProfilingEventLoggerImpl(NotifyFn fn) - :fn_(fn) - {} - - void log_profiling_infos(NSDictionary *op_path_to_profiling_info_map, - NSDictionary *op_path_to_debug_symbol_name_map) const noexcept {} - - void log_intermediate_tensors(NSDictionary *op_path_to_value_map, - NSDictionary *op_path_to_debug_symbol_name_map) const noexcept { - fn_(op_path_to_value_map, op_path_to_debug_symbol_name_map); +using namespace executorchcoreml::modelstructure; + +using NotifyFn = std::function *op_path_to_value_map, + NSDictionary *op_path_to_debug_symbol_name_map)>; + +class ModelEventLoggerImpl: public executorchcoreml::ModelEventLogger { +public: + explicit ModelEventLoggerImpl(NotifyFn fn) + :fn_(fn) + {} + + void log_profiling_infos(NSDictionary *op_path_to_profiling_info_map, + NSDictionary *op_path_to_debug_symbol_name_map) const noexcept {} + + void log_intermediate_tensors(NSDictionary *op_path_to_value_map, + NSDictionary *op_path_to_debug_symbol_map) const noexcept { + fn_(op_path_to_value_map, op_path_to_debug_symbol_map); + } + +private: + NotifyFn fn_; +}; + +ETCoreMLModelStructurePath *make_path_with_output_name(const std::string& output_name, + const std::string& function_name = "main") { + Path path; + path.append_component(Path::Program()); + 
path.append_component(Path::Program::Function(function_name)); + path.append_component(Path::Program::Block(-1)); + path.append_component(Path::Program::Operation(output_name)); + + return [[ETCoreMLModelStructurePath alloc] initWithUnderlyingValue:std::move(path)]; +} + +void add_debugging_result(NSDictionary *debugging_result, + NSDictionary *path_to_symbol_name_map, + NSMutableDictionary *debugging_results) { + for (ETCoreMLModelStructurePath *path in debugging_result) { + NSString *debug_symbol = path_to_symbol_name_map[path]; + if (debug_symbol) { + debugging_results[path] = debugging_result[path]; } - - private: - NotifyFn fn_; - }; - - ETCoreMLModelStructurePath *make_path_with_output_name(const std::string& output_name, - const std::string& function_name = "main") { - Path path; - path.append_component(Path::Program()); - path.append_component(Path::Program::Function(function_name)); - path.append_component(Path::Program::Block(-1)); - path.append_component(Path::Program::Operation(output_name)); - - return [[ETCoreMLModelStructurePath alloc] initWithUnderlyingValue:std::move(path)]; } } +} + @interface ETCoreMLModelDebuggerTests : XCTestCase @end @@ -87,8 +99,8 @@ - (void)debugModelWithName:(NSString *)modelName MLPredictionOptions *predictionOptions = [[MLPredictionOptions alloc] init]; executorchcoreml::ModelLoggingOptions loggingOptions; loggingOptions.log_intermediate_tensors = true; - ModelProfilingEventLoggerImpl eventLogger(notify); - + ModelEventLoggerImpl eventLogger(notify); + NSArray *outputs = [analyzer executeModelWithInputs:inputs predictionOptions:predictionOptions loggingOptions:loggingOptions @@ -125,21 +137,39 @@ - (void)testMulProgramDebugging { } - (void)testMV3ProgramDebugging { - XCTSkip(@"There is a device specialization issue when getting on of the outputs, will fix after investigation."); - NotifyFn notify = [](NSDictionary *debuggingResult, - NSDictionary *pathToSymbolNameMap) { - // There are more than 200 ops, we verify the outputs for specific ops. - XCTAssertNotNil(debuggingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]); - XCTAssertNotNil(debuggingResult[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]); - XCTAssertNotNil(debuggingResult[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]); - XCTAssertNotNil(debuggingResult[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]); - XCTAssertNotNil(debuggingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]); - XCTAssertNotNil(debuggingResult[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]); + NSMutableDictionary *debuggingResults = [NSMutableDictionary new]; + NotifyFn notify = [debuggingResults](NSDictionary *debuggingResult, + NSDictionary *pathToSymbolNameMap) mutable { + add_debugging_result(debuggingResult, pathToSymbolNameMap, debuggingResults); }; [self debugModelWithName:@"mv3_coreml_all" repeatedInputValues:@[@(1), @(2)] notify:notify]; + + // There are more than 200 ops, we verify the outputs for specific ops. 
+ XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]); + XCTAssertNotNil(debuggingResults[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]); + XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]); + XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]); + XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]); + XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]); +} + +- (void)testAddMulProgramDebugging { + NSMutableDictionary *debuggingResults = [NSMutableDictionary new]; + NotifyFn notify = [debuggingResults](NSDictionary *debuggingResult, + NSDictionary *pathToSymbolNameMap) mutable { + add_debugging_result(debuggingResult, pathToSymbolNameMap, debuggingResults); + }; + + [self debugModelWithName:@"add_mul_coreml_all" + repeatedInputValues:@[@(1), @(2)] + notify:notify]; + + // There are more than 200 ops, we verify the outputs for specific ops. + XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_add_tensor")]); + XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mm_default_cast_fp16")]); } @end diff --git a/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm b/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm index 72ad71adc9..3c0908201a 100644 --- a/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm +++ b/backends/apple/coreml/runtime/test/ETCoreMLTestUtils.mm @@ -8,16 +8,17 @@ #import "ETCoreMLTestUtils.h" -#import -#import -#import -#import -#import +#import "ETCoreMLAsset.h" +#import "ETCoreMLLogging.h" +#import "ETCoreMLModelDebugInfo.h" +#import "ETCoreMLModelAnalyzer.h" +#import "ETCoreMLModelCompiler.h" +#import "ETCoreMLStrings.h" #import -#import +#import "inmemory_filesystem_utils.hpp" #import #import -#import +#import "model_metadata.h" namespace { NSURL * _Nullable create_directory_if_needed(NSURL *url, @@ -239,6 +240,7 @@ + (nullable MLMultiArray *)filledMultiArrayWithConstraint:(MLMultiArrayConstrain + (BOOL)extractModelAssetAndMetadataFromAOTData:(NSData *)data modelAsset:(ETCoreMLAsset *_Nullable __autoreleasing *_Nonnull)modelAsset + modelDebugInfo:(ETCoreMLModelDebugInfo *_Nullable __autoreleasing *_Nonnull)modelDebugInfo metadata:(executorchcoreml::ModelMetadata&)metadata dstURL:(NSURL *)dstURL fileManager:(NSFileManager *)fileManager @@ -295,18 +297,35 @@ + (BOOL)extractModelAssetAndMetadataFromAOTData:(NSData *)data if (modelAsset) { *modelAsset = localAsset; } - + + __block auto debugInfoBuffer = inMemoryFS->get_file_content({ETCoreMLStrings.debugInfoFileRelativePath.UTF8String}, ec); + if (debugInfoBuffer && debugInfoBuffer->size() > 0) { + + NSData *data = [[NSData alloc] initWithBytesNoCopy:debugInfoBuffer->data() + length:debugInfoBuffer->size() + deallocator:^(void * _Nonnull __unused bytes, NSUInteger __unused length) { + debugInfoBuffer.reset(); + }]; + + ETCoreMLModelDebugInfo *lModelDebugInfo = [ETCoreMLModelDebugInfo modelDebugInfoFromData:data error:nil]; + if (modelDebugInfo) { + *modelDebugInfo = lModelDebugInfo; + } + } + return YES; } + (ETCoreMLModelAnalyzer *)createAnalyzerWithAOTData:(NSData *)data - dstURL:(NSURL *)dstURL - error:(NSError * __autoreleasing *)error { + dstURL:(NSURL *)dstURL + error:(NSError * __autoreleasing *)error { ETCoreMLAsset *modelAsset = nil; + ETCoreMLModelDebugInfo *modelDebugInfo = 
nil; executorchcoreml::ModelMetadata metadata; NSFileManager *fileManager = [[NSFileManager alloc] init]; if (![self extractModelAssetAndMetadataFromAOTData:data modelAsset:&modelAsset + modelDebugInfo:&modelDebugInfo metadata:metadata dstURL:dstURL fileManager:fileManager @@ -343,12 +362,12 @@ + (ETCoreMLModelAnalyzer *)createAnalyzerWithAOTData:(NSData *)data MLModelConfiguration *configuration = [[MLModelConfiguration alloc] init]; ETCoreMLModelAnalyzer *analyzer = [[ETCoreMLModelAnalyzer alloc] initWithCompiledModelAsset:compiledModelAsset modelAsset:modelAsset + modelDebugInfo:modelDebugInfo metadata:metadata - operationPathToDebugSymbolMap:nil configuration:configuration assetManager:assetManager error:error]; - + return analyzer; } diff --git a/backends/apple/coreml/runtime/test/export_stateful_model.py b/backends/apple/coreml/runtime/test/export_stateful_model.py new file mode 100644 index 0000000000..61d1a93980 --- /dev/null +++ b/backends/apple/coreml/runtime/test/export_stateful_model.py @@ -0,0 +1,77 @@ +# Copyright © 2024 Apple Inc. All rights reserved. +# +# Please refer to the license found in the LICENSE file in the root directory of the source tree. + +import os +from pathlib import Path + +import coremltools as ct +import executorch.exir as exir + +import torch + +from executorch.backends.apple.coreml.compiler import CoreMLBackend +from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from torch.export import export + + +class StatefulModel(torch.nn.Module): + def __init__( + self, + embedding_dim: int, + max_seq_len: int, + ): + super().__init__() + self.register_buffer( + "cache", torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32) + ) + + def forward( + self, + q: torch.Tensor, + k_val: torch.Tensor, + input_pos: torch.Tensor, + ): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + +def main() -> None: + embedding_dim = 3 + max_seq_len = 2 + model = StatefulModel(embedding_dim=embedding_dim, max_seq_len=max_seq_len) + example_inputs = ( + torch.randn((1, embedding_dim)), + torch.randn((1, embedding_dim)), + torch.tensor([0]), + ) + exported_model = export(model, example_inputs) + edge_program_manager = exir.to_edge(exported_model) + compile_specs = CoreMLBackend.generate_compile_specs( + compute_precision=ct.precision.FLOAT16, + compute_unit=ct.ComputeUnit.ALL, + minimum_deployment_target=ct.target.iOS18, + ) + + partitioner = CoreMLPartitioner( + skip_ops_for_coreml_delegation=None, + compile_specs=compile_specs, + ) + + delegated_program_manager = edge_program_manager.to_backend(partitioner) + exec_program = delegated_program_manager.to_executorch( + config=exir.ExecutorchBackendConfig(extract_delegate_segments=True) + ) + + buffer = exec_program.buffer + models_dir = Path(os.path.dirname(os.path.realpath(__file__))) / "models" + models_dir.mkdir(parents=False, exist_ok=True) + file_path = models_dir / "state_coreml_all.pte" + with open(file_path.resolve(), "wb") as file: + file.write(buffer) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/backends/apple/coreml/runtime/util/objc_array_util.h b/backends/apple/coreml/runtime/util/objc_array_util.h index 5f4c8c7bc2..b3446a2288 100644 --- a/backends/apple/coreml/runtime/util/objc_array_util.h +++ b/backends/apple/coreml/runtime/util/objc_array_util.h @@ -18,6 +18,8 @@ template <> inline size_t to_value(NSNumber* value) { return value.unsignedLongV template <> inline ssize_t 
to_value(NSNumber* value) { return value.longLongValue; } +template <> inline int to_value(NSNumber* value) { return value.intValue; } + template ::value, T>::type> inline NSArray* to_array(const std::vector& array) { NSMutableArray* result = [NSMutableArray arrayWithCapacity:array.size()]; diff --git a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj index d8a5e61107..2daa5615ba 100644 --- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj +++ b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj @@ -7,6 +7,10 @@ objects = { /* Begin PBXBuildFile section */ + 8307EB8A2C9262060011AE6D /* state_coreml_all.pte in Resources */ = {isa = PBXBuildFile; fileRef = 8307EB892C9262060011AE6D /* state_coreml_all.pte */; }; + 83BB78A02C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm in Sources */ = {isa = PBXBuildFile; fileRef = 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */; }; + 83BB78BF2C66AAAE00274ED7 /* add_mul_coreml_all.bin in Resources */ = {isa = PBXBuildFile; fileRef = 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */; }; + 83BB78C02C66AAAE00274ED7 /* add_mul_coreml_all.pte in Resources */ = {isa = PBXBuildFile; fileRef = 83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */; }; C945E8E02B997ECE009C3FAC /* ETCoreMLModelProfiler.mm in Sources */ = {isa = PBXBuildFile; fileRef = C945E8CF2B997ECD009C3FAC /* ETCoreMLModelProfiler.mm */; }; C945E8E12B997ECE009C3FAC /* ETCoreMLModelAnalyzer.mm in Sources */ = {isa = PBXBuildFile; fileRef = C945E8D42B997ECD009C3FAC /* ETCoreMLModelAnalyzer.mm */; }; C945E8E22B997ECE009C3FAC /* program_path.mm in Sources */ = {isa = PBXBuildFile; fileRef = C945E8D52B997ECD009C3FAC /* program_path.mm */; }; @@ -100,8 +104,8 @@ C9E7D7952AB3F9BF00CCAE5D /* ETCoreMLModelManagerTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D78D2AB3F9BF00CCAE5D /* ETCoreMLModelManagerTests.mm */; }; C9E7D7962AB3F9BF00CCAE5D /* KeyValueStoreTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D78E2AB3F9BF00CCAE5D /* KeyValueStoreTests.mm */; }; C9E7D7A22AB3FBB200CCAE5D /* CoreMLBackendDelegateTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9E7D7A12AB3FBB200CCAE5D /* CoreMLBackendDelegateTests.mm */; }; - F24817E52BC655E100E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */; }; C9EC7E1B2BC73B3200A6B166 /* MultiArrayTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = C9EC7E1A2BC73B3200A6B166 /* MultiArrayTests.mm */; }; + F24817E52BC655E100E80D98 /* libexecutorch_no_prim_ops.a in Frameworks */ = {isa = PBXBuildFile; fileRef = F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -117,6 +121,11 @@ /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 8307EB892C9262060011AE6D /* state_coreml_all.pte */ = {isa = PBXFileReference; lastKnownFileType = file; name = state_coreml_all.pte; path = ../test/models/state_coreml_all.pte; sourceTree = ""; }; + 83BB789E2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = ETCoreMLModelDebugInfo.h; path = ../sdk/ETCoreMLModelDebugInfo.h; sourceTree = ""; }; + 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */ = {isa = PBXFileReference; lastKnownFileType = 
sourcecode.cpp.objcpp; name = ETCoreMLModelDebugInfo.mm; path = ../sdk/ETCoreMLModelDebugInfo.mm; sourceTree = ""; }; + 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = add_mul_coreml_all.bin; path = ../test/models/add_mul_coreml_all.bin; sourceTree = ""; }; + 83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */ = {isa = PBXFileReference; lastKnownFileType = file; name = add_mul_coreml_all.pte; path = ../test/models/add_mul_coreml_all.pte; sourceTree = ""; }; C945E8CD2B997ECD009C3FAC /* ETCoreMLModelProfiler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ETCoreMLModelProfiler.h; path = ../sdk/ETCoreMLModelProfiler.h; sourceTree = ""; }; C945E8CE2B997ECD009C3FAC /* ETCoreMLModelAnalyzer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ETCoreMLModelAnalyzer.h; path = ../sdk/ETCoreMLModelAnalyzer.h; sourceTree = ""; }; C945E8CF2B997ECD009C3FAC /* ETCoreMLModelProfiler.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; name = ETCoreMLModelProfiler.mm; path = ../sdk/ETCoreMLModelProfiler.mm; sourceTree = ""; }; @@ -299,9 +308,9 @@ C9EA3DB22B71A2B200B7D7BD /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = System/Library/Frameworks/CoreML.framework; sourceTree = SDKROOT; }; C9EA3FDE2B73EEA000B7D7BD /* libsqlite3.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.tbd; path = usr/lib/libsqlite3.tbd; sourceTree = SDKROOT; }; C9EA3FE52B73EF6300B7D7BD /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; - F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_no_prim_ops.a; path = ../libraries/libexecutorch_no_prim_ops.a; sourceTree = ""; }; C9EC7E092BC662A300A6B166 /* objc_array_util.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = objc_array_util.h; path = ../util/objc_array_util.h; sourceTree = ""; }; C9EC7E1A2BC73B3200A6B166 /* MultiArrayTests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; name = MultiArrayTests.mm; path = ../test/MultiArrayTests.mm; sourceTree = ""; }; + F24817E42BC655E100E80D98 /* libexecutorch_no_prim_ops.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libexecutorch_no_prim_ops.a; path = ../libraries/libexecutorch_no_prim_ops.a; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -329,6 +338,8 @@ C945E8D42B997ECD009C3FAC /* ETCoreMLModelAnalyzer.mm */, C945E8D32B997ECD009C3FAC /* ETCoreMLModelDebugger.h */, C945E8DF2B997ECE009C3FAC /* ETCoreMLModelDebugger.mm */, + 83BB789E2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.h */, + 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */, C945E8CD2B997ECD009C3FAC /* ETCoreMLModelProfiler.h */, C945E8CF2B997ECD009C3FAC /* ETCoreMLModelProfiler.mm */, C945E8D02B997ECD009C3FAC /* ETCoreMLModelStructurePath.h */, @@ -596,6 +607,9 @@ C985519C2AD2542D009143F9 /* mul_coreml_all.pte */, C985519B2AD2542D009143F9 /* mv3_coreml_all.bin */, C98551982AD2542D009143F9 /* mv3_coreml_all.pte */, + 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */, + 
83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */, + 8307EB892C9262060011AE6D /* state_coreml_all.pte */, ); name = models; sourceTree = ""; @@ -661,9 +675,12 @@ files = ( C98551A12AD2542D009143F9 /* mv3_coreml_all.bin in Resources */, C985519F2AD2542D009143F9 /* mul_coreml_all.bin in Resources */, + 83BB78BF2C66AAAE00274ED7 /* add_mul_coreml_all.bin in Resources */, + 83BB78C02C66AAAE00274ED7 /* add_mul_coreml_all.pte in Resources */, C985519E2AD2542D009143F9 /* mv3_coreml_all.pte in Resources */, C98551A02AD2542D009143F9 /* add_coreml_all.bin in Resources */, C98551A22AD2542D009143F9 /* mul_coreml_all.pte in Resources */, + 8307EB8A2C9262060011AE6D /* state_coreml_all.pte in Resources */, C98551A32AD2542D009143F9 /* add_coreml_all.pte in Resources */, ); runOnlyForDeploymentPostprocessing = 0; @@ -695,6 +712,7 @@ C94D51042ABDF84100AF47FD /* ETCoreMLStrings.mm in Sources */, C9E7D7932AB3F9BF00CCAE5D /* ETCoreMLAssetManagerTests.mm in Sources */, C945E8E42B997ECE009C3FAC /* model_event_logger_impl.mm in Sources */, + 83BB78A02C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm in Sources */, C945E9472B997EEE009C3FAC /* SoundAnalysisPreprocessing.pb.cc in Sources */, C97716B52AEA21B600FC0DAC /* inmemory_filesystem_utils.mm in Sources */, C9E7D7922AB3F9BF00CCAE5D /* DatabaseTests.mm in Sources */, diff --git a/backends/apple/coreml/scripts/generate_test_models.sh b/backends/apple/coreml/scripts/generate_test_models.sh index 7beca63726..0c1822aa82 100755 --- a/backends/apple/coreml/scripts/generate_test_models.sh +++ b/backends/apple/coreml/scripts/generate_test_models.sh @@ -17,14 +17,17 @@ cd "$EXECUTORCH_ROOT_PATH" mkdir "$COREML_DIR_PATH/runtime/test/models/" #Generate models -echo "Executorch: Generating test models" cd "$EXECUTORCH_ROOT_PATH" -MODELS=("add" "mul" "mv3") +MODELS=("add" "add_mul" "mul" "mv3") for MODEL in "${MODELS[@]}" do + echo "Executorch: Generating $MODEL model" # TODO: Don't use the script in examples directory. python3 -m examples.apple.coreml.scripts.export --model_name "$MODEL" --save_processed_bytes mv -f "$MODEL""_coreml_all.pte" "$COREML_DIR_PATH/runtime/test/models" mv -f "$MODEL""_coreml_all.bin" "$COREML_DIR_PATH/runtime/test/models" done + +echo "Executorch: Generating stateful model" +python3 "$SCRIPT_DIR_PATH/../runtime/test/export_stateful_model.py" diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh index 0018b5ffc2..b3ea0d77ca 100755 --- a/backends/apple/coreml/scripts/install_requirements.sh +++ b/backends/apple/coreml/scripts/install_requirements.sh @@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party" mkdir "$COREML_DIR_PATH/third-party" echo "${green}ExecuTorch: Cloning coremltools." -git clone --depth 1 --branch 8.0b1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH +git clone --depth 1 --branch 8.0 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH cd $COREMLTOOLS_DIR_PATH STATUS=$? @@ -47,6 +47,11 @@ cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel echo "${green}ExecuTorch: Installing coremltools." pip install "$COREMLTOOLS_DIR_PATH" +# CoreMLTools have started supporting numpy 2.0, +# but ExecuTorch example model test env is still using older transformers, +# so for now we will need to downgrade numpy to 1.x +# TODO: Remove this numpy downgrade once later transformers starts to be used +pip install numpy==1.26.4 STATUS=$? if [ $STATUS -ne 0 ]; then echo "${red}ExecuTorch: Failed to install coremltools." 
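[Editor's note, not part of the patch] The hunks above extend generate_test_models.sh to emit an "add_mul" model and a stateful Core ML model (state_coreml_all.pte, produced by the new export_stateful_model.py, which writes only a .pte and no .bin), and the Xcode project picks these up as test resources. A minimal sanity-check sketch is shown below; it is illustrative only, assumes it is run from the repository root after the script has completed, and the directory and file names are taken from the scripts and project resources in this patch.

# sanity_check_coreml_test_models.py -- illustrative sketch, not part of this change
from pathlib import Path

# Output directory used by backends/apple/coreml/scripts/generate_test_models.sh
models_dir = Path("backends/apple/coreml/runtime/test/models")

# Models exported via examples.apple.coreml.scripts.export produce both .pte and .bin;
# the stateful model is exported directly to a single .pte by export_stateful_model.py.
expected = [
    "add_coreml_all.pte", "add_coreml_all.bin",
    "add_mul_coreml_all.pte", "add_mul_coreml_all.bin",
    "mul_coreml_all.pte", "mul_coreml_all.bin",
    "mv3_coreml_all.pte", "mv3_coreml_all.bin",
    "state_coreml_all.pte",
]

missing = [name for name in expected if not (models_dir / name).exists()]
if missing:
    raise SystemExit(f"Missing generated CoreML test models: {missing}")
print("All expected CoreML test models are present.")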
diff --git a/backends/apple/coreml/setup.md b/backends/apple/coreml/setup.md index 4e66544f7b..0efd9bbcc2 100644 --- a/backends/apple/coreml/setup.md +++ b/backends/apple/coreml/setup.md @@ -50,7 +50,7 @@ xcode-select --install ```bash cd executorch -./build/build_apple_frameworks.sh --Release --coreml +./build/build_apple_frameworks.sh --coreml ``` 5. Open the project in Xcode, and drag `executorch.xcframework` and `coreml_backend.xcframework` frameworks generated from Step 2 to Frameworks. diff --git a/backends/apple/coreml/test/test_coreml_partitioner.py b/backends/apple/coreml/test/test_coreml_partitioner.py index 45c468e450..72a7fbf093 100644 --- a/backends/apple/coreml/test/test_coreml_partitioner.py +++ b/backends/apple/coreml/test/test_coreml_partitioner.py @@ -4,11 +4,14 @@ import unittest +import coremltools as ct + import executorch.exir import torch import torchvision +from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner @@ -68,12 +71,65 @@ def test_vit_skip_conv(self): ) ) + conv_block = ["aten.convolution.default", "executorch_call_delegate"] + safe_softmax_block = [ + "getitem", + "getitem", + "getitem", + "getitem", + "aten.any.dim", + "executorch_call_delegate", + ] + final_block = ["getitem"] + total = conv_block + 12 * safe_softmax_block + final_block + + assert [ + node.target.__name__ + for node in delegated_program_manager.exported_program().graph.nodes + if node.op == "call_function" + ] == total + + def test_buffer(self): + embedding_dim = 3 + max_seq_len = 2 + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer( + "cache", + torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32), + ) + + def forward(self, q, k_val, input_pos): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + model = Model() + model.eval() + + q = torch.randn((1, embedding_dim)) + k_val = torch.randn((1, embedding_dim)) + input_pos = torch.tensor([0]) + example_inputs = (q, k_val, input_pos) + exir_program_aten = torch.export.export(model, example_inputs) + + compile_specs = CoreMLBackend.generate_compile_specs( + minimum_deployment_target=ct.target.iOS18 + ) + partitioner = CoreMLPartitioner(compile_specs=compile_specs) + edge_program_manager = executorch.exir.to_edge( + exir_program_aten, compile_config=self.edge_compile_config + ) + delegated_program_manager = edge_program_manager.to_backend(partitioner) + assert [ node.target.__name__ for node in delegated_program_manager.exported_program().graph.nodes if node.op == "call_function" ] == [ - "aten.convolution.default", "executorch_call_delegate", "getitem", ] @@ -83,3 +139,4 @@ def test_vit_skip_conv(self): test_runner = TestCoreMLPartitioner() test_runner.test_add_sub_skip_mm() test_runner.test_vit_skip_conv() + test_runner.test_buffer() diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt index f364319283..f47139a000 100644 --- a/backends/apple/mps/CMakeLists.txt +++ b/backends/apple/mps/CMakeLists.txt @@ -69,7 +69,7 @@ add_library(mpsdelegate ${_mps_backend__srcs}) find_library(FOUNDATION_FRAMEWORK Foundation) find_library(METAL_FRAMEWORK Metal) find_library(MPS_FRAMEWORK MetalPerformanceShaders) -find_library(MPS_GRAPG_FRAMEWORK MetalPerformanceShadersGraph) +find_library(MPS_GRAPH_FRAMEWORK MetalPerformanceShadersGraph) target_link_libraries( mpsdelegate @@ -79,7 +79,7 @@ 
target_link_libraries( ${FOUNDATION_FRAMEWORK} ${METAL_FRAMEWORK} ${MPS_FRAMEWORK} - ${MPS_GRAPG_FRAMEWORK} + ${MPS_GRAPH_FRAMEWORK} ) target_link_options_shared_lib(mpsdelegate) diff --git a/backends/apple/mps/TARGETS b/backends/apple/mps/TARGETS index 9961ddd3c7..1ab92b3fca 100644 --- a/backends/apple/mps/TARGETS +++ b/backends/apple/mps/TARGETS @@ -94,8 +94,9 @@ runtime.python_test( "//caffe2:torch", "//executorch/examples/models:models", "//executorch/exir/tests:models", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program/serialize:lib", + "//executorch/extension/export_util:export_util", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program/serialize:lib", "fbsource//third-party/pypi/pytest:pytest", ], ) diff --git a/backends/apple/mps/mps_preprocess.py b/backends/apple/mps/mps_preprocess.py index 519b4b31ec..8362774fa9 100644 --- a/backends/apple/mps/mps_preprocess.py +++ b/backends/apple/mps/mps_preprocess.py @@ -2,9 +2,8 @@ # Copyright (c) 2023 Apple Inc. All rights reserved. # Provided subject to the LICENSE file in the top level directory. # - import logging -from typing import Dict, final, List +from typing import ClassVar, Dict, final, List, Tuple import torch @@ -16,6 +15,8 @@ ) from executorch.backends.apple.mps.serialization.mps_graph_schema import ( + Buffer, + DataSegment, MPSGraph, MPSTensor, OpType, @@ -24,7 +25,7 @@ from executorch.backends.apple.mps.serialization.mps_graph_serialize import ( convert_to_flatbuffer, ) -from executorch.backends.apple.mps.utils.mps_utils import is_parameter +from executorch.exir._serialize._program import Cord from executorch.exir.backend.backend_details import ( BackendDetails, @@ -39,6 +40,29 @@ @final class MPSBackend(BackendDetails): + @staticmethod + def slice_len_max(s): + assert s.start is not None + assert s.stop is not None + step = 1 + if s.step is not None: + step = s.step + return max((s.stop - s.start) // step, 1) + + MAGIC_IX: ClassVar[slice] = slice(4, 8) + DATA_SEGMENT_OFFSET_IX: ClassVar[slice] = slice(8, 16) + DATA_SEGMENT_SIZE_IX: ClassVar[slice] = slice(16, 24) + + # magic bytes that should be at the beginning of the header + EXPECTED_MAGIC: ClassVar[bytes] = b"MP00" + # The length of the header in bytes + EXPECTED_LENGTH: ClassVar[int] = ( + 4 + + slice_len_max(MAGIC_IX) + + slice_len_max(DATA_SEGMENT_OFFSET_IX) + + slice_len_max(DATA_SEGMENT_SIZE_IX) + ) + @staticmethod def preprocess( edge_program: ExportedProgram, @@ -67,6 +91,7 @@ def preprocess( output_ids=[], constant_ids=[], graph_type=OpType.mps_graph, + constant_segment=DataSegment(0, 0), ) convert_model_to_fp16 = True @@ -100,10 +125,43 @@ def preprocess( else: op_handler[node.op](edge_program, node_visitors, node, mps_graph) + segment_data, mps_graph = _extract_constant_segment(mps_graph) if logging.DEBUG >= logging.root.level: pretty_print(mps_graph) - return PreprocessResult(processed_bytes=convert_to_flatbuffer(mps_graph)) + # Add to aggregate segments cord with padding. 
+ padding_length = _padding_required(len(segment_data), 16) + if padding_length > 0: + segment_data.append(b"\x00" * padding_length) + + # Combine mps_graph with segment data + combined = Cord() + graph_bytes = convert_to_flatbuffer(mps_graph) + + data_segment_offset: int = MPSBackend.EXPECTED_LENGTH + data_segment_offset = data_segment_offset + len(graph_bytes) + + graph_padding_length = _padding_required(data_segment_offset, 16) + data_segment_offset = data_segment_offset + graph_padding_length + data_segment_size = len(segment_data) + + data: bytes = ( + b"\x00\x00\x00\x00" + + MPSBackend.EXPECTED_MAGIC + + data_segment_offset.to_bytes(8, byteorder="little") + + data_segment_size.to_bytes(8, byteorder="little") + ) + assert len(data) == MPSBackend.EXPECTED_LENGTH + + combined.append(data) + combined.append(graph_bytes) + + if graph_padding_length > 0: + combined.append(b"\x00" * graph_padding_length) + # Append the segment data to the end of the mps graph + combined.append(segment_data) + + return PreprocessResult(processed_bytes=bytes(combined)) @staticmethod def handle_call_function( @@ -138,10 +196,8 @@ def handle_placeholder( node: torch.fx.Node, mps_graph: MPSGraph, ) -> None: - # Handle only constants. Placeholders have already - # been visited in `process_input_placeholders` - if is_parameter(edge_program, node): - node_visitors[node.op].define_tensor(node, mps_graph) + # Constants are handled directly when visiting the nodes. + pass @staticmethod def handle_output( @@ -164,12 +220,41 @@ def handle_get_attr( pass +def _padding_required(offset: int, alignment: int) -> int: + """Returns the padding required to align `offset` to `alignment`.""" + remainder: int = offset % alignment + if remainder != 0: + return alignment - remainder + return 0 + + +def _extract_constant_segment(mps_graph: MPSGraph) -> Tuple[Cord, MPSGraph]: + """Extracts the constant segment from the MPSGraph and returns the updated MPSGraph along with the segment data.""" + # Note that the beginning of the segment data is not aligned. Need to handle out of this call. + segment_data = Cord() + offset = 0 + for i in range(len(mps_graph.mps_values)): + tensor = mps_graph.mps_values[i] + if tensor.constant_buffer_size > 0: + # Notice that buffer is already force aligned so we don't need to pad it + segment_data.append(tensor.constant_buffer.storage) + + # Reset buffer to empty + tensor.constant_buffer = Buffer(storage=b"") + # Update segment offset + tensor.segment_offset = offset + offset += tensor.constant_buffer_size + + return segment_data, mps_graph + + def tensor_to_str(mps_tensor: MPSTensor): tensor_str = "MPSTensor(" tensor_str += "datatype=" + str(mps_tensor.datatype) + ", " tensor_str += "num_dims=" + str(mps_tensor.num_dims) + ", " tensor_str += "dims=" + str(mps_tensor.dims) + ", " - tensor_str += "constant_buffer_size=" + str(mps_tensor.constant_buffer_size) + tensor_str += "constant_buffer_size=" + str(mps_tensor.constant_buffer_size) + ", " + tensor_str += "segment_offset=" + str(mps_tensor.segment_offset) tensor_str += ")" return tensor_str @@ -193,3 +278,4 @@ def pretty_print(mps_graph: MPSGraph): logging.info(" Output ids:") for out_id in mps_graph.output_ids: logging.info(f" {out_id}") + logging.info(f" Constant segment: {mps_graph.constant_segment}") diff --git a/backends/apple/mps/operators/__init__.py b/backends/apple/mps/operators/__init__.py index 4c5c09c00b..0e474bdfe1 100644 --- a/backends/apple/mps/operators/__init__.py +++ b/backends/apple/mps/operators/__init__.py @@ -4,9 +4,8 @@ # from . 
import ( # noqa - # Activation ops activation_ops, - # binary ops + # Binary ops binary_ops, # Clamp ops clamp_ops, @@ -22,6 +21,10 @@ normalization_ops, op_clone, op_getitem, + # Quant-Dequant ops + op_quant_dequant, + # Skip ops + op_skip_ops, # Pad ops pad_ops, # Pooling ops @@ -32,7 +35,7 @@ reduce_ops, # Shape ops shape_ops, - # unary ops + # Unary ops unary_ops, ) @@ -41,8 +44,6 @@ op_clone, # Binary ops binary_ops, - # Unary ops - unary_ops, # Activation ops activation_ops, # Linear algebra ops @@ -67,4 +68,10 @@ pad_ops, # Range ops range_ops, + # Unary ops + unary_ops, + # Quant-Dequant ops + op_quant_dequant, + # Skip ops + op_skip_ops, ] diff --git a/backends/apple/mps/operators/node_visitor.py b/backends/apple/mps/operators/node_visitor.py index 0b9b2d5512..2b443134bf 100644 --- a/backends/apple/mps/operators/node_visitor.py +++ b/backends/apple/mps/operators/node_visitor.py @@ -72,11 +72,12 @@ def define_tensor( self, node: torch.fx.Node, mps_graph: MPSGraph, + mps_data_type: MPSDataType = None, ) -> int: """Defines a tensor value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph + node (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ @@ -89,7 +90,7 @@ def define_tensor( # Get a unique id for the node. id = self.get_serialized_id(node, mps_graph) cb_size, constant_buffer, mps_data_type = self.get_serialized_buffer( - node, mps_graph, id + node, mps_graph, id, mps_data_type ) dims = get_shape(node) @@ -143,6 +144,9 @@ def define_tensor_list(self, node: torch.fx.Node, mps_graph: MPSGraph) -> List[i mps_graph.mps_values.append(mps_tensor) return self.tensor_to_id[node] + def hash_tensor(self, tensor): + return hash(tuple(tensor.reshape(-1).tolist())) + def define_constant( self, constant_tensor: torch.tensor, @@ -151,13 +155,16 @@ def define_constant( """Defines a scalar value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph + constant_tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ constant_tensor = constant_tensor.contiguous() - # MPS TODO: cache these values - id = len(mps_graph.mps_values) - self.tensor_to_id[constant_tensor] = id + hash = self.hash_tensor(constant_tensor) + if hash in self.tensor_to_id: + return self.tensor_to_id[hash] + + id = self.get_serialized_id(constant_tensor, mps_graph, hash) + mps_data_type = edge_dtype_to_mps_dtype(constant_tensor.dtype) constant_buffer_size, constant_buffer, mps_data_type = self.get_serialized_data( constant_tensor, mps_graph, mps_data_type, id @@ -184,14 +191,14 @@ def define_scalar( """Defines a scalar value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ assert isinstance(val, int) or isinstance(val, float) - # MPS TODO: cache these values - id = len(mps_graph.mps_values) - self.tensor_to_id[val] = id + if val in self.tensor_to_id: + return self.tensor_to_id[val] + + id = self.get_serialized_id(val, mps_graph, val) tensor = torch.tensor(val) constant_buffer_size, constant_buffer, mps_data_type = self.get_serialized_data( @@ -214,19 +221,22 @@ def get_serialized_buffer( node: torch.fx.Node, mps_graph: MPSGraph, node_id: int, + mps_data_type: MPSDataType = None, ) -> Tuple[int, Buffer, MPSDataType]: """ If 
tensor holds some constant data, serialize it and return the index of its placement in the constant buffer Args: - tensor (torch.fx.Node): _description_ + node (torch.fx.Node): _description_ mps_graph (MPSGraph): _description_ Returns: _type_: _description_ """ - mps_data_type = self.get_serialized_dtype(node) + mps_data_type = ( + self.get_serialized_dtype(node) if mps_data_type is None else mps_data_type + ) # Check if this node is a lifted parameter if not is_parameter(self.exported_program, node): @@ -255,6 +265,22 @@ def get_serialized_data( if id not in mps_graph.constant_ids: mps_graph.constant_ids.append(id) + if ( + mps_data_type is MPSDataType.mps_data_type_int4 + and tensor.dtype is torch.int8 + ): + if tensor.dim() != 2: + raise RuntimeError(f"Unexpected tensor shape {tensor.shape}") + + tensor = tensor.to(dtype=torch.int32) + tensor = (((tensor[::, ::2] & 0x0F) << 4) | (tensor[::, 1::2] & 0x0F)).to( + torch.uint8 + ) + tensor = ( + torch._convert_weight_to_int4pack(tensor.to("mps"), 2) + .cpu() + .view(dtype=torch.uint8) + ) array_type = ctypes.c_char * tensor.untyped_storage().nbytes() array = ctypes.cast( tensor.untyped_storage().data_ptr(), @@ -265,32 +291,40 @@ def get_serialized_data( return tensor.untyped_storage().nbytes(), buffer, mps_data_type def get_serialized_id( - self, node: Union[torch.fx.Node, float, int], mps_graph: MPSGraph + self, node: Union[torch.fx.Node, float, int], mps_graph: MPSGraph, hash=None ) -> int: """ Map a tensor to a unique id. If the tensor was already mapped, return the existent id. Args: - tensor (Union[torch.fx.Node, float]): _description_ + node (Union[torch.fx.Node, float]): _description_ mps_graph (MPSGraph): _description_ Returns: int: _description_ """ - if node in self.tensor_to_id: + if hash is not None and hash in self.tensor_to_id: + return self.tensor_to_id[hash] + elif node in self.tensor_to_id: return self.tensor_to_id[node] id = len(mps_graph.mps_values) - self.tensor_to_id[node] = id + if hash is not None: + self.tensor_to_id[hash] = id + else: + self.tensor_to_id[node] = id return id + def torch_dtype_to_mps_dtype(self, torch_dtype: torch.dtype) -> MPSDataType: + return edge_dtype_to_mps_dtype(torch_dtype) + def get_serialized_dtype( self, node: torch.fx.Node, ) -> MPSDataType: - return edge_dtype_to_mps_dtype(node.meta["val"].dtype) + return self.torch_dtype_to_mps_dtype(node.meta["val"].dtype) def create_tertiary_node( self, node: torch.fx.Node, mps_graph: MPSGraph, tertiary_op: MPSNodeUnion diff --git a/backends/apple/mps/operators/op_quant_dequant.py b/backends/apple/mps/operators/op_quant_dequant.py new file mode 100644 index 0000000000..970f3900f8 --- /dev/null +++ b/backends/apple/mps/operators/op_quant_dequant.py @@ -0,0 +1,142 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. 
+# + +import logging +from typing import cast + +import torch +from executorch.backends.apple.mps.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.apple.mps.serialization.mps_graph_schema import ( + MPSDataType, + MPSDequantizePerChannelGroup, + MPSGraph, + MPSNode, +) +from executorch.backends.apple.mps.utils.mps_utils import get_input_node + +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.DEBUG, format=FORMAT) + + +@register_node_visitor +class OpDequantizePerChannelGroupDefault(NodeVisitor): + target = "quantized_decomposed.dequantize_per_channel_group.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + mps_graph: MPSGraph, + ) -> None: + # Weights placeholders shouldn't have been defined until this point + if get_input_node(node, 0) in self.tensor_to_id: + raise RuntimeError( + f"Placeholder for {node.target.__name__} already visited" + ) + output_id = self.define_tensor(node, mps_graph) + input_id = self.define_tensor( + get_input_node(node, 0), mps_graph, MPSDataType.mps_data_type_int4 + ) + scales_id = self.define_tensor(get_input_node(node, 1), mps_graph) + + # there are no zero points in this quantization method (node.args[2] is all zeros) + zero_points_id = -1 + quant_min = cast(int, node.args[3]) + quant_max = cast(int, node.args[4]) + dtype = self.torch_dtype_to_mps_dtype(node.args[5]) + group_size = cast(int, node.args[6]) + output_dtype = self.torch_dtype_to_mps_dtype(node.args[7]) + + dequant_node = MPSNode( + mpsnode_union=MPSDequantizePerChannelGroup( + input1_id=input_id, + output_id=output_id, + scales_id=scales_id, + zero_points_id=zero_points_id, + quant_min=quant_min, + quant_max=quant_max, + dtype=dtype, + group_size=group_size, + output_dtype=output_dtype, + ) + ) + mps_graph.mps_nodes.append(dequant_node) + + +@register_node_visitor +class OpQuantizePerToken(NodeVisitor): + """ + Dynamic Quantize Per Token Node visitor + """ + + target = "quantized_decomposed.quantize_per_token.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + mps_graph: MPSGraph, + ) -> None: + """ + Skip activation dynamic quantization for now. + Currently all matmuls are going through [FP16/BF16] @ [QInt4/QInt8]. + Issue: #133407308 + """ + dq_input = self.define_tensor(get_input_node(node, 0), mps_graph) + self.tensor_to_id[node] = dq_input + + +@register_node_visitor +class OpDequantizePerToken(NodeVisitor): + """ + Dequantize Per Token Node visitor + """ + + target = "quantized_decomposed.dequantize_per_token.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + mps_graph: MPSGraph, + ) -> None: + """ + Skip activation dynamic quantization for now. + Currently all matmuls are going through [FP16/BF16] @ [QInt4/QInt8]. + Issue: #133407308 + """ + dq_input = self.define_tensor(get_input_node(node, 0), mps_graph) + self.tensor_to_id[node] = dq_input + + +@register_node_visitor +class OpChooseQparamsToken(NodeVisitor): + """ + do nothing if node is choose_qparams_per_token_asymmetric.tensor + """ + + target = "quantized_decomposed.choose_qparams_per_token_asymmetric.default" + + def define_node( + self, + node: torch.fx.Node, + mps_graph: MPSGraph, + ) -> None: + """ + Skip activation dynamic quantization for now. 
+ Currently all matmuls are going through [FP16/BF16] @ [QInt4/QInt8]. + Issue: #133407308 + """ + input_id = self.define_tensor(get_input_node(node, 0), mps_graph) + self.tensor_to_id[node] = [input_id, input_id] diff --git a/backends/apple/mps/operators/op_skip_ops.py b/backends/apple/mps/operators/op_skip_ops.py new file mode 100644 index 0000000000..f654ac2439 --- /dev/null +++ b/backends/apple/mps/operators/op_skip_ops.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. +# + +import torch +from executorch.backends.apple.mps.operators.node_visitor import NodeVisitor +from executorch.backends.apple.mps.serialization.mps_graph_schema import MPSGraph + + +class OpSkipOps(NodeVisitor): + """ + Parent Class for handling Skip Ops + """ + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + mps_graph: MPSGraph, + ) -> None: + return diff --git a/backends/apple/mps/partition/mps_partitioner.py b/backends/apple/mps/partition/mps_partitioner.py index e84b0d26a9..6e8083fd63 100644 --- a/backends/apple/mps/partition/mps_partitioner.py +++ b/backends/apple/mps/partition/mps_partitioner.py @@ -26,8 +26,7 @@ from torch.fx.passes.operator_support import OperatorSupportBase FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" -logging.basicConfig(level=logging.DEBUG, format=FORMAT) - +logging.basicConfig(level=logging.INFO, format=FORMAT) # ops implemented as Metal kernels. METAL_KERNELS = [ diff --git a/backends/apple/mps/runtime/MPSBackend.mm b/backends/apple/mps/runtime/MPSBackend.mm index e2070bf2c6..cb96edbeb2 100644 --- a/backends/apple/mps/runtime/MPSBackend.mm +++ b/backends/apple/mps/runtime/MPSBackend.mm @@ -19,7 +19,7 @@ namespace torch { namespace executor { -class MPSBackend final : public PyTorchBackendInterface { +class MPSBackend final : public ::executorch::runtime::BackendInterface { public: ~MPSBackend() = default; @@ -55,7 +55,7 @@ bool is_available() const override { // Function that actually executes the model in the backend. Error execute( - __ET_UNUSED BackendExecutionContext& context, + ET_UNUSED BackendExecutionContext& context, DelegateHandle* handle, EValue** args) const override { auto executor = static_cast(handle); diff --git a/backends/apple/mps/runtime/MPSCompiler.h b/backends/apple/mps/runtime/MPSCompiler.h index 4bc2f60ee4..7f449d481b 100644 --- a/backends/apple/mps/runtime/MPSCompiler.h +++ b/backends/apple/mps/runtime/MPSCompiler.h @@ -24,7 +24,7 @@ class MPSCompiler { // Takes Flatbuffer Serialized MPS Model and rebuilds the MPSGraphExecutable // returns an executor object that holds the MPS runtime object which we // can then use to set inputs and run inference using the MPSGraphExecutable. - __ET_NODISCARD static Error compileModel( + ET_NODISCARD static Error compileModel( const void* buffer_pointer, size_t num_bytes, MPSExecutor* executor, diff --git a/backends/apple/mps/runtime/MPSCompiler.mm b/backends/apple/mps/runtime/MPSCompiler.mm index 560c1bb0a1..9a58016a77 100644 --- a/backends/apple/mps/runtime/MPSCompiler.mm +++ b/backends/apple/mps/runtime/MPSCompiler.mm @@ -32,7 +32,7 @@ Builds the mps runtime object using the buffer pointer. The buffer pointer must be a valid pointer to the serialized mps object. 
*/ -__ET_NODISCARD Error MPSCompiler::compileModel( +ET_NODISCARD Error MPSCompiler::compileModel( const void* buffer_pointer, size_t num_bytes, MPSExecutor* executor, @@ -43,7 +43,7 @@ Error err = Error::Ok; std::unique_ptr mpsGraphBuilder( - new MPSGraphBuilder(buffer_pointer, executor->_mpsGraphTensorToId)); + new MPSGraphBuilder(buffer_pointer, num_bytes, executor->_mpsGraphTensorToId)); err = mpsGraphBuilder->compileModel(); ET_CHECK_OR_RETURN_ERROR( err == Error::Ok, Internal, "Failed to construct the MPS graph object"); diff --git a/backends/apple/mps/runtime/MPSDelegateHeader.h b/backends/apple/mps/runtime/MPSDelegateHeader.h new file mode 100644 index 0000000000..07a138b918 --- /dev/null +++ b/backends/apple/mps/runtime/MPSDelegateHeader.h @@ -0,0 +1,113 @@ +// +// Copyright (c) 2024 Apple Inc. All rights reserved. +// Provided subject to the LICENSE file in the top level directory. +// + +#pragma once + +#include + +namespace torch { +namespace executor { +namespace mps { +namespace delegate { + +/** + * MPS-header that is embedded before the flatbuffer payload + * + */ +struct MPSDelegateHeader { + /** + * The minimum size of the MPSDelegateHeader. The caller should provide at + * least this many bytes of the head of the serialized MPS Data + */ + static constexpr size_t kMinSize = 30; + + /** + * The magic offset. This offset is the same as the offset for flatbuffer + * header so we will be able to check if the header is is either the + * flatbuffer head or the wrapper header we introduce here + */ + static constexpr size_t kMagicOffset = 4; + + /** + * The magic bytes that identify the header. + * + * This is the canonical definition of the expected value. If the header + * layout ever changes in a compatibility-breaking way, increment the digits + * in the magic. But, doing so will prevent older binaries from recognizing + * the presence of the header. The compatibility-preserving way to make + * changes is to increase the header's length field and add new fields at the + * end. + */ + static constexpr size_t kMagicSize = 4; + static constexpr char kMagic[kMagicSize] = {'M', 'P', '0', '0'}; + + /** + * The size in bytes of the header length. We store 2 bytes for the header + * length + */ + static constexpr size_t kHeaderLengthSize = 2; + + /** + * The expected location of the header length field relative to the beginning + * of the header. + */ + static constexpr size_t kHeaderLengthOffset = + MPSDelegateHeader::kMagicOffset + MPSDelegateHeader::kMagicSize; + + /* + * The expected location of the constant data offset field relative to the + * beginning of the header. + */ + static constexpr size_t kConstantDataSegmentOffset = kHeaderLengthOffset; + + /* + * The expected location of the constant data size field relative to the + * beginning of the header. + */ + static constexpr size_t kConstantDataSizeOffset = + kConstantDataSegmentOffset + sizeof(uint64_t); + + /** + * The expected location of the flatbuffer data offset field relative to the + * beginning of the header. + */ + static constexpr size_t kFlatbufferDataOffsetOffset = + kConstantDataSizeOffset + sizeof(uint64_t); + + /** + * Look for and parse an ExtendedHeader in the provided data. + * + * @param[in] data The contents of the beginning of the serialized binary + * Program data, starting at offset 0 (i.e., the head of the file). + * @param[in] size Length of `data` in bytes. + * + * @returns an MPSHeader if the header was found and is valid. 
Returns an + * error if size was too short, if the header was not found, or if the + * header appeared to be corrupt. + */ + static Result Parse(const void* data, size_t size); + + /** + * The offset in bytes to the beginning of the constant data. + */ + uint64_t constant_data_offset; + /** + * The size in bytes of the constant data. + */ + uint64_t constant_data_size; + /** + * The offset in bytes to the beginning of the flatbuffer data. + */ + uint64_t flatbuffer_offset; + /** + * The size in bytes of the flatbuffer data. + */ + uint64_t flatbuffer_size; +}; + +} // namespace delegate +} // namespace mps +} // namespace executor +} // namespace torch diff --git a/backends/apple/mps/runtime/MPSDelegateHeader.mm b/backends/apple/mps/runtime/MPSDelegateHeader.mm new file mode 100644 index 0000000000..2994b30507 --- /dev/null +++ b/backends/apple/mps/runtime/MPSDelegateHeader.mm @@ -0,0 +1,53 @@ +// +// Copyright (c) 2024 Apple Inc. All rights reserved. +// Provided subject to the LICENSE file in the top level directory. +// + +#include + +#include + +#include +#include + +namespace torch { +namespace executor { +namespace mps { +namespace delegate { + +/// Interprets the 8 bytes at `data` as a little-endian uint64_t. +uint64_t getUInt64LE(const uint8_t* data) { + return (uint64_t)data[0] | ((uint64_t)data[1] << 8) | + ((uint64_t)data[2] << 16) | ((uint64_t)data[3] << 24) | + ((uint64_t)data[4] << 32) | ((uint64_t)data[5] << 40) | + ((uint64_t)data[6] << 48) | ((uint64_t)data[7] << 56); +} + +Result MPSDelegateHeader::Parse(const void* data, size_t size) { + const uint8_t* header_data = (const uint8_t*)data; + + if (size < MPSDelegateHeader::kMinSize) { + return Error::InvalidArgument; + } + + const uint8_t* magic_start = header_data + MPSDelegateHeader::kMagicOffset; + if (std::memcmp(magic_start, MPSDelegateHeader::kMagic, MPSDelegateHeader::kMagicSize) != 0) { + return Error::NotFound; + } + + uint64_t constant_data_offset = getUInt64LE(header_data + MPSDelegateHeader::kConstantDataSegmentOffset); + uint64_t constant_data_size = getUInt64LE(header_data + MPSDelegateHeader::kConstantDataSizeOffset); + uint64_t flatbuffer_offset = MPSDelegateHeader::kFlatbufferDataOffsetOffset; + uint64_t flatbuffer_size = size - flatbuffer_offset; + + return MPSDelegateHeader{ + constant_data_offset, + constant_data_size, + flatbuffer_offset, + flatbuffer_size}; +} + +} // namespace delegate +} // namespace mps +} // namespace executor +} // namespace torch diff --git a/backends/apple/mps/runtime/MPSDevice.h b/backends/apple/mps/runtime/MPSDevice.h index a8b5dbe2b8..27c0758246 100644 --- a/backends/apple/mps/runtime/MPSDevice.h +++ b/backends/apple/mps/runtime/MPSDevice.h @@ -32,6 +32,7 @@ enum class MacOSVersion : uint32_t { MACOS_VER_13_2_PLUS, MACOS_VER_13_3_PLUS, MACOS_VER_14_0_PLUS, + MACOS_VER_15_0_PLUS, }; enum class LibraryType : uint32_t { @@ -82,7 +83,8 @@ class MPSDevice { MPSDevice(); }; -bool isMacOS13OrNewer(MacOSVersion version = MacOSVersion::MACOS_VER_13_0_PLUS); +bool is_macos_13_or_newer( + MacOSVersion version = MacOSVersion::MACOS_VER_13_0_PLUS); } // namespace delegate } // namespace mps diff --git a/backends/apple/mps/runtime/MPSDevice.mm b/backends/apple/mps/runtime/MPSDevice.mm index f51851c379..6f59fec353 100644 --- a/backends/apple/mps/runtime/MPSDevice.mm +++ b/backends/apple/mps/runtime/MPSDevice.mm @@ -76,7 +76,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de static bool _macos_13_3_plus = [compileOptions 
respondsToSelector:@selector(maxTotalThreadsPerThreadgroup)] == YES; static bool _macos_14_0_plus = [mpsCD instancesRespondToSelector:@selector(conjugateWithTensor:name:)] == YES; - + static bool _macos_15_0_plus = [mpsCD instancesRespondToSelector:@selector(scaledDotProductAttentionWithQueryTensor:keyTensor:valueTensor:maskTensor:scale:name:)] == YES; switch (version) { case MacOSVersion::MACOS_VER_13_0_PLUS: return _macos_13_0_plus; @@ -88,6 +88,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de return _macos_13_3_plus; case MacOSVersion::MACOS_VER_14_0_PLUS: return _macos_14_0_plus; + case MacOSVersion::MACOS_VER_15_0_PLUS: + return _macos_15_0_plus; default: return false; } @@ -144,7 +146,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de return err; } -bool isMacOS13OrNewer(MacOSVersion version) { +bool is_macos_13_or_newer(MacOSVersion version) { return MPSDevice::getInstance()->isMacOS13Plus(version); } diff --git a/backends/apple/mps/runtime/MPSExecutor.h b/backends/apple/mps/runtime/MPSExecutor.h index 847d00ac74..8d55edb452 100644 --- a/backends/apple/mps/runtime/MPSExecutor.h +++ b/backends/apple/mps/runtime/MPSExecutor.h @@ -73,9 +73,9 @@ class MPSExecutor { return _executable; } - __ET_NODISCARD Error forward(std::vector& outputs); + ET_NODISCARD Error forward(std::vector& outputs); - __ET_NODISCARD Error + ET_NODISCARD Error set_inputs_outputs(std::vector& inputs, std::vector& outputs); Error initDataBuffers(); diff --git a/backends/apple/mps/runtime/MPSExecutor.mm b/backends/apple/mps/runtime/MPSExecutor.mm index 64f3ae51d0..8032d9a6a7 100644 --- a/backends/apple/mps/runtime/MPSExecutor.mm +++ b/backends/apple/mps/runtime/MPSExecutor.mm @@ -29,7 +29,7 @@ @interface MPSGraphExecutable() #if TARGET_OS_SIMULATOR or defined(__x86_64__) _use_shared_mem = false; #endif - if (!isMacOS13OrNewer(MacOSVersion::MACOS_VER_14_0_PLUS)) { + if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { _use_shared_mem = false; } @@ -37,7 +37,7 @@ @interface MPSGraphExecutable() _outputsArray = [[NSMutableArray alloc] initWithCapacity:getNumOutputs()]; } -__ET_NODISCARD Error +ET_NODISCARD Error MPSExecutor::set_inputs_outputs(std::vector& inputs, std::vector& outputs) { ET_CHECK_OR_RETURN_ERROR(inputs.size() == getNumInputs(), Internal, "Inputs mismatch"); ET_CHECK_OR_RETURN_ERROR(outputs.size() == getNumOutputs(), Internal, "Outputs mismatch"); @@ -61,7 +61,7 @@ @interface MPSGraphExecutable() return Error::Ok; } -__ET_NODISCARD Error MPSExecutor::forward(std::vector& outputs) { +ET_NODISCARD Error MPSExecutor::forward(std::vector& outputs) { Error err = Error::Ok; MPSStream* mpsStream = getDefaultMPSStream(); if (mpsStream->commitAndContinueEnabled() || mpsStream->hasLiveCommandBuffer()) { diff --git a/backends/apple/mps/runtime/MPSGraphBuilder.h b/backends/apple/mps/runtime/MPSGraphBuilder.h index 29b9471ae9..ea7f9f818a 100644 --- a/backends/apple/mps/runtime/MPSGraphBuilder.h +++ b/backends/apple/mps/runtime/MPSGraphBuilder.h @@ -16,6 +16,7 @@ #include // MPS headers +#include #include #include #include @@ -40,7 +41,8 @@ using NodePtr = const mpsgraph::MPSNode *; */ class MPSGraphBuilder { public: - MPSGraphBuilder(const void *buffer_pointer, std::unordered_map &mpsGraphTensorToId); + MPSGraphBuilder(const void *buffer_pointer, size_t num_bytes, + std::unordered_map &mpsGraphTensorToId); ~MPSGraphBuilder() = default; Error compileModel(); @@ -154,6 +156,8 @@ class MPSGraphBuilder { _DEFINE_MPS_OP(ConstantPadND); // Range ops 
_DEFINE_MPS_OP(Arange); + // Quant-Dequant ops + _DEFINE_MPS_OP(DequantizePerChannelGroup); // Helper functions Error addNodeToMPSGraph(NodePtr nodePtr); @@ -178,12 +182,15 @@ class MPSGraphBuilder { const mpsgraph::MPSGraph *_flatBufferGraph; // FlatBuffer raw bytes of the serialized MPS model. const void *_buffer_pointer; + size_t _num_bytes; bool _metal_kernel; MPSGraph *_mpsGraph; MPSGraphExecutable *_mpsGraphExecutable; NSMutableDictionary *_feeds; NSMutableArray *_targetTensors; + + const uint8_t *_constant_data_ptr; }; #undef _DEFINE_MPS_OP diff --git a/backends/apple/mps/runtime/MPSGraphBuilder.mm b/backends/apple/mps/runtime/MPSGraphBuilder.mm index 8b571001d4..a11cb638fb 100644 --- a/backends/apple/mps/runtime/MPSGraphBuilder.mm +++ b/backends/apple/mps/runtime/MPSGraphBuilder.mm @@ -5,13 +5,19 @@ #include #include +#include namespace torch { namespace executor { namespace mps { namespace delegate { -MPSGraphBuilder::MPSGraphBuilder(const void* buffer_pointer, std::unordered_map& mpsGraphTensorToId) : _mpsGraphTensorToId(mpsGraphTensorToId), _buffer_pointer(buffer_pointer) { +MPSGraphBuilder::MPSGraphBuilder( + const void* buffer_pointer, + size_t num_bytes, + std::unordered_map& mpsGraphTensorToId) : + _mpsGraphTensorToId(mpsGraphTensorToId), _buffer_pointer(buffer_pointer), _num_bytes(num_bytes) { + _mpsGraph = [MPSGraph new]; _feeds = [NSMutableDictionary dictionary]; _targetTensors = [NSMutableArray new]; @@ -24,15 +30,36 @@ MPSGraphBuilder::compileModel() { Error err = Error::Ok; - ET_CHECK(_buffer_pointer != nullptr); + Result header = MPSDelegateHeader::Parse(_buffer_pointer, _num_bytes); + const uint8_t* flatbuffer_data_ptr = nullptr; + + if (header.ok()) { + flatbuffer_data_ptr = reinterpret_cast(_buffer_pointer) + + header->flatbuffer_offset; + _constant_data_ptr = reinterpret_cast(_buffer_pointer) + + header->constant_data_offset; + } else if (header.error() == Error::NotFound) { + ET_LOG( + Error, + "MPSDelegateHeader version mismatch: '%.4s' != expected '%.4s'", + // Header Magic and FlatbufferIdentifier are same offset and size + flatbuffers::GetBufferIdentifier(_buffer_pointer), + MPSDelegateHeader::kMagic); + return header.error(); + } else { + ET_LOG(Error, "MPSDelegateHeader may be corrupt"); + return header.error(); + } + + ET_CHECK(flatbuffer_data_ptr != nullptr); ET_CHECK_OR_RETURN_ERROR( - mpsgraph::MPSGraphBufferHasIdentifier(_buffer_pointer), + mpsgraph::MPSGraphBufferHasIdentifier(flatbuffer_data_ptr), DelegateInvalidCompatibility, "MPS Delegate Serialization Format version identifier '%.4s' != expected '%.4s'", - flatbuffers::GetBufferIdentifier(_buffer_pointer), + flatbuffers::GetBufferIdentifier(flatbuffer_data_ptr), mpsgraph::MPSGraphIdentifier()); - _flatBufferGraph = mpsgraph::GetMPSGraph(_buffer_pointer); + _flatBufferGraph = mpsgraph::GetMPSGraph(flatbuffer_data_ptr); switch (_flatBufferGraph->graph_type()) { case mpsgraph::OpType::metal_kernel: { diff --git a/backends/apple/mps/runtime/MPSStream.h b/backends/apple/mps/runtime/MPSStream.h index aa39e5799c..def926c982 100644 --- a/backends/apple/mps/runtime/MPSStream.h +++ b/backends/apple/mps/runtime/MPSStream.h @@ -63,7 +63,7 @@ class MPSStream { MPSCommandBuffer* commandBuffer(); id commandEncoder(); void endKernelCoalescing(); - __ET_NODISCARD Error synchronize(SyncType syncType); + ET_NODISCARD Error synchronize(SyncType syncType); bool commitAndContinueEnabled(); void copy( id srcBuffer, diff --git a/backends/apple/mps/runtime/MPSStream.mm b/backends/apple/mps/runtime/MPSStream.mm index 
5dac5c913a..652d44aa02 100644 --- a/backends/apple/mps/runtime/MPSStream.mm +++ b/backends/apple/mps/runtime/MPSStream.mm @@ -55,7 +55,7 @@ @interface MPSGraphExecutionDescriptor () return _commandEncoder; } -__ET_NODISCARD +ET_NODISCARD Error MPSStream::synchronize(SyncType syncType) { endKernelCoalescing(); switch(syncType) { @@ -157,7 +157,7 @@ @interface MPSGraphExecutionDescriptor () endKernelCoalescing(); if (@available(iOS 13.0, *)) { id blitEncoder = [commandBuffer() blitCommandEncoder]; - + [blitEncoder copyFromBuffer:srcBuffer sourceOffset:(NSUInteger)srcOffset toBuffer:dstBuffer diff --git a/backends/apple/mps/runtime/operations/BinaryOps.mm b/backends/apple/mps/runtime/operations/BinaryOps.mm index fa366d93e7..0eba5bc805 100644 --- a/backends/apple/mps/runtime/operations/BinaryOps.mm +++ b/backends/apple/mps/runtime/operations/BinaryOps.mm @@ -119,7 +119,7 @@ graphNode->output_id() \ ); \ ET_CHECK_OR_RETURN_ERROR( \ - isMacOS13OrNewer(), NotSupported, \ + is_macos_13_or_newer(), NotSupported, \ "%s supported by MPS on MacOS13.0+/iOS16.1+", #aot_name); \ \ _idToMPSGraphTensor[graphNode->output_id()] = binaryOpTensor( \ @@ -176,7 +176,7 @@ return inputTensor; } - if (!isMacOS13OrNewer(MacOSVersion::MACOS_VER_13_0_PLUS)) { + if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_0_PLUS)) { MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0 dataType:inputTensor.dataType]; MPSGraphTensor* predicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor secondaryTensor:zeroTensor diff --git a/backends/apple/mps/runtime/operations/MPSGraphSequoiaOps.h b/backends/apple/mps/runtime/operations/MPSGraphSequoiaOps.h new file mode 100644 index 0000000000..ca5474eb13 --- /dev/null +++ b/backends/apple/mps/runtime/operations/MPSGraphSequoiaOps.h @@ -0,0 +1,56 @@ +#pragma once + +#include + +#if !defined(__MAC_15_0) && (!defined(MAC_OS_X_VERSION_15_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_15_0)) + +#define MPSDataTypeInt4 ((MPSDataType)(MPSDataTypeSignedBit | 4)) + +@interface MPSNDArrayIdentity : MPSNDArrayUnaryKernel +- (MPSNDArray *__nullable)reshapeWithCommandBuffer:(__nullable id)cmdBuf + sourceArray:(MPSNDArray *__nonnull)sourceArray + shape:(MPSShape *__nonnull)shape + destinationArray:(MPSNDArray *__nullable)destinationArray; +@end + +@interface MPSNDArrayDescriptor () +@property(readwrite, nonatomic) BOOL preferPackedRows; +@end + +@interface MPSNDArray () +- (nonnull instancetype)initWithBuffer:(id _Nonnull)buffer + offset:(NSUInteger)offset + descriptor:(MPSNDArrayDescriptor *_Nonnull)descriptor; +- (MPSNDArray *__nullable)arrayViewWithShape:(MPSShape *_Nullable)shape strides:(MPSShape *_Nonnull)strides; +@end + +@interface MPSNDArrayQuantizationDescriptor : NSObject +@end + +@interface MPSNDArrayQuantizedMatrixMultiplication : MPSNDArrayMatrixMultiplication +- (nonnull instancetype)initWithDevice:(nonnull id)device + leftQuantizationDescriptor:(MPSNDArrayQuantizationDescriptor *_Nullable)leftQuantizationDescriptor + rightQuantizationDescriptor:(MPSNDArrayQuantizationDescriptor *_Nullable)rightQuantizationDescriptor; + +- (void)encodeToCommandEncoder:(id _Nullable)encoder + commandBuffer:(nonnull id)commandBuffer + sourceArrays:(nonnull NSArray *)sourceArrays + destinationArray:(nonnull MPSNDArray *)destination; +@end + +@interface MPSNDArrayAffineQuantizationDescriptor : MPSNDArrayQuantizationDescriptor +- (nonnull instancetype)initWithDataType:(MPSDataType)quantizationDataType + hasZeroPoint:(BOOL)hasZeroPoint + hasMinValue:(BOOL)hasMinValue; 
+@property(readwrite, nonatomic) bool implicitZeroPoint; +@end + +@interface MPSGraph () +- (MPSGraphTensor *_Nonnull)dequantizeTensor:(MPSGraphTensor *_Nonnull)tensor + scaleTensor:(MPSGraphTensor *_Nonnull)scaleTensor + zeroPointTensor:(MPSGraphTensor *_Nonnull)zeroPointTensor + dataType:(MPSDataType)dataType + name:(NSString *_Nullable)name; +@end + +#endif diff --git a/backends/apple/mps/runtime/operations/OperationUtils.mm b/backends/apple/mps/runtime/operations/OperationUtils.mm index 21c4a0d3e7..2336868863 100644 --- a/backends/apple/mps/runtime/operations/OperationUtils.mm +++ b/backends/apple/mps/runtime/operations/OperationUtils.mm @@ -27,9 +27,17 @@ case DataType::mps_data_type_float16: return MPSDataTypeFloat16; case DataType::mps_data_type_float32: + case DataType::mps_data_type_float64: return MPSDataTypeFloat32; case DataType::mps_data_type_int8: return MPSDataTypeInt8; + case DataType::mps_data_type_int4: { + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, *)) { + return MPSDataTypeInt4; + } else { + return ((MPSDataType)(MPSDataTypeSignedBit | 4)); + } + } case DataType::mps_data_type_int16: return MPSDataTypeInt16; case DataType::mps_data_type_int32: @@ -88,10 +96,11 @@ NSData* MPSGraphBuilder::getConstantData(int32_t id) { TensorPtr mpsTensor = _flatBufferGraph->mps_values()->Get(id); - int32_t constantBufferSize = mpsTensor->constant_buffer_size(); - const unsigned char* constantBuffer = mpsTensor->constant_buffer()->storage()->data(); + uint64_t constantBufferSize = mpsTensor->constant_buffer_size(); + uint64_t segmentOffset = mpsTensor->segment_offset(); + const unsigned char* constantBuffer = _constant_data_ptr + segmentOffset; ET_CHECK_MSG(constantBufferSize > 0 && constantBuffer != nullptr, "[ERROR] Invalid constant buffer"); - return [[NSData alloc] initWithBytes:constantBuffer + return [[NSData alloc] initWithBytesNoCopy:(void*)constantBuffer length:constantBufferSize]; } @@ -216,6 +225,8 @@ _DEFINE_MPS_NODE(ConstantPadND); // Range ops _DEFINE_MPS_NODE(Arange); + // Quant-Dequant ops + _DEFINE_MPS_NODE(DequantizePerChannelGroup); case mpsgraph::MPSNodeUnion::NONE: default: @@ -313,32 +324,26 @@ MPSDataType getMPSScalarType(exec_aten::ScalarType scalar_type) { MPSGraphTensor* permuteTensor(MPSGraph* graph, MPSGraphTensor* inputTensor, NSArray* permuteOrder) { - if (isMacOS13OrNewer()) { - return [graph transposeTensor:inputTensor - permutation:permuteOrder - name:nil]; - } else { - NSUInteger srcRank = [[inputTensor shape] count]; - if (srcRank != [permuteOrder count]) { - return nil; - } - - MPSGraphTensor* outputTensor = inputTensor; - std::vector dimensionOrder(srcRank); - std::iota(std::begin(dimensionOrder), std::end(dimensionOrder), 0); + NSUInteger srcRank = [[inputTensor shape] count]; + if (srcRank != [permuteOrder count]) { + return nil; + } - for (int32_t i = 0; i < srcRank; i++) { - NSUInteger axis = [permuteOrder[i] integerValue]; - auto axisIter = std::find(dimensionOrder.begin(), dimensionOrder.end(), axis); - NSUInteger axis1 = i; - NSUInteger axis2 = axisIter - dimensionOrder.begin(); - iter_swap(dimensionOrder.begin() + i, axisIter); + MPSGraphTensor* outputTensor = inputTensor; + std::vector dimensionOrder(srcRank); + std::iota(std::begin(dimensionOrder), std::end(dimensionOrder), 0); - outputTensor = [graph transposeTensor:outputTensor dimension:axis1 withDimension:axis2 name:nil]; - } + for (int32_t i = 0; i < srcRank; i++) { + NSUInteger axis = [permuteOrder[i] integerValue]; + auto axisIter = std::find(dimensionOrder.begin(), 
dimensionOrder.end(), axis); + NSUInteger axis1 = i; + NSUInteger axis2 = axisIter - dimensionOrder.begin(); + iter_swap(dimensionOrder.begin() + i, axisIter); - return outputTensor; + outputTensor = [graph transposeTensor:outputTensor dimension:axis1 withDimension:axis2 name:nil]; } + + return outputTensor; } diff --git a/backends/apple/mps/runtime/operations/QuantDequant.mm b/backends/apple/mps/runtime/operations/QuantDequant.mm new file mode 100644 index 0000000000..c37282f79a --- /dev/null +++ b/backends/apple/mps/runtime/operations/QuantDequant.mm @@ -0,0 +1,52 @@ +// +// Copyright (c) 2024 Apple Inc. All rights reserved. +// Provided subject to the LICENSE file in the top level directory. +// + +#include + +namespace torch { +namespace executor { +namespace mps { +namespace delegate { + +Error +MPSGraphBuilder::mpsDequantizePerChannelGroupOp(NodePtr nodePtr) { + auto graphNode = nodePtr->mpsnode_union_as_MPSDequantizePerChannelGroup(); + ET_LOG( + Debug, "%s: (%d, %d, %d) -> %d", + __FUNCTION__, + graphNode->input1_id(), + graphNode->scales_id(), + graphNode->zero_points_id(), + graphNode->output_id() + ); + + ET_CHECK_OR_RETURN_ERROR( + is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS), + NotImplemented, + "[ERROR] Operation %s is supported starting with macOS 15.0+ | iOS 18.0 + | iPadOS 18+ | tvOS 18+ | visionOS 2.0+ !", + mpsgraph::EnumNameMPSNodeUnion(nodePtr->mpsnode_union_type())); + + MPSGraphTensor* inputTensor = getMPSGraphTensor(graphNode->input1_id()); + MPSGraphTensor* scalesTensor = getMPSGraphTensor(graphNode->scales_id()); + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, *)) { + MPSGraphTensor *zpTensor = [_mpsGraph constantWithScalar:0 + dataType:MPSDataTypeInt4]; + MPSGraphTensor *wDqTensor = [_mpsGraph dequantizeTensor:inputTensor + scaleTensor:scalesTensor + zeroPointTensor:zpTensor + dataType:MPSDataTypeFloat16 + name:nil]; + _idToMPSGraphTensor[graphNode->output_id()] = wDqTensor; + } else { + _idToMPSGraphTensor[graphNode->output_id()] = nil; + } + + return Error::Ok; +} + +} // namespace delegate +} // namespace mps +} // namespace executor +} // namespace torch diff --git a/backends/apple/mps/runtime/operations/UnaryOps.mm b/backends/apple/mps/runtime/operations/UnaryOps.mm index ed06584b27..15a101b664 100644 --- a/backends/apple/mps/runtime/operations/UnaryOps.mm +++ b/backends/apple/mps/runtime/operations/UnaryOps.mm @@ -33,7 +33,7 @@ _idToMPSGraphTensor[graphNode->output_id()] = [_mpsGraph notWithTensor:inputTensor name:nil]; } else { ET_CHECK_OR_RETURN_ERROR( - isMacOS13OrNewer(), NotSupported, + is_macos_13_or_newer(), NotSupported, "mpsBitwiseNotOp supported by MPS on MacOS13.0+/iOS16.1+"); _idToMPSGraphTensor[graphNode->output_id()] = [_mpsGraph bitwiseNOTWithTensor:inputTensor name:nil]; } diff --git a/backends/apple/mps/serialization/mps_graph_schema.py b/backends/apple/mps/serialization/mps_graph_schema.py index 6909926e8c..bd7185feb4 100644 --- a/backends/apple/mps/serialization/mps_graph_schema.py +++ b/backends/apple/mps/serialization/mps_graph_schema.py @@ -16,15 +16,27 @@ class MPSDataType(IntEnum): mps_data_type_invalid = 0 mps_data_type_float16 = 1 mps_data_type_float32 = 2 - mps_data_type_bfloat16 = 3 - mps_data_type_int8 = 4 - mps_data_type_int16 = 5 - mps_data_type_int32 = 6 - mps_data_type_int64 = 7 - mps_data_type_uint8 = 8 - mps_data_type_bool = 9 - mps_data_type_complex_float16 = 10 - mps_data_type_complex_float32 = 11 + mps_data_type_float64 = 3 + mps_data_type_bfloat16 = 4 + + # Signed integers. 
+ mps_data_type_int4 = 5 + mps_data_type_int8 = 6 + mps_data_type_int16 = 7 + mps_data_type_int32 = 8 + mps_data_type_int64 = 9 + + # Unsigned integers. range: [0, UTYPE_MAX] + mps_data_type_uint4 = 10 + mps_data_type_uint8 = 11 + mps_data_type_uint16 = 12 + mps_data_type_uint32 = 13 + mps_data_type_uint64 = 14 + + mps_data_type_bool = 15 + + mps_data_type_complex_float16 = 16 + mps_data_type_complex_float32 = 17 class OpType(IntEnum): @@ -58,6 +70,12 @@ class MPSNode3x1: output_id: int +@dataclass +class MPSDequantizeNode(MPSNode1x1): + scales_id: int + zero_points_id: int + + @dataclass class MPSConv(MPSNode3x1): stride_x: int = 0 @@ -640,6 +658,18 @@ class MPSArange: dtype: MPSDataType +## +## Quant - Dequant ops +## +@dataclass +class MPSDequantizePerChannelGroup(MPSDequantizeNode): + quant_min: int + quant_max: int + dtype: MPSDataType + group_size: int + output_dtype: MPSDataType + + MPSNodeUnion = Union[ # Activation ops MPSHardTanh, @@ -743,6 +773,8 @@ class MPSArange: MPSConstantPadND, # Range ops MPSArange, + # Quant-Dequant ops + MPSDequantizePerChannelGroup, ] @@ -763,7 +795,14 @@ class MPSTensor: num_dims: int dims: List[int] constant_buffer_size: int - constant_buffer: Buffer + constant_buffer: Buffer # deprecated + segment_offset: int = 0 + + +@dataclass +class DataSegment: + offset: int + size: int @dataclass @@ -775,3 +814,4 @@ class MPSGraph: output_ids: List[int] constant_ids: List[int] graph_type: OpType + constant_segment: DataSegment diff --git a/backends/apple/mps/serialization/schema.fbs b/backends/apple/mps/serialization/schema.fbs index 6e089d4526..d38a67a656 100644 --- a/backends/apple/mps/serialization/schema.fbs +++ b/backends/apple/mps/serialization/schema.fbs @@ -13,15 +13,28 @@ enum MPSDataType : short { mps_data_type_invalid = 0, mps_data_type_float16 = 1, mps_data_type_float32 = 2, - mps_data_type_bfloat16 = 3, - mps_data_type_int8 = 4, - mps_data_type_int16 = 5, - mps_data_type_int32 = 6, - mps_data_type_int64 = 7, - mps_data_type_uint8 = 8, - mps_data_type_bool = 9, - mps_data_type_complex_float16 = 10, - mps_data_type_complex_float32 = 11, + mps_data_type_float64 = 3, + mps_data_type_bfloat16 = 4, + + // Signed integers. + mps_data_type_int4 = 5, + mps_data_type_int8 = 6, + mps_data_type_int16 = 7, + mps_data_type_int32 = 8, + mps_data_type_int64 = 9, + + + // Unsigned integers. range: [0, UTYPE_MAX] + mps_data_type_uint4 = 10, + mps_data_type_uint8 = 11, + mps_data_type_uint16 = 12, + mps_data_type_uint32 = 13, + mps_data_type_uint64 = 14, + + mps_data_type_bool = 15, + + mps_data_type_complex_float16 = 16, + mps_data_type_complex_float32 = 17, } // ops like index.Tensor and index.put are currentely implemented as @@ -322,6 +335,19 @@ table MPSArange { dtype:MPSDataType; } +// Quant - Dequant ops +table MPSDequantizePerChannelGroup { + input1_id:int; + output_id:int; + scales_id:int; + zero_points_id:int; + quant_min:int; + quant_max:int; + dtype:MPSDataType; + group_size:int; + output_dtype:MPSDataType; +} + union MPSNodeUnion { // Activation ops MPSHardTanh, @@ -441,6 +467,9 @@ union MPSNodeUnion { // Range ops MPSArange, + + // Quant-Dequant ops + MPSDequantizePerChannelGroup, } table MPSNode { @@ -450,6 +479,7 @@ table MPSNode { // taken from executorch // Data buffer abstraction. 
+// Deprecated table Buffer { storage:[ubyte] (force_align: 16); } @@ -458,8 +488,21 @@ table MPSTensor { datatype:MPSDataType; num_dims:int; dims:[int]; - constant_buffer_size:int; - constant_buffer:Buffer; + constant_buffer_size:uint64; + constant_buffer:Buffer; // deprecated + segment_offset:uint64; +} + +table DataSegment { + // Segment offsets are relative to the segment base offset provided in + // the extended file header. Segments will typically be aligned in a + // way to make it possible to use mmap() to load them. + offset: uint64; + + // The size in bytes of valid data starting at the offset. The segment + // data may be followed by padding before the segment that follows it, + // to make it easier to use mmap(). + size: uint64; } table MPSGraph { @@ -473,6 +516,8 @@ table MPSGraph { constant_ids:[int]; graph_type:OpType; + + constant_segment:DataSegment; } root_type MPSGraph; diff --git a/backends/apple/mps/targets.bzl b/backends/apple/mps/targets.bzl index 8b9c64e143..74d7944836 100644 --- a/backends/apple/mps/targets.bzl +++ b/backends/apple/mps/targets.bzl @@ -47,7 +47,7 @@ def define_common_targets(is_xplat = False, platforms = []): "//executorch/exir/backend:backend_lib", "//executorch/extension/pybindings/...", "//executorch/runtime/backend/...", - "//executorch/sdk/runners/...", + "//executorch/devtools/runners/...", "//executorch/test/...", "@EXECUTORCH_CLIENTS", ], diff --git a/backends/apple/mps/test/test_mps_linear.py b/backends/apple/mps/test/test_mps_linear.py new file mode 100644 index 0000000000..e2abbd3b7b --- /dev/null +++ b/backends/apple/mps/test/test_mps_linear.py @@ -0,0 +1,523 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. +# + +import inspect + +import unittest + +from typing import Tuple + +import torch +from executorch.backends.apple.mps.test.test_mps_utils import TestMPS + + +class TestLinear(TestMPS): + @unittest.skip("Dynamic shapes not supported in MPS backend") + def test_fp16_linear(self): + for use_bias in (True, False): + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: torch.nn.Linear( + in_size, out_size, bias=use_bias # noqa + ), + num_batch_dims=num_batch_dims, + uses_bias=use_bias, + dtype=torch.float16, + atol=5e-2, + ) + + @unittest.skip("Dynamic shapes not supported in MPS backend") + def test_fp32_linear(self): + for use_bias in (True, False): + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: torch.nn.Linear( + in_size, out_size, bias=use_bias # noqa + ), + uses_bias=use_bias, + num_batch_dims=num_batch_dims, + ) + + @unittest.skip("Dynamic shapes not supported in MPS backend") + def test_qc8_linear(self): + for use_bias in (True, False): + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: torch.nn.Linear( + in_size, out_size, bias=use_bias # noqa + ), + uses_bias=use_bias, + quant_type="per_channel", + num_batch_dims=num_batch_dims, + ) + + @unittest.skip("Dynamic shapes not supported in MPS backend") + def test_fp32_addmm(self): + """ + Note that the ConvertToLinear pass requires the weight matrix to be transposed. 
+ """ + + class AddMMModule(torch.nn.Module): + def __init__(self, in_size, out_size): + super().__init__() + self.mat = torch.nn.Parameter(torch.randn(in_size, out_size)) + self.bias = torch.nn.Parameter(torch.randn(1, out_size)) + + def forward(self, x): + return torch.addmm(self.bias, x, self.mat) + + self._test_linear( + lambda in_size, out_size: AddMMModule(in_size, out_size), + uses_bias=True, + ) + + @unittest.skip("Dynamic shapes not supported in MPS backend") + def test_fp32_linear_fused_relu(self): + class LinearReluModule(torch.nn.Module): + def __init__(self, in_size, out_size, use_bias): + super().__init__() + self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) + + def forward(self, x): + return torch.nn.functional.relu(self.linear(x)) + + for use_bias in (True, False): + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: LinearReluModule( + in_size, + out_size, + use_bias, # noqa + ), + uses_bias=use_bias, + num_batch_dims=num_batch_dims, + ) + + @unittest.skip("Dynamic shapes not supported in MPS backend") + def test_qs8_linear_fused_relu(self): + class LinearReluModule(torch.nn.Module): + def __init__(self, in_size, out_size, use_bias): + super().__init__() + self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) + + def forward(self, x): + return torch.nn.functional.relu(self.linear(x)) + + for use_bias in (True, False): + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: LinearReluModule( + in_size, + out_size, + use_bias, # noqa + ), + num_batch_dims=num_batch_dims, + uses_bias=use_bias, + quant_type="per_tensor", + ) + + @unittest.skip("Dynamic shapes not supported in MPS backend") + def test_qs8_linear(self): + for use_bias in (True, False): + for num_batch_dims in range(1, 3): + self._test_linear( + lambda in_size, out_size: torch.nn.Linear( + in_size, out_size, bias=use_bias # noqa + ), + uses_bias=use_bias, + num_batch_dims=num_batch_dims, + quant_type="per_tensor", + ) + + @unittest.skip( + "quantized_decomposed_dequantize_per_channel_default is not supported bt MPS delegate" + ) + def test_qd8_fp32_per_token_weight_per_channel_int8(self): + self._run_manual_dqlinear_tests(8, torch.float) + + @unittest.skip( + "quantized_decomposed_dequantize_per_channel_default is not supported bt MPS delegate" + ) + def test_qd8_fp32_per_token_weight_per_channel_int4(self): + self._run_manual_dqlinear_tests(4, torch.float) + + def test_qd8_fp32_per_token_weight_per_channel_group_int4(self): + M_sizes = [1] + K_sizes = [64] + bl_sizes = [64] + N_sizes = [32] + + for use_bias in [True, False]: + for i, _ in enumerate(M_sizes): + M = int(M_sizes[i]) + K = int(K_sizes[i]) + N = int(N_sizes[i]) + bl = int(bl_sizes[i]) + mod = self.ManualDQLinear( + input_channels=K, + output_channels=N, + weight_n_bit=4, + dtype=torch.float, + group_size=bl, + force_groupwise_quant=True, + use_bias=use_bias, + ) + + inputs = (torch.randn(1, M, K),) + self._test_manual_dq_linear( + mod, + inputs, + weight_groupwise=True, + use_bias=use_bias, + ) + + @unittest.skip("Need to fix the dq_per_channel_group output dtype") + def _test_qd8_fp16_per_token_weight_per_channel_group_int4(self): + M_sizes = [1, 2, 17, 31] + K_sizes = [8, 32, 64, 128] + bl_sizes = [8, 16, 16, 32] + N_sizes = [2, 17, 92, 128] + + for use_bias in [True, False]: + for i, _ in enumerate(M_sizes): + M = int(M_sizes[i]) + K = int(K_sizes[i]) + N = int(N_sizes[i]) + bl = int(bl_sizes[i]) + mod = self.ManualDQLinear( + input_channels=K, + 
output_channels=N, + weight_n_bit=4, + dtype=torch.float16, + group_size=bl, + force_groupwise_quant=True, + use_bias=use_bias, + ) + + inputs = (torch.randn(1, M, K, dtype=torch.float16),) + self._test_manual_dq_linear( + mod, + inputs, + weight_groupwise=True, + use_bias=use_bias, + atol=0.1, + rtol=0.1, + ) + + def _test_linear( + self, + make_module, + uses_bias, + num_batch_dims=1, + quant_type=None, + dtype: torch.dtype = torch.float, + atol=1e-03, + ): + in_sizes = [3, 4, 4] + input_sizes = [4, 37, 17] + output_sizes = [4, 17, 37] + + for i, _ in enumerate(in_sizes): + in_size = int(in_sizes[i]) + input_size = int(input_sizes[i]) + output_size = int(output_sizes[i]) + input_shape = [in_size] * num_batch_dims + [input_size] + print(f"Testing input_shape {input_shape} with {output_size} out_channels") + + module = make_module(input_size, output_size).eval().to(dtype) + inputs = (torch.randn(input_shape).to(dtype),) + dynamic_shape = {} + for i in range(num_batch_dims): + dynamic_shape[i] = torch.export.Dim(f"batch{i}", min=2, max=in_size) + + dynamic_shape = (dynamic_shape,) + print(dynamic_shape) + self.lower_and_test_without_partitioner( + module, + inputs, + func_name=inspect.stack()[0].function[5:], + dynamic_shapes=dynamic_shape, + atol=atol, + rtol=1e-03, + ) + + class ManualDQLinear(torch.nn.Module): + def __init__( + self, + input_channels: int = 4, + output_channels: int = 4, + dtype: torch.dtype = torch.float, + weight_n_bit: int = 4, + group_size: int = 0, + force_groupwise_quant: bool = False, + use_bias: bool = False, + ): + super().__init__() + + self.ic = input_channels + self.oc = output_channels + + assert dtype in [torch.float, torch.half], "Unsupported op dtype" + self.op_dtype = dtype + + self.group_size = self.ic if group_size == 0 else group_size + self.num_groups = 1 + if self.group_size != self.ic: + assert self.ic % self.group_size == 0 + assert self.group_size % 8 == 0 # TODO make this 16 + self.num_groups = self.ic // self.group_size + + assert weight_n_bit in [4, 8], "Unsupported weight_n_bit" + self.w_n_bit = weight_n_bit + self.w_quant_min, self.w_quant_max = self.get_min_max(self.w_n_bit) + + self.w = torch.nn.Parameter( + torch.randn(self.oc, self.ic), requires_grad=False + ) + self.w_q = torch.nn.Parameter( + torch.zeros(self.oc, self.ic), requires_grad=False + ) + # Quantize the weights as per folded setup + if self.group_size != self.ic or force_groupwise_quant: + self.w_scales = torch.nn.Parameter( + torch.zeros(self.oc, self.num_groups), requires_grad=False + ) + self.w_zero_points = torch.nn.Parameter( + torch.zeros(self.oc, self.num_groups), requires_grad=False + ) + self.quant_weight_per_channel_group() + else: # per_channel quantization + self.w_scales = torch.nn.Parameter( + torch.zeros(self.oc), requires_grad=False + ) + self.w_zero_points = torch.nn.Parameter( + torch.zeros(self.oc), requires_grad=False + ) + self.quant_weight_per_channel() + + self.bias = ( + torch.nn.Parameter( + torch.randn(self.oc).to(self.op_dtype), requires_grad=False + ) + if use_bias + else None + ) + + def get_min_max(self, n_bit: int = 4): + max_int = 2 ** (n_bit - 1) - 1 + min_int = -(2 ** (n_bit - 1)) + return min_int, max_int + + def get_channel_qparams_symmetric( + self, + w: torch.Tensor, + n_bit: int = 4, + precision: torch.dtype = torch.float32, + ): + assert w.dim() == 2 + + to_quant = w.to(precision) + assert torch.isnan(to_quant).sum() == 0 + + max_val = to_quant.amax(dim=1, keepdim=True) + min_val = to_quant.amin(dim=1, keepdim=True) + min_val_neg = 
torch.min(min_val, torch.zeros_like(min_val)) + max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) + + min_int, max_int = self.get_min_max(n_bit) + + max_val_abs = torch.max(-min_val_neg, max_val_pos) + scales = max_val_abs / (float(max_int - min_int) / 2) + scales = torch.max( + scales, torch.full_like(scales, torch.finfo(torch.float32).eps) + ) + zeros = torch.full_like(scales, 0) + return scales.to(precision).reshape(w.shape[0]), zeros.to( + precision + ).reshape(w.shape[0]).reshape(w.shape[0]) + + # Note: not using from torchao.quantization.quant_primitives because it will run into op registraion issues + def get_group_qparams_symmetric( + self, w, n_bit=4, groupsize=128, precision=torch.float32 + ): + # needed for GPTQ with padding + if groupsize > w.shape[-1]: + groupsize = w.shape[-1] + assert groupsize > 1 + assert w.shape[-1] % groupsize == 0 + assert w.dim() == 2 + + to_quant = w.reshape(-1, groupsize) + assert torch.isnan(to_quant).sum() == 0 + + max_val = to_quant.amax(dim=1, keepdim=True) + min_val = to_quant.amin(dim=1, keepdim=True) + min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) + max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) + + max_val_abs = torch.max(-min_val_neg, max_val_pos) + max_int = 2 ** (n_bit - 1) - 1 + min_int = -(2 ** (n_bit - 1)) + + scales = max_val_abs / (float(max_int - min_int) / 2) + scales = torch.max( + scales, torch.full_like(scales, torch.finfo(torch.float32).eps) + ) + # TODO: make sure abs(scales) is not too small? + zeros = torch.full_like(scales, 0) + return scales.to(precision).reshape(w.shape[0], -1), zeros.to( + precision + ).reshape(w.shape[0], -1) + + # Note: not using from torchao.quantization.quant_primitives because it will run into op registraion issues + def group_quantize_tensor_symmetric( + self, w, n_bit=4, group_size=128, precision=torch.float32 + ): + scales, zeros = self.get_group_qparams_symmetric( + w, n_bit, group_size, precision + ) + n_bit = 4 + max_int = 2 ** (n_bit - 1) - 1 + min_int = -(2 ** (n_bit - 1)) + # TODO: currently we don't know how to express torch.int4, we'll + # add torch.int4 to core later + w_int8 = torch.ops.quantized_decomposed.quantize_per_channel_group( + w, scales, zeros, min_int, max_int, torch.int8, group_size + ) + + return w_int8, scales, zeros + + def fwd_input_per_token(self, input: torch.Tensor) -> torch.Tensor: + ip_quant_min = -128 + ip_quant_max = 127 + ( + ip_scales, + ip_zero_points, + ) = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric( + input, torch.int8 + ) + + input = torch.ops.quantized_decomposed.quantize_per_token( + input, + ip_scales, + ip_zero_points, + ip_quant_min, + ip_quant_max, + torch.int8, + ) + input = torch.ops.quantized_decomposed.dequantize_per_token( + input, + ip_scales, + ip_zero_points, + ip_quant_min, + ip_quant_max, + torch.int8, + self.op_dtype, + ) + return input + + def quant_weight_per_channel(self): + ( + self.w_scales.data, + self.w_zero_points.data, + ) = self.get_channel_qparams_symmetric( + self.w, n_bit=self.w_n_bit, precision=self.op_dtype + ) + self.w_q.data = torch.ops.quantized_decomposed.quantize_per_channel( + self.w, + self.w_scales, + self.w_zero_points, + axis=0, + quant_min=self.w_quant_min, + quant_max=self.w_quant_max, + dtype=torch.int8, + ) + + def quant_weight_per_channel_group(self): + self.w_q.data, w, zp = self.group_quantize_tensor_symmetric( + self.w, + n_bit=self.w_n_bit, + group_size=self.group_size, + ) + expected_min, expected_max = self.get_min_max(self.w_n_bit) + assert ( + 
torch.min(self.w_q.data) >= expected_min + ), "Found smaller than min element in quantized weight tensor" + assert ( + torch.max(self.w_q.data) <= expected_max + ), "Found larger than max element in quantized weight tensor" + assert ( + w.ndim == 2 and zp.ndim == 2 + ), f"Expecting 2d scales and zp tensors, but got {w.shape}, {zp.shape}" + self.w_scales.data, self.w_zero_points.data = w, zp + + def fwd_weight_per_channel(self) -> torch.Tensor: + # This is HACKY because the dequant will produce fp32 + return torch.ops.quantized_decomposed.dequantize_per_channel( + self.w_q, + self.w_scales, + self.w_zero_points, + axis=0, + quant_min=self.w_quant_min, + quant_max=self.w_quant_max, + dtype=torch.int8, # Regardless of w_n_bit, convert to 4b later + ) + + def fwd_weight_per_channel_group(self) -> torch.Tensor: + return torch.ops.quantized_decomposed.dequantize_per_channel_group( + self.w_q, + self.w_scales, + self.w_zero_points, + self.w_quant_min, + self.w_quant_max, + dtype=torch.int8, # Regardless of w_n_bit, convert to 4b later + group_size=self.group_size, + output_dtype=self.op_dtype, + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + # Input + input = self.fwd_input_per_token(input) + + # Weights + w = ( + self.fwd_weight_per_channel_group() + if self.w_scales.ndim == 2 + else self.fwd_weight_per_channel() + ) + assert isinstance(w, torch.Tensor) + return torch.nn.functional.linear(input, w, self.bias) + + def _test_manual_dq_linear( + self, + mod: torch.nn.Module, + inputs: Tuple[torch.Tensor], + weight_groupwise: bool = False, + use_bias: bool = False, + ): + self.lower_and_test_without_partitioner( + mod, inputs, func_name=inspect.stack()[0].function[5:] + ) + + def _run_manual_dqlinear_tests(self, weight_n_bit: int, op_dtype: torch.dtype): + in_sizes = [1, 4, 4] + input_sizes = [4, 37, 17] + output_sizes = [4, 17, 37] + + for use_bias in [True, False]: + for i, _ in enumerate(in_sizes): + in_size = int(in_sizes[i]) + input_size = int(input_sizes[i]) + output_size = int(output_sizes[i]) + mod = self.ManualDQLinear( + input_channels=input_size, + output_channels=output_size, + weight_n_bit=weight_n_bit, + dtype=op_dtype, + use_bias=use_bias, + ) + + inputs = (torch.randn(1, in_size, input_size).to(op_dtype),) + self._test_manual_dq_linear(mod, inputs, use_bias=use_bias) diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py index f3ae771c3d..6f7d00d7b0 100644 --- a/backends/apple/mps/test/test_mps_utils.py +++ b/backends/apple/mps/test/test_mps_utils.py @@ -6,28 +6,23 @@ import logging import unittest -from typing import Any, Tuple, Union +from typing import Any, Tuple import executorch.exir as exir import torch from executorch.backends.apple.mps import MPSBackend from executorch.backends.apple.mps.partition import MPSPartitioner -from executorch.exir import ( - EdgeCompileConfig, - EdgeProgramManager, - ExirExportedProgram, - to_edge, +from executorch.devtools import BundledProgram +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, ) +from executorch.exir import EdgeCompileConfig, ExirExportedProgram, to_edge from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.capture._config import ExecutorchBackendConfig -from executorch.exir.tracer import Value -from executorch.sdk import BundledProgram 
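The decomposed groupwise ops used by ManualDQLinear can also be exercised directly in eager mode. A small round-trip sketch follows; it assumes the quantized_decomposed ops are registered (the test module gets this via its ExecuTorch/torch.ao imports) and the shapes are made up.

import torch

oc, ic, group_size, n_bit = 8, 64, 16, 4
qmin, qmax = -(2 ** (n_bit - 1)), 2 ** (n_bit - 1) - 1
w = torch.randn(oc, ic)
# Symmetric per-(channel, group) scales, zero points fixed at 0.
scales = w.reshape(oc, -1, group_size).abs().amax(dim=-1) / qmax
zeros = torch.zeros_like(scales)

w_q = torch.ops.quantized_decomposed.quantize_per_channel_group(
    w, scales, zeros, qmin, qmax, torch.int8, group_size
)
w_dq = torch.ops.quantized_decomposed.dequantize_per_channel_group(
    w_q, scales, zeros, qmin, qmax, torch.int8, group_size, torch.float32
)
print((w - w_dq).abs().max())  # rounding error, roughly bounded by scale / 2 per group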
-from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) -from torch.export import export, ExportedProgram +from executorch.extension.export_util.utils import export_to_edge +from torch.export import export # Config for Capturing the weights, will be moved in the future @@ -37,47 +32,6 @@ ) -def _to_core_aten( - model: Union[torch.fx.GraphModule, torch.nn.Module], - example_inputs: Tuple[Value, ...], -) -> ExportedProgram: - # post autograd export. eventually this will become .to_core_aten - if not isinstance(model, torch.fx.GraphModule): - raise ValueError( - f"Expected passed in model to be an instance of fx.GraphModule, got {type(model)}" - ) - core_aten_ep = export(model, example_inputs) - logging.info(f"Core ATen graph:\n{core_aten_ep.graph}") - return core_aten_ep - - -def _core_aten_to_edge( - core_aten_exir_ep: ExportedProgram, - edge_compile_config=None, -) -> EdgeProgramManager: - if not edge_compile_config: - edge_compile_config = exir.EdgeCompileConfig( - _check_ir_validity=False, # quant ops currently break ir verification - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. - ) - edge_manager: EdgeProgramManager = to_edge( - core_aten_exir_ep, compile_config=edge_compile_config - ) - - edge_manager.exported_program().graph.print_tabular() - logging.info(f"Exported graph:\n{edge_manager.exported_program().graph}") - return edge_manager - - -def export_to_edge( - model: Union[torch.fx.GraphModule, torch.nn.Module], - example_inputs: Tuple[Value, ...], - edge_compile_config=_EDGE_COMPILE_CONFIG, -) -> EdgeProgramManager: - core_aten_ep = _to_core_aten(model, example_inputs) - return _core_aten_to_edge(core_aten_ep, edge_compile_config) - - class ansi_colors: HEADER = "\033[95m" OKBLUE = "\033[94m" @@ -168,33 +122,65 @@ def dump_bundled_program(sample_inputs, expected_output, executorch_program, fun class TestMPS(unittest.TestCase): - def assert_outputs_equal(self, model_output, ref_output): + def assert_outputs_equal( + self, + model_output, + ref_output, + use_fp16: bool = False, + atol: float = 1e-03, + rtol: float = 1e-03, + ): """ Helper testing function that asserts that the model output and the reference output are equal with some tolerance. Due to numerical differences between eager mode and the MPS's backend, we relax the detal such that absolute tolerance is 1e-3. and relative tolerance is 1e-3. """ - - # Compare the result from executor and eager mode direclty + # Compare the result from executor and eager mode directly if isinstance(ref_output, tuple) or isinstance(ref_output, list): # Multiple outputs executor always returns tuple, even if there is one output - self.assertTrue( - len(ref_output) == len(model_output), - msg="Length of outputs is not matching!", - ) + assert len(ref_output) == len( + model_output + ), "Length of outputs is not matching!" 
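The relaxed comparison introduced here (elementwise allclose first, then a mean-relative-error fallback) can be summarized as a standalone helper. This is just a sketch of the check the test utility performs, not code from the patch.

import torch

def outputs_close(result: torch.Tensor, expected: torch.Tensor,
                  atol: float = 1e-3, rtol: float = 1e-3,
                  max_mean_rel_err: float = 0.05) -> bool:
    result, expected = result.cpu().float(), expected.cpu().float()
    if torch.allclose(result, expected, atol=atol, rtol=rtol):
        return True
    # Fall back to the mean relative error when a few elements exceed the
    # elementwise tolerance (e.g. fp16 delegate output vs. fp32 eager output).
    mean_err = ((result - expected).abs() / expected).mean()
    return mean_err.item() < max_mean_rel_err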
for i in range(len(ref_output)): - self.assertTrue( - torch.allclose( - model_output[i], ref_output[i], atol=1e-03, rtol=1e-03 - ) - ) + res_output = model_output[i].cpu() + expected_output = ref_output[i].cpu() + if use_fp16 and ( + expected_output.dtype == torch.float16 + or res_output.dtype == torch.float16 + ): + # cast back from fp16 to fp32 (ExecuTorch results are in FP32 by default) + expected_output = expected_output.to(torch.float32) + res_output = res_output.to(torch.float32) + if ( + torch.allclose(res_output, expected_output, atol=atol, rtol=rtol) + is False + ): + mean_err = ( + (res_output - expected_output).abs() / expected_output + ).mean() + logging.debug(f"mean err = {mean_err}") + self.assertLess(mean_err, 0.05) else: # If one output, eager returns tensor while executor tuple of size 1 - self.assertTrue( - torch.allclose(model_output[0], ref_output, atol=1e-03, rtol=1e-03), - msg="Outputs are not matching!", - ) + expected_output = ref_output.cpu() + res_output = model_output[0].cpu() + if use_fp16 and ( + expected_output.dtype == torch.float16 + or res_output.dtype == torch.float16 + ): + # cast back from fp16 to fp32 (ExecuTorch results are in FP32 by default) + expected_output = expected_output.to(torch.float32) + res_output = res_output.to(torch.float32) + if ( + torch.allclose(res_output, expected_output, atol=atol, rtol=rtol) + is False + ): + mean_err = ( + (res_output - expected_output).abs() / expected_output + ).mean() + logging.debug(f"mean err = {mean_err}") + self.assertLess(mean_err, 0.05) def lower_module_and_test_output( self, @@ -204,6 +190,9 @@ def lower_module_and_test_output( use_partitioner: bool = True, use_fp16: bool = False, bundled_program=True, + dynamic_shapes=None, + atol: float = 1e-03, + rtol: float = 1e-03, ) -> ExirExportedProgram: """ Helper testing function that takes a torch.nn.Module and lowers it to MPS with @@ -220,11 +209,14 @@ def lower_module_and_test_output( expected_output = model(*sample_inputs) - model = torch._export.capture_pre_autograd_graph(model, sample_inputs) + model = torch._export.capture_pre_autograd_graph( + model, sample_inputs, dynamic_shapes=dynamic_shapes + ) edge_program = export_to_edge( model, sample_inputs, + dynamic_shapes=dynamic_shapes, edge_compile_config=EdgeCompileConfig( _check_ir_validity=False, _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. @@ -237,7 +229,7 @@ def lower_module_and_test_output( compile_specs = [CompileSpec("use_fp16", bytes([use_fp16]))] if use_partitioner: - logging.info(f"Edge IR graph:\n{edge_program.exported_program().graph}") + logging.info(f"Edge IR graph:\n{edge_program.exported_program()}") delegated_program = edge_program delegated_program = edge_program.to_backend( MPSPartitioner(compile_specs=compile_specs) @@ -247,9 +239,7 @@ def lower_module_and_test_output( ) executorch_program = delegated_program.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) else: delegated_program = to_backend( @@ -266,9 +256,7 @@ def lower_module_and_test_output( _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. 
), ).to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) if bundled_program: @@ -292,7 +280,7 @@ def lower_module_and_test_output( logging.info(f"Expected output: {expected_output}") logging.info(f"MPS delegate output: {model_output}") - self.assert_outputs_equal(model_output, expected_output) + self.assert_outputs_equal(model_output, expected_output, atol, rtol) logging.info("Delegated program matches PyTorch Eager mode result!") return delegated_program @@ -307,6 +295,9 @@ def lower_and_test_with_partitioner( example_inputs, func_name: str, use_fp16: bool = False, + dynamic_shapes=None, + atol: float = 1e-03, + rtol: float = 1e-03, ): logging.info(func_name) self.lower_module_and_test_output( @@ -315,4 +306,29 @@ def lower_and_test_with_partitioner( use_partitioner=True, func_name=func_name, use_fp16=use_fp16, + dynamic_shapes=None, + atol=atol, + rtol=rtol, + ) + + def lower_and_test_without_partitioner( + self, + graph_module, + example_inputs, + func_name: str, + use_fp16: bool = False, + dynamic_shapes=None, + atol: float = 1e-03, + rtol: float = 1e-03, + ): + logging.info(func_name) + self.lower_module_and_test_output( + graph_module, + example_inputs, + use_partitioner=False, + func_name=func_name, + use_fp16=use_fp16, + dynamic_shapes=dynamic_shapes, + atol=atol, + rtol=rtol, ) diff --git a/backends/apple/mps/utils/mps_utils.py b/backends/apple/mps/utils/mps_utils.py index 5c26faa046..c31ebba0e4 100644 --- a/backends/apple/mps/utils/mps_utils.py +++ b/backends/apple/mps/utils/mps_utils.py @@ -24,6 +24,7 @@ def edge_dtype_to_mps_dtype(dtype: torch.dtype): edge_dtype_to_mps_dtype.map = { torch.float16: MPSDataType.mps_data_type_float16, torch.float32: MPSDataType.mps_data_type_float32, + torch.float64: MPSDataType.mps_data_type_float32, torch.bfloat16: MPSDataType.mps_data_type_bfloat16, torch.int8: MPSDataType.mps_data_type_int8, torch.int16: MPSDataType.mps_data_type_int16, @@ -72,7 +73,7 @@ def is_parameter(exp_prog: torch.export.ExportedProgram, node: torch.fx.Node) -> are supplied as inputs to the graph. Args: - edge_program (torch.export.ExportedProgram): _description_ + exp_prog (torch.export.ExportedProgram): _description_ node (torch.fx.Node): _description_ Returns: diff --git a/backends/apple/mps/utils/quant_utils.py b/backends/apple/mps/utils/quant_utils.py new file mode 100644 index 0000000000..bfe19094a6 --- /dev/null +++ b/backends/apple/mps/utils/quant_utils.py @@ -0,0 +1,41 @@ +# +# Copyright (c) 2024 Apple Inc. All rights reserved. +# Provided subject to the LICENSE file in the top level directory. 
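As a usage sketch of the new lower_and_test_without_partitioner helper, a hypothetical test case might look like the following; the module, shapes, and tolerances are illustrative only.

import torch
from executorch.backends.apple.mps.test.test_mps_utils import TestMPS

class TestAddModule(TestMPS):
    def test_fp32_add(self):
        class Add(torch.nn.Module):
            def forward(self, x, y):
                return x + y

        inputs = (torch.randn(2, 3), torch.randn(2, 3))
        # Lowers the whole module to the MPS backend (no partitioner) and
        # compares against eager mode with the given tolerances.
        self.lower_and_test_without_partitioner(
            Add(), inputs, func_name="test_fp32_add", atol=1e-3, rtol=1e-3
        )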
+# + +import torch +from executorch.exir.dialects._ops import ops as exir_ops + +DQ_GROUP_TARGETS = { + exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default, +} + +Q_GROUP_TARGETS = { + exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default, +} + +DQ_TARGETS = { + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, + exir_ops.edge.quantized_decomposed.dequantize_per_token.default, +}.union(DQ_GROUP_TARGETS) + +Q_TARGETS = { + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, + exir_ops.edge.quantized_decomposed.quantize_per_channel.default, + exir_ops.edge.quantized_decomposed.quantize_per_token.default, +}.union(Q_GROUP_TARGETS) + + +def is_quant(tensor: torch.fx.Node) -> bool: + return tensor.target in Q_TARGETS + + +def is_dequant(tensor: torch.fx.Node) -> bool: + return tensor.target in DQ_TARGETS + + +def is_groupwise_q_dq(tensor: torch.fx.Node) -> bool: + return tensor.target in [DQ_GROUP_TARGETS, Q_GROUP_TARGETS] diff --git a/backends/arm/README.md b/backends/arm/README.md index 7167aa853b..6f4642f8d4 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -9,7 +9,7 @@ The expected flow is: * torch.nn.module -> TOSA -> command_stream for fully AoT flows e.g. embedded. * torch.nn.module -> TOSA for flows supporting a JiT compilation step. -Current backend support is being developed for TOSA to Ethos(TM)-U55/65 via the +Current backend support is being developed for TOSA to Ethos(TM)-U55/65/85 via the ethos-u-vela compilation stack. which follows the fully AoT flow. ## Layout @@ -33,7 +33,7 @@ Quantization: - `arm_quantizer_utils.py` - Utilities for quantization Runtime: -- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (PyTorchBackendInterface) for Ethos-U +- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (BackendInterface) for Ethos-U Other: - `third-party/` - Dependencies on other code - in particular the TOSA serialization_lib for compiling to TOSA and the ethos-u-core-driver for the bare-metal backend supporting Ethos-U diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS new file mode 100644 index 0000000000..220db37371 --- /dev/null +++ b/backends/arm/TARGETS @@ -0,0 +1,83 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "arm_partitioner", + srcs = [ + "arm_partitioner.py", + ], + typing = True, + deps = [ + ":arm_backend", + "//executorch/backends/arm/passes:passes", + "//executorch/exir:lib", + ], +) + +python_library( + name = "arm_backend", + srcs = [ + "arm_backend.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/flatbuffers:flatbuffers", + "fbsource//third-party/pypi/ml-dtypes:ml-dtypes", + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":arm_vela", + "//executorch/backends/arm/operators:lib", + "//executorch/backends/arm/operators:node_visitor", + "//executorch/backends/arm/passes:passes", + ], +) + +python_library( + name = "arm_vela", + srcs = [ + "arm_vela.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/ethos-u-vela:ethos-u-vela", + ], +) + +python_library( + name = "tosa_mapping", + srcs = [ + "tosa_mapping.py", + 
], + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "//caffe2:torch", + ], +) + +python_library( + name = "tosa_quant_utils", + srcs = [ + "tosa_quant_utils.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/numpy:numpy", + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":tosa_mapping", + "//executorch/exir/dialects:lib", + ], +) + +python_library( + name = "tosa_utils", + srcs = [ + "tosa_utils.py", + ], + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/serializer:serializer", + ":tosa_quant_utils", + "//executorch/backends/arm/operators:node_visitor", + ], +) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index f187191fee..a5f47c222f 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Main implementation of AoT flow to partition and preprocess for Arm target # backends. Converts via TOSA as an intermediate form supported by AoT and @@ -54,7 +56,7 @@ def ethosu_compile_spec( memory_mode: Optional[str] = None, extra_flags: Optional[str] = None, config_ini: Optional[str] = "Arm/vela.ini", - ): + ) -> "ArmCompileSpecBuilder": """ Generate compile spec for Ethos-U NPU @@ -84,7 +86,7 @@ def ethosu_compile_spec( return self - def tosa_compile_spec(self): + def tosa_compile_spec(self) -> "ArmCompileSpecBuilder": """ Generate compile spec for TOSA flatbuffer output """ @@ -94,14 +96,18 @@ def tosa_compile_spec(self): self.output_format = "tosa" return self - def dump_intermediate_artifacts_to(self, output_path: str): + def dump_intermediate_artifacts_to( + self, output_path: str + ) -> "ArmCompileSpecBuilder": """ Sets a path for dumping intermediate results during such as tosa and pte. """ self.path_for_intermediates = output_path return self - def set_permute_memory_format(self, set_nhwc_permutation: bool = True): + def set_permute_memory_format( + self, set_nhwc_permutation: bool = True + ) -> "ArmCompileSpecBuilder": """ Permute to channel last in compiler and runtime. Compilation and runtime will convert rank 4 inputs to channel last for each sub-graph. @@ -109,7 +115,7 @@ def set_permute_memory_format(self, set_nhwc_permutation: bool = True): self.permute_nhwc = set_nhwc_permutation return self - def set_quantize_io(self, quantize_io: bool = False): + def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder": """ Quantization of inputs and dequantization of outputs for cases where whole graph is quantized and method signature is not of quantized type. 
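A usage sketch of the builder API typed above; the target string, system config, and memory-mode names are examples rather than values mandated by this patch.

from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder

compile_spec = (
    ArmCompileSpecBuilder()
    .ethosu_compile_spec(
        "ethos-u55-128",                              # example target
        system_config="Ethos_U55_High_End_Embedded",  # example Vela config
        memory_mode="Shared_Sram",
    )
    .set_permute_memory_format(True)
    .set_quantize_io(True)
    .dump_intermediate_artifacts_to("/tmp/arm_artifacts")
    .build()
)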
@@ -117,7 +123,7 @@ def set_quantize_io(self, quantize_io: bool = False): self.quantize_io = quantize_io return self - def build(self): + def build(self) -> List[CompileSpec]: """ Generate a list of compile spec objects from the builder """ @@ -159,13 +165,24 @@ def is_tosa(compile_spec: List[CompileSpec]) -> bool: return False -def get_intermediate_path(compile_spec: List[CompileSpec]) -> str: +def get_intermediate_path(compile_spec: List[CompileSpec]) -> Optional[str]: for spec in compile_spec: if spec.key == "debug_artifact_path": return spec.value.decode() return None +def _get_first_delegation_tag(graph_module) -> str | None: + """Get the first delegation tag from the graph_module or return None.""" + for node in graph_module.graph.nodes: + tag = node.meta.get("delegation_tag") + if tag: + return tag + + logger.debug("No delegation tag found in partition.") + return None + + @final class ArmBackend(BackendDetails): @staticmethod @@ -220,8 +237,13 @@ def preprocess( # noqa: C901 # TODO: It would be awesome if this dump could somehow be done on top level and not here. # Problem is that the desc.json has to be created on the tosa_graph object, which we can't # access from top level. - if artifact_path is not None: - dbg_tosa_dump(tosa_graph, artifact_path) + if artifact_path: + tag = _get_first_delegation_tag(graph_module) + dbg_tosa_dump( + tosa_graph, + artifact_path, + suffix="{}".format(f"_{tag}" if tag else ""), + ) # Serialize and return the program. While we have always produced TOSA # output as an intermediate, some flows compile to device binaries in diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index 56dac5d248..6b57c3d965 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
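A hedged end-to-end sketch of how such a compile spec is typically consumed. ArmPartitioner's constructor is not shown in this diff, so treating it as taking the compile spec list is an assumption, and the toy model is illustrative.

import torch
from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder
from executorch.backends.arm.arm_partitioner import ArmPartitioner
from executorch.exir import to_edge

model = torch.nn.Linear(32, 16).eval()
example_inputs = (torch.randn(1, 32),)
compile_spec = ArmCompileSpecBuilder().tosa_compile_spec().build()

edge = to_edge(torch.export.export(model, example_inputs))
# Delegate the supported subgraphs to the Arm backend via the partitioner.
edge = edge.to_backend(ArmPartitioner(compile_spec))
exec_prog = edge.to_executorch()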
+# pyre-unsafe + import logging import operator import os @@ -38,20 +40,31 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: supported = node.op == "call_function" and node.target in [ exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.addmm.default, + exir_ops.edge.aten.expand_copy.default, + exir_ops.edge.aten.cat.default, + exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.permute_copy.default, exir_ops.edge.aten.hardtanh.default, exir_ops.edge.aten.convolution.default, exir_ops.edge.aten.div.Tensor, + exir_ops.edge.aten.exp.default, + exir_ops.edge.aten.log.default, + exir_ops.edge.aten.split_with_sizes_copy.default, exir_ops.edge.aten.full.default, + exir_ops.edge.aten.mul.Tensor, exir_ops.edge.aten._native_batch_norm_legit_no_training.default, exir_ops.edge.aten.avg_pool2d.default, exir_ops.edge.aten.sigmoid.default, + exir_ops.edge.aten.mm.default, + exir_ops.edge.aten.repeat.default, + exir_ops.edge.aten.relu.default, exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.clone.default, exir_ops.edge.aten.mean.dim, + exir_ops.edge.aten.unsqueeze_copy.default, operator.getitem, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index f387672b7b..01bb8bd55e 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -3,14 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import os import struct -import subprocess import tempfile from typing import List import numpy as np +from ethosu.vela import vela # Pack either input or output tensor block, compose the related arrays into @@ -38,21 +40,22 @@ def vela_compile(tosa_graph, args: List[str]): with tempfile.TemporaryDirectory() as tmpdir: tosaname = "out.tosa" flatbuffer = tosa_graph.serialize() - with open(os.path.join(tmpdir, tosaname), "wb") as f: + tosa_path = os.path.join(tmpdir, tosaname) + with open(tosa_path, "wb") as f: f.write(flatbuffer) # invoke vela - vela_command = f"cd {tmpdir}; vela {' '.join(args)} {tosaname}" - try: - subprocess.run([vela_command], shell=True, check=True, capture_output=True) - except subprocess.CalledProcessError as process_error: - raise RuntimeError( - f"Vela compiler ('{vela_command}') failed with error:\n \ - {process_error.stderr.decode()}\n \ - Stdout:\n{process_error.stdout.decode()}" - ) - - np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") + output_dir = os.path.join(tmpdir, "output") + args.append(f"--output-dir={output_dir}") + args.append(tosa_path) + vela.main(" ".join(args).split(" ")) + + if any("ethos-u85" in arg for arg in args) or any( + "debug-force-regor" in arg for arg in args + ): + np_path = os.path.join(tmpdir, "output", "out_vela.npz") + else: + np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") blocks = b"" with np.load(np_path, allow_pickle=False) as data: diff --git a/backends/arm/operators/TARGETS b/backends/arm/operators/TARGETS new file mode 100644 index 0000000000..fd04d5fb84 --- /dev/null +++ b/backends/arm/operators/TARGETS @@ -0,0 +1,34 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "node_visitor", + srcs = ["node_visitor.py"], + typing = True, + deps = [ + "//executorch/backends/arm:tosa_mapping", + ], 
+) + +python_library( + name = "ops", + srcs = glob(["op_*.py"]), + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":node_visitor", + "//executorch/backends/arm:tosa_mapping", + "//executorch/backends/arm:tosa_quant_utils", + "//executorch/backends/arm:tosa_utils", + "//executorch/exir:lib", + ], +) + +python_library( + name = "lib", + srcs = ["__init__.py"], + typing = True, + deps = [ + ":node_visitor", + ":ops", + ], +) diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index e868b584cf..7b94bfa837 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -3,24 +3,35 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from . import ( # noqa node_visitor, op_add, op_addmm, op_avg_pool2d, op_batch_norm, + op_bmm, + op_cat, op_conv2d, op_dequant, op_div, + op_exp, op_full, op_get_item, op_hardtanh, + op_log, op_mean_dim, + op_mm, + op_mul, op_permute, op_quant, + op_relu, + op_repeat, op_sigmoid, op_slice, op_softmax, op_sub, + op_unsqueeze, op_view, ) diff --git a/backends/arm/operators/node_visitor.py b/backends/arm/operators/node_visitor.py index 59edc01e74..99fd0388e4 100644 --- a/backends/arm/operators/node_visitor.py +++ b/backends/arm/operators/node_visitor.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Dict, List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py index 33c0c49744..ec2ade9e8a 100644 --- a/backends/arm/operators/op_add.py +++ b/backends/arm/operators/op_add.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import executorch.backends.arm.tosa_quant_utils as tqutils diff --git a/backends/arm/operators/op_addmm.py b/backends/arm/operators/op_addmm.py index 444799d353..b4f782db4a 100644 --- a/backends/arm/operators/op_addmm.py +++ b/backends/arm/operators/op_addmm.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
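Each of the new op files registered in __init__.py above follows the same NodeVisitor pattern. A skeletal visitor for a hypothetical unary op is sketched below; the target and TOSA opcode are placeholders, not part of this patch.

from typing import List

import serializer.tosa_serializer as ts
import torch
from executorch.backends.arm.operators.node_visitor import (
    NodeVisitor,
    register_node_visitor,
)
from executorch.backends.arm.tosa_mapping import TosaArg
from serializer.tosa_serializer import TosaOp


@register_node_visitor
class AbsVisitor(NodeVisitor):  # hypothetical example op
    target = "aten.abs.default"

    def __init__(self, *args):
        super().__init__(*args)

    def define_node(
        self,
        node: torch.fx.Node,
        tosa_graph: ts.TosaSerializer,
        inputs: List[TosaArg],
        output: TosaArg,
        is_quant_node: bool,
    ) -> None:
        # A 1:1 mapping onto the corresponding TOSA operator.
        tosa_graph.addOperator(TosaOp.Op().ABS, [inputs[0].name], [output.name])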
+# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts @@ -12,10 +14,7 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_quant_utils import ( - compute_multiplier_and_shift, - get_quant_node_args, -) +from executorch.backends.arm.tosa_quant_utils import build_rescale, get_quant_node_args from executorch.backends.arm.tosa_utils import build_reshape from executorch.exir.dialects._ops import ops as exir_ops @@ -128,32 +127,20 @@ def define_node( weight_scale = get_quant_node_args(weight_node_q_node).scale output_rescale_scale = (input_scale * weight_scale) / consumer_node_scale - ( - multiplier_output, - shift_output, - ) = compute_multiplier_and_shift(output_rescale_scale) - - attr_rescale_output = ts.TosaSerializerAttribute() - attr_rescale_output.RescaleAttribute( - input_zp=0, - output_zp=consumer_node_node_zp, - multiplier=[multiplier_output], - shift=[shift_output], - scale32=True, - double_round=True, - per_channel=False, - input_unsigned=False, - output_unsigned=False, - ) reshaped_res = tosa_graph.addIntermediate(result_shape, ts.DType.INT32) build_reshape(tosa_graph, conv2d_res.name, result_shape, reshaped_res.name) - tosa_graph.addOperator( - TosaOp.Op().RESCALE, - [reshaped_res.name], - [output.name], - attr_rescale_output, + build_rescale( + tosa_fb=tosa_graph, + scale=output_rescale_scale, + input_node=reshaped_res, + output_name=output.name, + output_type=ts.DType.INT8, + output_shape=reshaped_res.shape, + input_zp=0, + output_zp=consumer_node_node_zp, + is_double_round=False, ) else: diff --git a/backends/arm/operators/op_avg_pool2d.py b/backends/arm/operators/op_avg_pool2d.py index e6d07610c8..4caaad9202 100644 --- a/backends/arm/operators/op_avg_pool2d.py +++ b/backends/arm/operators/op_avg_pool2d.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_batch_norm.py b/backends/arm/operators/op_batch_norm.py index c41941722b..d17c3a1b81 100644 --- a/backends/arm/operators/op_batch_norm.py +++ b/backends/arm/operators/op_batch_norm.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py new file mode 100644 index 0000000000..161b5d2239 --- /dev/null +++ b/backends/arm/operators/op_bmm.py @@ -0,0 +1,85 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
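The requantization factor used here (and in the bmm/mm visitors below) is the standard int8 GEMM identity; a tiny numeric sketch with made-up scales:

# int8 matmul/addmm accumulates (x_q - x_zp) * (w_q - w_zp) into int32; the
# accumulator is brought back to int8 by scaling with (s_x * s_w) / s_out,
# which build_rescale encodes as an integer multiplier plus shift.
s_x, s_w, s_out = 0.02, 0.005, 0.1  # example quantization scales
output_rescale_scale = (s_x * s_w) / s_out
print(output_rescale_scale)  # 0.001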
+ +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_quant_utils import build_rescale, get_quant_node_args +from executorch.backends.arm.tosa_utils import get_two_inputs +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class BMMVisitor(NodeVisitor): + target = "aten.bmm.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + input0, input1 = get_two_inputs(node) + + # aten.bmm maps directly to MATMUL + # NOTE: For now, only INT8 & FP32 is supported + + # For INT8, we need to get the zero points and add an intermediate tensor + # for a later rescale. + if is_quant_node: + input0_zp = get_quant_node_args(input0).zp + input1_zp = get_quant_node_args(input1).zp + bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) + bmm_output_name = bmm_result.name + else: + input0_zp, input1_zp = 0, 0 + bmm_output_name = output.name + + # Add the MATMUL to the TOSA graph. + attr = ts.TosaSerializerAttribute() + attr.MatMulAttribute(A_zp=input0_zp, B_zp=input1_zp) + + tosa_graph.addOperator( + TosaOp.Op().MATMUL, + [input0.name, input1.name], + [bmm_output_name], + attr, + ) + + # As INT8 accumulates into INT32, we need to rescale it back to INT8 + if is_quant_node: + input0_q_params = get_quant_node_args(input0) + input1_q_params = get_quant_node_args(input1) + output_q_params = get_quant_node_args(list(node.users)[0]) + + final_output_scale = ( + input0_q_params.scale * input1_q_params.scale + ) / output_q_params.scale + + build_rescale( + tosa_fb=tosa_graph, + scale=final_output_scale, + # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined. + input_node=bmm_result, + output_name=output.name, + output_type=ts.DType.INT8, + output_shape=bmm_result.shape, + input_zp=0, + output_zp=output_q_params.zp, + is_double_round=False, + ) diff --git a/backends/arm/operators/op_cat.py b/backends/arm/operators/op_cat.py new file mode 100644 index 0000000000..652eb39737 --- /dev/null +++ b/backends/arm/operators/op_cat.py @@ -0,0 +1,47 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +from typing import List + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class CatVisitor(NodeVisitor): + target = "aten.cat.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + tensors = inputs[0].special + dim = 0 if len(inputs) < 2 else inputs[1].number + rank = len(output.shape) + dim = (dim + rank) % rank + dim = output.dim_order.index(dim) + + attr = ts.TosaSerializerAttribute() + attr.AxisAttribute(dim) + + tosa_graph.addOperator( + TosaOp.Op().CONCAT, [tensor.name for tensor in tensors], [output.name], attr + ) diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 323b11601c..64cde0724f 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -2,7 +2,9 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import List + +# pyre-unsafe +from typing import cast, List import serializer.tosa_serializer as ts import torch @@ -40,7 +42,7 @@ def adjust_pad_if_needed(self, input, weight, stride, pad, dilation): if mod_remainder > pad: raise RuntimeError( - f"ignoring input element is not currently supported, got a large stride {stride}" + "This case should be handled by the SizeAdjustConv2d pass, is it enabled?" ) return pad - mod_remainder @@ -156,11 +158,12 @@ def define_node( # integer value domain of the next op. Otherwise return float32 output. if is_quant_node: # Get scale_factor from input, weight, and output. - _, input_scale, _, _, _, _ = getNodeArgs(node.args[0]) - _, weight_scale, _, _, _, _ = getNodeArgs(node.args[1]) + _, input_scale, _, _, _, _ = getNodeArgs(cast(torch.fx.Node, node.args[0])) + _, weight_scale, _, _, _, _ = getNodeArgs(cast(torch.fx.Node, node.args[1])) _, output_scale, output_zp, _, _, _ = getNodeArgs(list(node.users)[0]) build_rescale_conv_output( tosa_graph, + # pyre-fixme[61]: Uninitialized local [61]: Local variable `conv2d_res` is undefined, or not always defined. conv2d_res, output.name, actual_out_type, diff --git a/backends/arm/operators/op_dequant.py b/backends/arm/operators/op_dequant.py index 269afceccb..afa1dda946 100644 --- a/backends/arm/operators/op_dequant.py +++ b/backends/arm/operators/op_dequant.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_div.py b/backends/arm/operators/op_div.py index e365cf6cfe..0857e0ed32 100644 --- a/backends/arm/operators/op_div.py +++ b/backends/arm/operators/op_div.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
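The axis handling in the CAT visitor above (negative dims normalized, then remapped through the output dim order) can be illustrated standalone:

def normalize_cat_dim(dim: int, output_shape, dim_order) -> int:
    # Normalize negative dims, then translate the logical axis index to its
    # position in the serialized (possibly channels-last) dimension order.
    rank = len(output_shape)
    dim = (dim + rank) % rank
    return list(dim_order).index(dim)

# e.g. concatenating on dim=-3 (channels) of a rank-4 tensor stored NHWC:
print(normalize_cat_dim(-3, (1, 8, 16, 16), (0, 2, 3, 1)))  # -> 3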
+ +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_exp.py b/backends/arm/operators/op_exp.py new file mode 100644 index 0000000000..f98bb3f88c --- /dev/null +++ b/backends/arm/operators/op_exp.py @@ -0,0 +1,83 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import numpy as np + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + +from executorch.backends.arm.tosa_quant_utils import ( + dequantize_value, + get_quant_node_args, + QuantArgs, + quantize_value, +) +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class ExpVisitor(NodeVisitor): + target = "aten.exp.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + assert len(node.all_input_nodes) == 1 + assert len(node.users) == 1 + + if is_quant_node: + # Assume quantized input is 8 bit. + + # Create attribute for 8 bit table lookup. + input_node = node.all_input_nodes[0] + in_quantargs = get_quant_node_args(input_node) + output_node = list(node.users)[0] + out_quantargs = get_quant_node_args(output_node) + + table = exp_table_8bit(in_quantargs, out_quantargs) + table_attr = ts.TosaSerializerAttribute() + table_attr.TableAttribute(table) + + tosa_graph.addOperator( + TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ) + else: + tosa_graph.addOperator(TosaOp.Op().EXP, [inputs[0].name], [output.name]) + + +def exp_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs): + """ + Returns a table mapping 256 entries to exp([qmin,qmax]) + """ + + def exp(x): + # Convert quantized input to floating point exp input space. + v = dequantize_value(x, in_quantargs) + # Compute exp. + v = np.exp(v) + # Convert exp output back to quantized space. + return quantize_value(v, out_quantargs) + + return [ + exp(x) + for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8) + ] diff --git a/backends/arm/operators/op_full.py b/backends/arm/operators/op_full.py index f929b02ee6..eec27bb909 100644 --- a/backends/arm/operators/op_full.py +++ b/backends/arm/operators/op_full.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import numpy as np diff --git a/backends/arm/operators/op_get_item.py b/backends/arm/operators/op_get_item.py index 59004f4968..a696b33aa7 100644 --- a/backends/arm/operators/op_get_item.py +++ b/backends/arm/operators/op_get_item.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
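The 8-bit TABLE built for exp (and for log below) amounts to quantize(f(dequantize(q))) over the 256 representable int8 inputs. A simplified, self-contained sketch with made-up quantization parameters follows; the patch itself builds the table via its QuantArgs helpers and np.linspace over [qmin, qmax].

import numpy as np

def build_int8_table(f, in_scale, in_zp, out_scale, out_zp):
    # One table entry per representable int8 input value.
    qs = np.arange(-128, 128, dtype=np.int32)
    x = (qs - in_zp) * in_scale              # dequantize
    y = np.round(f(x) / out_scale) + out_zp  # apply op, requantize
    return np.clip(y, -128, 127).astype(np.int8)

table = build_int8_table(np.exp, in_scale=0.05, in_zp=0, out_scale=0.1, out_zp=-128)
print(table[:4])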
+ +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py index 3d58f6d628..62c0a27f05 100644 --- a/backends/arm/operators/op_hardtanh.py +++ b/backends/arm/operators/op_hardtanh.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_log.py b/backends/arm/operators/op_log.py new file mode 100644 index 0000000000..5276173efa --- /dev/null +++ b/backends/arm/operators/op_log.py @@ -0,0 +1,83 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import numpy as np + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + +from executorch.backends.arm.tosa_quant_utils import ( + dequantize_value, + get_quant_node_args, + QuantArgs, + quantize_value, +) +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class LogVisitor(NodeVisitor): + target = "aten.log.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + assert len(node.all_input_nodes) == 1 + assert len(node.users) == 1 + + if is_quant_node: + # Assume quantized input is 8 bit. + + # Create attribute for 8 bit table lookup. + input_node = node.all_input_nodes[0] + in_quantargs = get_quant_node_args(input_node) + output_node = list(node.users)[0] + out_quantargs = get_quant_node_args(output_node) + + table = log_table_8bit(in_quantargs, out_quantargs) + table_attr = ts.TosaSerializerAttribute() + table_attr.TableAttribute(table) + + tosa_graph.addOperator( + TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ) + else: + tosa_graph.addOperator(TosaOp.Op().LOG, [inputs[0].name], [output.name]) + + +def log_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs): + """ + Returns a table mapping 256 entries to log([qmin,qmax]) + """ + + def log(x): + # Convert quantized input to floating point log input space. + v = dequantize_value(x, in_quantargs) + # Compute log. + v = np.log(v) + # Convert log output back to quantized space. + return quantize_value(v, out_quantargs) + + return [ + log(x) + for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8) + ] diff --git a/backends/arm/operators/op_mean_dim.py b/backends/arm/operators/op_mean_dim.py index 20e1b2b8d7..3c9aea3085 100644 --- a/backends/arm/operators/op_mean_dim.py +++ b/backends/arm/operators/op_mean_dim.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts @@ -11,7 +13,6 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_utils import build_avg_pool_2d_common @register_node_visitor @@ -30,29 +31,4 @@ def define_node( is_quant_node: bool, ) -> None: - input_tensor = inputs[0] - dim = node.args[1] - keep_dim = node.args[2] - - # mean.dim(-1, -2) is the same as avg_pool2d when just computing mean over HW dimensions. - # Since tosa doesn't have mean.dim operation, lowers it to average pooling instead. - if dim == [-1, -2]: - if keep_dim is True: - # Given the shape format of input is (N, C, H, W) - kernel_size = [input_tensor.shape[2], input_tensor.shape[3]] - stride = [1, 1] - padding = [0, 0, 0, 0] - - build_avg_pool_2d_common( - node, - tosa_graph, - input_tensor, - kernel_size, - stride, - padding, - is_quant_node, - output, - ) - return - raise AssertionError("unsupported") diff --git a/backends/arm/operators/op_mm.py b/backends/arm/operators/op_mm.py new file mode 100644 index 0000000000..ebddb3a40e --- /dev/null +++ b/backends/arm/operators/op_mm.py @@ -0,0 +1,109 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts +import torch +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_quant_utils import build_rescale, get_quant_node_args +from executorch.backends.arm.tosa_utils import ( + build_reshape, + expand_dims, + get_two_inputs, +) +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class MMVisitor(NodeVisitor): + target = "aten.mm.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + input0, input1 = get_two_inputs(node) + + # For atem.mm, the two inputs are of rank 2 + # For TOSA it needs to be rank 3 + # So they need to be reshaped from (H, W) to (1, H, W) + # NOTE: For now, only INT8 & FP32 is supported + reshape_dtype = ts.DType.INT8 if is_quant_node else ts.DType.FP32 + input0_reshaped = expand_dims(tosa_graph, inputs[0], reshape_dtype, 0) + input1_reshaped = expand_dims(tosa_graph, inputs[1], reshape_dtype, 0) + + # The output also needs to be rank 3 + output_new_shape = (1, output.shape[0], output.shape[1]) + + # For INT8, we need to get the zero point, otherwise it is 0 + input0_zp, input1_zp = 0, 0 + if is_quant_node: + input0_zp = get_quant_node_args(input0).zp + input1_zp = get_quant_node_args(input1).zp + + mat_mul_result = tosa_graph.addIntermediate( + output_new_shape, ts.DType.INT32 if is_quant_node else output.dtype + ) + + attr = ts.TosaSerializerAttribute() + attr.MatMulAttribute(A_zp=input0_zp, B_zp=input1_zp) + + tosa_graph.addOperator( + TosaOp.Op().MATMUL, + [input0_reshaped.name, input1_reshaped.name], + [mat_mul_result.name], + attr, + ) + + if is_quant_node: + reshape_intermediate = tosa_graph.addIntermediate( + output.shape, ts.DType.INT32 + ) + reshape_output_name = reshape_intermediate.name + else: + reshape_output_name = output.name + + # Reshape the final output back to rank 
2 + build_reshape( + tosa_graph, mat_mul_result.name, output.shape, reshape_output_name + ) + + # As INT8 accumulates into INT32, we need to rescale it back to INT8 + if is_quant_node: + input0_q_params = get_quant_node_args(input0) + input1_q_params = get_quant_node_args(input1) + output_q_params = get_quant_node_args(list(node.users)[0]) + + final_output_scale = ( + input0_q_params.scale * input1_q_params.scale + ) / output_q_params.scale + + # As the input will be INT32, the input_zp must be set to 0 + build_rescale( + tosa_fb=tosa_graph, + scale=final_output_scale, + # pyre-ignore[61]: Uninitialized local [61]: Local variable `reshape_intermediate` is undefined, or not always defined. + input_node=reshape_intermediate, + output_name=output.name, + output_type=ts.DType.INT8, + output_shape=reshape_intermediate.shape, + input_zp=0, + output_zp=output_q_params.zp, + is_double_round=False, + ) diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py new file mode 100644 index 0000000000..c152e8759e --- /dev/null +++ b/backends/arm/operators/op_mul.py @@ -0,0 +1,89 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from typing import cast, List + +import executorch.backends.arm.tosa_quant_utils as tqutils +import executorch.backends.arm.tosa_utils as tutils + +import serializer.tosa_serializer as ts +import torch + +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class MulVisitor(NodeVisitor): + target = "aten.mul.Tensor" + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + if is_quant_node: + input_A = inputs[0] + input_B = inputs[1] + input_A_qargs = tqutils.get_quant_node_args( + cast(torch.fx.Node, node.args[0]) + ) + input_B_qargs = tqutils.get_quant_node_args( + cast(torch.fx.Node, node.args[1]) + ) + + input_A.shape = tutils.tosa_shape(input_A.shape, input_A.dim_order) + input_B.shape = tutils.tosa_shape(input_B.shape, input_B.dim_order) + output_shape = tutils.tosa_shape(output.shape, output.dim_order) + + # Rescale inputs to INT32 with zp=0 + input_A_rescaled = tqutils.build_rescale_to_int32( + tosa_graph, + input_A, + input_A_qargs.zp, + rescale_scale=1.0, + ) + input_B_rescaled = tqutils.build_rescale_to_int32( + tosa_graph, + input_B, + input_B_qargs.zp, + rescale_scale=1.0, + ) + + mul_output = tosa_graph.addIntermediate(output_shape, ts.DType.INT32) + + # Do the INT32 Mul + attr = ts.TosaSerializerAttribute() + attr.MulAttribute(shift=0) + tosa_graph.addOperator( + TosaOp.Op().MUL, + [ + input_A_rescaled.name, + input_B_rescaled.name, + ], + [mul_output.name], + attr, + ) + + tqutils.rescale_node_back_to_int8( + node, mul_output, input_A_qargs.scale * input_B_qargs.scale, tosa_graph + ) + + else: + attr = ts.TosaSerializerAttribute() + attr.MulAttribute(shift=0) + tosa_graph.addOperator( + TosaOp.Op().MUL, [inputs[0].name, inputs[1].name], [output.name], attr + ) diff --git a/backends/arm/operators/op_output.py b/backends/arm/operators/op_output.py index 7d163114aa..1b053b18ed 100644 --- a/backends/arm/operators/op_output.py +++ b/backends/arm/operators/op_output.py @@ -3,6 +3,10 @@ # This source 
code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + +from typing import cast + import serializer.tosa_serializer as ts import torch @@ -11,7 +15,7 @@ def process_output( node: torch.fx.Node, tosa_graph: ts.TosaSerializer, ): - for output in node.args[0]: + for output in cast(tuple[torch.fx.Node, ...], node.args[0]): tosa_graph.addOutputTensor( tosa_graph.currRegion.currBasicBlock.tensors[output.name] ) diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py index eafd6af367..167a0c382f 100644 --- a/backends/arm/operators/op_permute.py +++ b/backends/arm/operators/op_permute.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_placeholder.py b/backends/arm/operators/op_placeholder.py index 0b2e65f45d..b5dcf3f987 100644 --- a/backends/arm/operators/op_placeholder.py +++ b/backends/arm/operators/op_placeholder.py @@ -3,9 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import numpy as np import serializer.tosa_serializer as ts -import torch +import torch.fx from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_quant_utils import ( get_quant_arg_dtype, @@ -130,6 +132,21 @@ def process_inputs_to_buffers( ) +def process_inputs_to_lifted_tensor_constants( + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + edge_program: ExportedProgram, +): + arg = TosaArg(node) + tensor_name = edge_program.graph_signature.inputs_to_lifted_tensor_constants[ + arg.name + ] + tensor = edge_program.tensor_constants[tensor_name] + tensor_data = tensor.detach().numpy() + + tosa_graph.addConst(tensor_data.shape, arg.dtype, tensor_data, name=arg.name) + + def process_placeholder( node: torch.fx.Node, tosa_graph: ts.TosaSerializer, @@ -145,5 +162,11 @@ def process_placeholder( process_inputs_to_parameters(node, tosa_graph, edge_program) elif node.name in edge_program.graph_signature.inputs_to_buffers: process_inputs_to_buffers(node, tosa_graph, edge_program) + elif node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants: + process_inputs_to_lifted_tensor_constants(node, tosa_graph, edge_program) + elif node.name in edge_program.graph_signature.inputs_to_lifted_custom_objs: + raise NotImplementedError( + "Placeholder is of type 'lifted custom object' which is not supported." + ) else: - raise RuntimeError(f"Unknown placeholder {node.name}") + raise RuntimeError(f"Placeholder '{node.name}' is of unknown type.") diff --git a/backends/arm/operators/op_quant.py b/backends/arm/operators/op_quant.py index e6a62b3f20..8f83e79442 100644 --- a/backends/arm/operators/op_quant.py +++ b/backends/arm/operators/op_quant.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py new file mode 100644 index 0000000000..20bba3f654 --- /dev/null +++ b/backends/arm/operators/op_relu.py @@ -0,0 +1,57 @@ +# Copyright 2024 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import executorch.backends.arm.tosa_quant_utils as tqutils +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class ReluVisitor(NodeVisitor): + target = "aten.relu.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: list[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + attr = ts.TosaSerializerAttribute() + + clamp_min_fp = 0.0 + clamp_max_fp = 0.0 + clamp_min_qs = 0 + clamp_max_qs = 0 + if is_quant_node: + out_qargs = tqutils.get_quant_node_args(list(node.users)[0]) + clamp_min_qs = tqutils.quantize_value(0, out_qargs) + clamp_max_qs = tqutils.quantize_value(float("inf"), out_qargs) + + else: + clamp_min_fp = 0 + clamp_max_fp = float("inf") + + attr.ClampAttribute( + tosa_graph.builder, + clamp_min_qs, + clamp_max_qs, + clamp_min_fp, + clamp_max_fp, + ) + + tosa_graph.addOperator(TosaOp.Op().CLAMP, [inputs[0].name], [output.name], attr) diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py new file mode 100644 index 0000000000..20de9e0846 --- /dev/null +++ b/backends/arm/operators/op_repeat.py @@ -0,0 +1,68 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import serializer.tosa_serializer as ts +import torch +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_utils import tosa_shape +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class RepeatVisitor(NodeVisitor): + target = "aten.repeat.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: list[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + item_name = inputs[0].name + shape = inputs[0].shape + rank = len(shape) + multiples = inputs[1].special + new_rank = len(multiples) + + assert new_rank >= rank + + # TILE only supports rank(in) == rank(out). To add more dims, we need a reshape first. 
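+        # For example, x.repeat(2, 2) on a tensor of shape (3,) has multiples of
+        # rank 2, so the input is first reshaped to (1, 3) and the TILE below then
+        # produces a (2, 6) output.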
+ if new_rank > rank: + # Add length 1 dimensions to shape to match multiples + num_new_dims = new_rank - rank + expanded_shape = tuple( + 1 if i < num_new_dims else shape[i - num_new_dims] + for i in range(new_rank) + ) + expanded_shape = tosa_shape(expanded_shape, output.dim_order) + dtype = ( + ts.dtype_str_to_val("INT8") + if is_quant_node + else ts.dtype_str_to_val("FP32") + ) + + rescale_out = tosa_graph.addIntermediate(expanded_shape, dtype) + rescale_attr = ts.TosaSerializerAttribute() + rescale_attr.ReshapeAttribute(expanded_shape) + tosa_graph.addOperator( + TosaOp.Op().RESHAPE, [item_name], [rescale_out.name], rescale_attr + ) + item_name = rescale_out.name + + attr = ts.TosaSerializerAttribute() + attr.TileAttribute(tosa_shape(multiples, output.dim_order)) + tosa_graph.addOperator(TosaOp.Op().TILE, [item_name], [output.name], attr) diff --git a/backends/arm/operators/op_sigmoid.py b/backends/arm/operators/op_sigmoid.py index 884c803482..0087b1f7a8 100644 --- a/backends/arm/operators/op_sigmoid.py +++ b/backends/arm/operators/op_sigmoid.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import numpy as np diff --git a/backends/arm/operators/op_slice.py b/backends/arm/operators/op_slice.py index 8d59835ff0..0dfb287cd7 100644 --- a/backends/arm/operators/op_slice.py +++ b/backends/arm/operators/op_slice.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts @@ -40,6 +42,8 @@ def define_node( shape = input_node.shape dim = dim.number end = (shape[dim] + end.number) % shape[dim] + if end == 0: + end = shape[dim] size = end - start.number assert size > 0 assert size <= shape[dim] diff --git a/backends/arm/operators/op_softmax.py b/backends/arm/operators/op_softmax.py index 627fa64aed..1ac4241318 100644 --- a/backends/arm/operators/op_softmax.py +++ b/backends/arm/operators/op_softmax.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts @@ -33,7 +35,7 @@ def define_node( input_name = inputs[0].name dim_order = inputs[0].dim_order input_shape = tosa_shape(inputs[0].shape, dim_order) - dim_value = dim_order.index(inputs[1].number) + dim_value = dim_order.index(inputs[1].number % len(dim_order)) ## softmax = exp(logits - max(logits)) / reduce_sum(exp(logits - max(logits)), -1) # FP32 diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py index 3dc1519f37..2089b6e9e9 100644 --- a/backends/arm/operators/op_sub.py +++ b/backends/arm/operators/op_sub.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import executorch.backends.arm.tosa_quant_utils as tqutils diff --git a/backends/arm/operators/op_unsqueeze.py b/backends/arm/operators/op_unsqueeze.py new file mode 100644 index 0000000000..c14128fdc8 --- /dev/null +++ b/backends/arm/operators/op_unsqueeze.py @@ -0,0 +1,53 @@ +# Copyright 2024 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Follows this specification: https://pytorch.org/docs/stable/generated/torch.unsqueeze.html + +# pyre-unsafe + +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_utils import tosa_shape +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class UnsqueezeVisitor(NodeVisitor): + target = "aten.unsqueeze_copy.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: list[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + dim = inputs[1].number + shape = inputs[0].shape + rank = len(shape) + + assert -rank - 1 <= dim < rank + 1 + if dim < 0: + dim = dim + rank + 1 + + new_shape = list(shape) + new_shape.insert(dim, 1) + new_shape = tosa_shape(new_shape, output.dim_order) + + attr = ts.TosaSerializerAttribute() + attr.ReshapeAttribute(new_shape) + tosa_graph.addOperator( + TosaOp.Op().RESHAPE, [inputs[0].name], [output.name], attr + ) diff --git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py index 682eacd5e3..8667df590d 100644 --- a/backends/arm/operators/op_view.py +++ b/backends/arm/operators/op_view.py @@ -2,10 +2,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts import torch +import tosa.Op as TosaOp from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, @@ -13,7 +16,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import tosa_shape -from serializer.tosa_serializer import TosaOp @register_node_visitor diff --git a/backends/arm/passes/TARGETS b/backends/arm/passes/TARGETS new file mode 100644 index 0000000000..ca20b03fcc --- /dev/null +++ b/backends/arm/passes/TARGETS @@ -0,0 +1,12 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "passes", + srcs = glob(["*.py"]), + typing = True, + deps = [ + "//executorch/backends/arm:tosa_quant_utils", + "//executorch/backends/arm:tosa_utils", + "//executorch/exir:lib", + ], +) diff --git a/backends/arm/passes/annotate_channels_last_dim_order_pass.py b/backends/arm/passes/annotate_channels_last_dim_order_pass.py index 9bb45c504a..222c0a7cb3 100644 --- a/backends/arm/passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/passes/annotate_channels_last_dim_order_pass.py @@ -4,6 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + +from typing import cast + import torch from executorch.backends.arm.tosa_quant_utils import dq_op from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d @@ -28,9 +32,11 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node): if node.target != dq_op: return False prev_node = node.args[0] - if prev_node.op != "placeholder": + if cast(torch.fx.Node, prev_node).op != "placeholder": return False - return is_consumer_node_depthwise_conv2d(node) + if is_consumer_node_depthwise_conv2d(node): + consumer_node = list(node.users)[0] + return consumer_node.args[1] == node elif node.op == "placeholder": # node is an input, weight or bias node consumer_node = list(node.users)[0] @@ -46,7 +52,9 @@ def call(self, graph_module: torch.fx.GraphModule): NHWC_Order = (0, 2, 3, 1) HWCM_Order = (2, 3, 0, 1) for node in graph_module.graph.nodes: - if isinstance(node.meta["val"], tuple): + if isinstance( + node.meta["val"], (tuple, torch.fx.immutable_collections.immutable_list) + ): node_data = node.meta["val"][0].data else: node_data = node.meta["val"].data diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index c2453f701f..75ef551171 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -5,25 +5,41 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch from executorch.backends.arm.passes.annotate_channels_last_dim_order_pass import ( AnnotateChannelsLastDimOrder, ) +from executorch.backends.arm.passes.convert_expand_copy_to_repeat import ( + ConvertExpandCopyToRepeatPass, +) +from executorch.backends.arm.passes.convert_split_to_slice import ( + ConvertSplitToSlicePass, +) +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass +from executorch.backends.arm.passes.size_adjust_conv2d_pass import SizeAdjustConv2DPass from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.pass_manager import PassManager class ArmPassManager(PassManager): - def _transform(self, graph_module: torch.fx.Graph): + def _transform(self, graph_module: torch.fx.GraphModule): return self(graph_module).graph_module def transform_to_backend_pipeline( - self, graph_module: torch.fx.Graph, compile_spec: CompileSpec + self, graph_module: torch.fx.GraphModule, compile_spec: list[CompileSpec] ): """Apply passes before transforming program to backend""" + self.add_pass(SizeAdjustConv2DPass()) self.add_pass(RemoveClonePass()) + self.add_pass(ConvertExpandCopyToRepeatPass()) + self.add_pass(ConvertMeanDimToAveragePool()) + self.add_pass(ConvertSplitToSlicePass()) for spec in compile_spec: if spec.key == "permute_memory_format": memory_format = spec.value.decode() diff --git a/backends/arm/passes/convert_expand_copy_to_repeat.py b/backends/arm/passes/convert_expand_copy_to_repeat.py new file mode 100644 index 0000000000..249c014ae6 --- /dev/null +++ b/backends/arm/passes/convert_expand_copy_to_repeat.py @@ -0,0 +1,63 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
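+# Worked example of the conversion implemented in this pass: for x of shape (1, 3),
+# x.expand(4, 3) records target sizes [4, 3]; only singleton dimensions may actually
+# repeat, so the repeat multiples become [4, 1] and repeat(x, [4, 1]) reproduces the
+# (4, 3) result.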
+ +# pyre-unsafe + +from typing import cast + +import torch.fx +from executorch.backends.arm.tosa_mapping import extract_tensor_meta +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + + +class ConvertExpandCopyToRepeatPass(ExportPass): + """ + Replace expand copy with repeat since it is a repeat that can only repeat singleton dimensions. + """ + + expand_copy = exir_ops.edge.aten.expand_copy.default + repeat = exir_ops.edge.aten.repeat.default + patterns = [{expand_copy: 1}] + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + partitions = get_source_partitions( + graph, [torch.expand_copy, torch.Tensor.expand, "expand"] + ) + for _, src_partitions in partitions.items(): + for src_partition in src_partitions: + assert len(src_partition.nodes) == 1 + + expand_node = src_partition.nodes[0] + _, shape, _ = extract_tensor_meta(expand_node.all_input_nodes[0].meta) + multiples = cast(tuple[int], expand_node.args[1]) + expanded_rank = len(multiples) + + # Expanded shape is 'shape' front-padded with ones. + padding = expanded_rank - len(shape) + extended_shape = [ + shape[i] if i >= 0 else 1 for i in range(-padding, len(shape)) + ] + + # To convert expand arg to repeat arg, non-repeated dims should have + # multiples[dim] = 1. + multiples = [ + multiples[i] if extended_shape[i] == 1 else 1 + for i in range(expanded_rank) + ] + args = (expand_node.args[0], multiples) + + with graph_module.graph.inserting_before(expand_node): + repeat_node = graph.create_node("call_function", self.repeat, args) + repeat_node.meta = expand_node.meta + for user in expand_node.users.copy(): + user.replace_input_with(expand_node, repeat_node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/arm/passes/convert_split_to_slice.py b/backends/arm/passes/convert_split_to_slice.py new file mode 100644 index 0000000000..29aae37fe9 --- /dev/null +++ b/backends/arm/passes/convert_split_to_slice.py @@ -0,0 +1,72 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import torch.fx +from executorch.backends.arm.tosa_mapping import extract_tensor_meta +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ConvertSplitToSlicePass(ExportPass): + """ + Replace a split operation with many slice operations. + """ + + split_ops = ( + exir_ops.edge.aten.split_with_sizes_copy.default, + exir_ops.edge.aten.split_copy.Tensor, + ) + slice = exir_ops.edge.aten.slice_copy.Tensor + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + for node in graph.nodes: + if node.target not in self.split_ops: + continue + + # Get useful variables + split_node = node + input_node = split_node.all_input_nodes[0] + output_nodes = split_node.users.copy() + _, shape, _ = extract_tensor_meta(input_node.meta) + rank = len(shape) + split_lengths = split_node.args[1] + dim = split_node.args[2] if len(split_node.args) > 2 else 0 + dim = (dim + rank) % rank + + assert ( + sum(split_lengths) == shape[dim] + ), "Given split lengths don't sum up to the size of the dimension." 
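+            # For example, split_lengths [2, 3, 5] on a dimension of size 10
+            # become slices [0:2], [2:5] and [5:10] along `dim`.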
+ + # Convert split argument 'split_lengths' to slice arguments start and end. + starts = [0] * len(split_lengths) + ends = [0] * len(split_lengths) + start = 0 + end = 0 + for i, split_length in enumerate(split_lengths): + end = start + split_length + starts[i] = start + ends[i] = end + start = end + + # Output nodes are of type getitem + # Create one slice node for each output node with matching argumetns. + with graph_module.graph.inserting_before(split_node): + for output_node in output_nodes: + index = output_node.args[1] + slice_node = graph.create_node( + "call_function", + self.slice, + (input_node, dim, starts[index], ends[index]), + ) + slice_node.meta = split_node.meta.copy() + slice_node.meta["val"] = slice_node.meta["val"][index] + output_node.replace_input_with(split_node, slice_node) + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/arm/passes/meandim_to_averagepool_pass.py b/backends/arm/passes/meandim_to_averagepool_pass.py new file mode 100644 index 0000000000..0974eac740 --- /dev/null +++ b/backends/arm/passes/meandim_to_averagepool_pass.py @@ -0,0 +1,54 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from typing import Any, cast, Dict, Tuple + +import torch.fx + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue + +Argument = Any + + +class ConvertMeanDimToAveragePool(ExportPass): + """ + Replace a mean operation with dim = [-1, -2] and keep_dim = True with an average pool operation. + """ + + def call_operator( + self, + op: torch.fx.node.Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op != exir_ops.edge.aten.mean.dim: + return super().call_operator(op, args, kwargs, meta) + + input_value = cast(ProxyValue, args[0]) + dim = cast(list, args[1]) + keep_dim = cast(bool, args[2]) if len(args) > 2 else False + + # averagepool2d gets converted to a mean operation with dim = [-1, -2] and keep_dim = True + # so check the dim argument for this case + if dim == [-1, -2] and keep_dim is True: + # Given the shape format of input is (N, C, H, W) + kernel_size = [ + input_value.to_tensor().size()[2], + input_value.to_tensor().size()[3], + ] + stride = [1, 1] + return super().call_operator( + exir_ops.edge.aten.avg_pool2d.default, + (input_value, kernel_size, stride), + {}, + meta, + ) + else: + return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/passes/remove_clone_pass.py b/backends/arm/passes/remove_clone_pass.py index 6108080cb0..64a1ae8f43 100644 --- a/backends/arm/passes/remove_clone_pass.py +++ b/backends/arm/passes/remove_clone_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult diff --git a/backends/arm/passes/size_adjust_conv2d_pass.py b/backends/arm/passes/size_adjust_conv2d_pass.py new file mode 100644 index 0000000000..980ab09e59 --- /dev/null +++ b/backends/arm/passes/size_adjust_conv2d_pass.py @@ -0,0 +1,131 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from typing import cast, Optional + +import torch.fx +from executorch.backends.arm.tosa_quant_utils import is_quant_node +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch._ops import OpOverload + + +def conv_remainder(input_length, pad, dilation, weight, stride): + """ + Returns the size + """ + return (input_length + 2 * pad - dilation * (weight - 1) - 1) % stride + + +def insert_q_dq_pair( + graph: torch.fx.Graph, + anchor: torch.fx.Node, + q_params: tuple, +): + with graph.inserting_after(anchor): + q = create_node( + graph=graph, + op_target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + args=(), # We add the argument last + ) + q.meta = anchor.meta + + with graph.inserting_after(q): + dq = create_node( + graph=graph, + op_target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + args=(q,) + q_params, + ) + dq.meta = q.meta + + anchor.replace_all_uses_with(dq) + # We add this last so the replace all uses above does not replace the quantized + # node's first use + q.args = (anchor,) + q_params + return dq + + +def create_node( + graph: torch.fx.Graph, + op_target: OpOverload, + args: tuple = (), + kwargs: Optional[dict] = None, +): + return graph.create_node( + "call_function", + op_target, + args=args, + kwargs=kwargs or {}, + ) + + +class SizeAdjustConv2DPass(ExportPass): + """ + Adjust the convolution input size to match perfectly with the + weight size, padding, stride and dilation parameters. + This is done by inserting a slice op to remove the uneven end of the input. + """ + + conv2d_op = exir_ops.edge.aten.convolution.default + slice_op = exir_ops.edge.aten.slice_copy.Tensor + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + modified_graph = False + for node in graph.nodes: + if node.op != "call_function": + continue + if node.target != self.conv2d_op: + continue + + conv_node = cast(torch.fx.Node, node) + input_node, weight, _, stride_hw, pad_hw, dilation_hw, _, _, _ = ( + conv_node.args + ) + weight_shape = cast(torch.fx.Node, weight).meta["val"].shape + input_shape = cast(torch.fx.Node, input_node).meta["val"].shape + + slice_args = [] + for stride, pad, dilation, dim in zip( + cast(list, stride_hw), + cast(list, pad_hw), + cast(list, dilation_hw), + (2, 3), + ): + remainder = conv_remainder( + input_shape[dim], pad, dilation, weight_shape[dim], stride + ) + if remainder > pad: + adjustment = remainder - pad + args = (dim, 0, input_shape[dim] - adjustment) + slice_args.append(args) + if len(slice_args) == 0: + continue + + with graph_module.graph.inserting_before(node): + last_node = cast(torch.fx.Node, input_node) + for args in slice_args: + slice_node = graph.create_node( + "call_function", self.slice_op, (last_node,) + args + ) + if is_quant_node(last_node): + q_params = last_node.args[1:] + dq_node = insert_q_dq_pair( + graph_module.graph, slice_node, q_params + ) + last_node = dq_node + else: + last_node = slice_node + conv_node.replace_input_with(cast(torch.fx.Node, input_node), last_node) + modified_graph = True + + if modified_graph: + graph_module = super().call(graph_module).graph_module + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/arm/passes/tag_io_quant_pass.py 
b/backends/arm/passes/tag_io_quant_pass.py index d2bf74462e..2fce6cf3fd 100644 --- a/backends/arm/passes/tag_io_quant_pass.py +++ b/backends/arm/passes/tag_io_quant_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult diff --git a/backends/arm/quantizer/TARGETS b/backends/arm/quantizer/TARGETS new file mode 100644 index 0000000000..840586488b --- /dev/null +++ b/backends/arm/quantizer/TARGETS @@ -0,0 +1,31 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "arm_quantizer", + srcs = ["arm_quantizer.py"], + typing = True, + deps = [ + ":arm_quantizer_utils", + "//caffe2:torch", + "//executorch/backends/arm/quantizer/quantization_annotation:quantization_annotation", + "//executorch/exir:lib", + ], +) + +python_library( + name = "quantization_config", + srcs = ["quantization_config.py"], + typing = True, + deps = [ + "//caffe2:torch", + ], +) + +python_library( + name = "arm_quantizer_utils", + srcs = ["arm_quantizer_utils.py"], + typing = True, + deps = [ + ":quantization_config", + ], +) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index 397ba68565..853fd47c29 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Quantizer for Arm backend # @@ -73,6 +75,7 @@ def _supported_symmetric_quantized_operators() -> Dict[str, List[OperatorPattern [torch.nn.AdaptiveAvgPool2d], [F.adaptive_avg_pool2d], ], + "mul": [torch.mul], "sub": [[torch.sub]], } return copy.deepcopy(supported_operators) @@ -265,6 +268,9 @@ class ArmQuantizer(Quantizer): "sub", "mul", "sigmoid", + "mm", + "cat", + "one_to_one", ] def __init__(self) -> None: @@ -385,7 +391,7 @@ def _annotate_io( for node in model.graph.nodes: if arm_quantizer_utils.is_annotated(node): continue - if node.op == "placeholder": + if node.op == "placeholder" and len(node.users) > 0: _annotate_output_qspec( node, quantization_config.get_output_act_qspec(), diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index 89703f89b0..fe9c5e34e6 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -5,11 +5,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Utility functions for ArmQuantizer # -from typing import Callable, cast, List +import operator +from typing import Callable, cast, List, Union import torch from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig @@ -71,7 +74,7 @@ def get_shared_qspec( Both outputs are None if one of the inputs is a node that can't be quantized. 
""" - input_act0 = node.args[0] + input_act0 = cast(Node, node.args[0]) input_act1 = node.args[1] input_act_qspec = quantization_config.get_input_act_qspec() @@ -101,12 +104,19 @@ def is_input_ok_for_quantization(input_act: Node, gm: GraphModule): ) +def get_node_target(module: torch.nn.Module | GraphModule, target_str: str): + targets = target_str.split(".") + for target in targets[:-1]: + module = module.get_submodule(target) + return getattr(module, targets[-1]) + + def is_input_large_scalar(node: Node, gm: GraphModule): """Check if input is a large scalar value. So that we can skip quantization for the node since histc op (in HistogramObserver) only works for values up to certain upper bound """ if node.op == "get_attr" and isinstance(node.target, str): - tensor = getattr(gm, node.target) + tensor = get_node_target(gm, node.target) # torch.histc works until this upper bound HISTC_UPPER_BOUND = 3.4028235e15 return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND @@ -130,6 +140,7 @@ def is_share_obs_or_fq_op(op: Callable) -> bool: return op in [ torch.ops.aten.hardtanh.default, torch.ops.aten.hardtanh_.default, + torch.ops.aten.relu.default, torch.ops.aten.mean.default, torch.ops.aten.mean.dim, torch.ops.aten.permute.default, @@ -141,8 +152,11 @@ def is_share_obs_or_fq_op(op: Callable) -> bool: torch.ops.aten.view_copy.default, torch.ops.aten.view.default, torch.ops.aten.slice.Tensor, + torch.ops.aten.split.Tensor, + torch.ops.aten.split_with_sizes.default, torch.ops.aten.flatten.using_ints, torch.ops.aten.dropout.default, + operator.getitem, ] @@ -157,7 +171,9 @@ def propagate_annotation(model: GraphModule) -> None: n = cast(Node, n) if is_annotated(n): continue - if n.op != "call_function" or not is_share_obs_or_fq_op(n.target): + if n.op != "call_function" or not is_share_obs_or_fq_op( + cast(Callable, n.target) + ): continue prev_node = n.args[0] @@ -205,7 +221,7 @@ def convert_scalars_to_attrs(model: GraphModule) -> GraphModule: prefix = "_tensor_constant_" get_new_attr_name = get_new_attr_name_with_prefix(prefix) tensor_constant_name = get_new_attr_name(model) - float_tensor = torch.tensor(float(args[i])) + float_tensor = torch.tensor(float(cast(Union[int, float], args[i]))) model.register_buffer(tensor_constant_name, float_tensor) fake_mode = n.meta["val"].fake_mode with model.graph.inserting_before(n): diff --git a/backends/arm/quantizer/quantization_annotation/TARGETS b/backends/arm/quantizer/quantization_annotation/TARGETS new file mode 100644 index 0000000000..4ce8b5cad2 --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/TARGETS @@ -0,0 +1,12 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "quantization_annotation", + srcs = glob(["*.py"]), + typing = True, + deps = [ + "//caffe2:torch", + "//executorch/backends/arm/quantizer:arm_quantizer_utils", + "//executorch/backends/arm/quantizer:quantization_config", + ], +) diff --git a/backends/arm/quantizer/quantization_annotation/__init__.py b/backends/arm/quantizer/quantization_annotation/__init__.py index d162bfd479..f7219201de 100644 --- a/backends/arm/quantizer/quantization_annotation/__init__.py +++ b/backends/arm/quantizer/quantization_annotation/__init__.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + from typing import Callable, Dict, List, NamedTuple, Optional @@ -49,10 +51,13 @@ def decorator(annotator: AnnotatorType): from . import ( # noqa adaptive_ang_pool2d_annotator, add_annotator, + cat_annotator, conv_annotator, linear_annotator, max_pool2d_annotator, + mm_annotator, mul_annotator, + one_to_one_annotator, sigmoid_annotator, sub_annotator, ) diff --git a/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py b/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py index acbdc45b6b..723a48f664 100644 --- a/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/add_annotator.py b/backends/arm/quantizer/quantization_annotation/add_annotator.py index 2926e92f24..35801bd568 100644 --- a/backends/arm/quantizer/quantization_annotation/add_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/add_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/cat_annotator.py b/backends/arm/quantizer/quantization_annotation/cat_annotator.py new file mode 100644 index 0000000000..6e138cd9de --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/cat_annotator.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import itertools +from typing import Callable, cast, List, Optional + +import torch.fx +from executorch.backends.arm.quantizer import arm_quantizer_utils +from executorch.backends.arm.quantizer.quantization_annotation import register_annotator +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from torch.ao.quantization.quantizer import ( + QuantizationAnnotation, + SharedQuantizationSpec, +) +from torch.fx import Node +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + + +@register_annotator("cat") +def _annotate_cat( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig, + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + cat_partitions = get_source_partitions(gm.graph, [torch.cat], filter_fn) + cat_partitions = list(itertools.chain.from_iterable(cat_partitions.values())) + annotated_partitions = [] + for cat_partition in cat_partitions: + annotated_partitions.append(cat_partition.nodes) + cat_node = cat_partition.output_nodes[0] + if arm_quantizer_utils.is_annotated(cat_node): + continue + + input_acts = cast(list[torch.fx.Node], cat_node.args[0]) + input_act0 = input_acts[0] + + input_act_qspec = quantization_config.get_input_act_qspec() + shared_with_input0_qspec = SharedQuantizationSpec((input_act0, cat_node)) + + input_qspec_map = {} + + # First input is set to input qspec from the quantization config. 
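+        # For example, with torch.cat((a, b, c)) the first input `a` gets its own
+        # input qspec, while `b`, `c` and the cat output share a
+        # SharedQuantizationSpec tied to the (`a`, cat) edge, keeping all operands
+        # on one scale and zero point.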
+ if isinstance(input_act0, Node): + if not arm_quantizer_utils.is_input_ok_for_quantization(input_act0, gm): + continue + input_qspec_map[input_act0] = input_act_qspec + + # For the rest of the inputs, share qspec with first. + # If we can't quantize any of the inputs, abort annotation. + for input_act in input_acts[1:]: + if isinstance(input_act, Node): + if not arm_quantizer_utils.is_input_ok_for_quantization(input_act, gm): + continue + if input_act is not input_act0: + input_qspec_map[input_act] = shared_with_input0_qspec + + if input_qspec_map is not None: + cat_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_with_input0_qspec, + _annotated=True, + ) + return annotated_partitions diff --git a/backends/arm/quantizer/quantization_annotation/conv_annotator.py b/backends/arm/quantizer/quantization_annotation/conv_annotator.py index 40a1f1ee9e..4ff7dd9e80 100644 --- a/backends/arm/quantizer/quantization_annotation/conv_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/conv_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree.f +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/linear_annotator.py b/backends/arm/quantizer/quantization_annotation/linear_annotator.py index 95b881a954..7c3f91ec70 100644 --- a/backends/arm/quantizer/quantization_annotation/linear_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/linear_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py b/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py index 3d9d8b2e6c..0ef2ee39fe 100644 --- a/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/mm_annotator.py b/backends/arm/quantizer/quantization_annotation/mm_annotator.py new file mode 100644 index 0000000000..b48c6d5990 --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/mm_annotator.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +import itertools +from typing import Callable, List, Optional + +import torch +from executorch.backends.arm.quantizer import arm_quantizer_utils +from executorch.backends.arm.quantizer.quantization_annotation import register_annotator +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from torch.ao.quantization.quantizer import QuantizationAnnotation +from torch.fx import Node +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + + +@register_annotator("mm") +def _annotate_mm( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig, + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + mm_partitions = get_source_partitions(gm.graph, [torch.mm, torch.bmm], filter_fn) + mm_partitions = list(itertools.chain.from_iterable(mm_partitions.values())) + annotated_partitions = [] + for mm_partition in mm_partitions: + annotated_partitions.append(mm_partition.nodes) + mm_node = mm_partition.output_nodes[0] + + if arm_quantizer_utils.is_annotated(mm_node): + continue + + input_act_qspec = quantization_config.get_input_act_qspec() + output_act_qspec = quantization_config.get_output_act_qspec() + + input_qspec_map = {} + input_act0 = mm_node.args[0] + if isinstance(input_act0, Node): + if not arm_quantizer_utils.is_input_ok_for_quantization(input_act0, gm): + continue + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = mm_node.args[1] + if isinstance(input_act1, Node): + if not arm_quantizer_utils.is_input_ok_for_quantization(input_act1, gm): + continue + input_qspec_map[input_act1] = input_act_qspec + + mm_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + return annotated_partitions diff --git a/backends/arm/quantizer/quantization_annotation/mul_annotator.py b/backends/arm/quantizer/quantization_annotation/mul_annotator.py index 6ec8f95531..4717eac320 100644 --- a/backends/arm/quantizer/quantization_annotation/mul_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/mul_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py new file mode 100644 index 0000000000..8d507c11ef --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +from typing import Callable, List, Optional + +import torch +import torch.fx +from executorch.backends.arm.quantizer import arm_quantizer_utils +from executorch.backends.arm.quantizer.quantization_annotation import register_annotator +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from torch.ao.quantization.quantizer.utils import ( + _annotate_input_qspec_map, + _annotate_output_qspec, +) +from torch.fx import Node + + +@register_annotator("one_to_one") +def _annotate_one_to_one( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig, + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + """ + This annotator adds the input and output qspec from the quantization config to + ops in 'one_to_one_ops' that have the following properties: + - Have a single input and single output. + - Can handle different qspecs on the input and output. + + Typical ops are ops implemented with a lookup table. + """ + annotated_partitions = [] + one_to_one_ops = (torch.ops.aten.exp.default, torch.ops.aten.log.default) + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in one_to_one_ops: + continue + if filter_fn and not filter_fn(node): + continue + input_node = node.args[0] + + if not arm_quantizer_utils.is_annotated(node): + _annotate_input_qspec_map( + node, + input_node, + quantization_config.get_input_act_qspec(), + ) + _annotate_output_qspec(node, quantization_config.get_output_act_qspec()) + + arm_quantizer_utils.mark_nodes_as_annotated([node]) + annotated_partitions.append([node]) + + return annotated_partitions diff --git a/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py b/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py index bd683d81f0..3d24269483 100644 --- a/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/sub_annotator.py b/backends/arm/quantizer/quantization_annotation/sub_annotator.py index 4686d480ed..92f1808d02 100644 --- a/backends/arm/quantizer/quantization_annotation/sub_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/sub_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py index f94c3e18da..1e776d37a6 100644 --- a/backends/arm/quantizer/quantization_config.py +++ b/backends/arm/quantizer/quantization_config.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + from dataclasses import dataclass import torch diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 1419fe8f48..6d9ab6b009 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -11,15 +11,15 @@ */ #include - -#include -#include -#include - -#include +#include #include -#include + +#include "executorch/backends/arm/runtime/VelaBinStream.h" +#include "executorch/runtime/backend/interface.h" +#include "executorch/runtime/core/error.h" +#include "executorch/runtime/core/evalue.h" +#include "executorch/runtime/core/exec_aten/util/scalar_type_util.h" using namespace std; @@ -31,7 +31,22 @@ typedef struct { bool permuted_io_flag; } ExecutionHandle; -class ArmBackend final : public PyTorchBackendInterface { +extern "C" { +void __attribute__((weak)) ArmBackend_execute_begin() {} +void __attribute__((weak)) ArmBackend_execute_end() {} +} + +class ArmBackendExecuteCallbacks { + public: + ArmBackendExecuteCallbacks() { + ArmBackend_execute_begin(); + } + ~ArmBackendExecuteCallbacks() { + ArmBackend_execute_end(); + } +}; + +class ArmBackend final : public ::executorch::runtime::BackendInterface { public: ArmBackend() {} @@ -50,7 +65,6 @@ class ArmBackend final : public PyTorchBackendInterface { char* data = (char*)processed->data(); size_t size = processed->size(); - char* foot = data + size - sizeof(VelaBinBlock); // Verify format of vela_bin if (vela_bin_validate(data, size) == false) { @@ -63,6 +77,7 @@ class ArmBackend final : public PyTorchBackendInterface { ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(allocator, ExecutionHandle); handle->processed = processed; + handle->permuted_io_flag = false; for (auto& compile_spec : compile_specs) { if (0 == std::strcmp(compile_spec.key, "permute_memory_format") && 0 == std::memcmp(compile_spec.value.buffer, "nhwc", 4)) { @@ -82,6 +97,7 @@ class ArmBackend final : public PyTorchBackendInterface { ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle; VelaHandles handles; + ArmBackendExecuteCallbacks ArmBackend_execute_callbacks; // Command stream - we know at this point it's aligned char* data = (char*)execution_handle->processed->data(); ET_LOG(Info, "ArmBackend::execute %p", data); @@ -95,7 +111,7 @@ class ArmBackend final : public PyTorchBackendInterface { ET_LOG( Debug, - "ArmBackend::execute: Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", + "ArmBackend::execute: Running program data:\n cmd %p %zu\n weight %p %zu\n scratch %p %zu\n", handles.cmd_data, handles.cmd_data_size, handles.weight_data, @@ -108,7 +124,6 @@ class ArmBackend final : public PyTorchBackendInterface { // or DRAM output for compatible data layouts. 
for (int i = 0; i < handles.inputs->count; i++) { auto tensor_in = args[i]->toTensor(); - VelaIO* scratch_in = &handles.inputs->io[i]; char* scratch_addr = handles.scratch_data + handles.inputs->io[i].offset; // We accept: @@ -124,9 +139,9 @@ class ArmBackend final : public PyTorchBackendInterface { if (!supported) { ET_LOG( Error, - "Input %d expected Integer (4 byte) or Char (1 byte) integer inputs, got ScalarType id %d", + "Input %d expected Integer (4 byte) or Char (1 byte) integer inputs, got ScalarType id %s", i, - tensor_in.scalar_type()); + toString(tensor_in.scalar_type())); return Error::InvalidProgram; } @@ -148,8 +163,9 @@ class ArmBackend final : public PyTorchBackendInterface { if (both_char and permuted_input_shape) { // permuted byte copy CHW to HWC permute_CHW_to_HWC( - scratch_addr, tensor_in.mutable_data_ptr(), + scratch_addr, + tensor_in.size(1), tensor_in.size(2), tensor_in.size(3)); } else if (both_char or both_int) { @@ -165,8 +181,10 @@ class ArmBackend final : public PyTorchBackendInterface { } // Allocate driver handle and synchronously invoke driver - ethosu_driver* drv = ethosu_reserve_driver(); - if (drv == NULL) { + auto driver = + std::unique_ptr( + ethosu_reserve_driver(), ethosu_release_driver); + if (driver == NULL) { ET_LOG(Error, "ArmBackend::execute: ethosu_reserve_driver failed"); return Error::InvalidState; } @@ -179,7 +197,7 @@ class ArmBackend final : public PyTorchBackendInterface { size_t bases_size[2] = { handles.weight_data_size, handles.scratch_data_size}; int result = ethosu_invoke_v3( - drv, + driver.get(), (void*)handles.cmd_data, handles.cmd_data_size, bases, @@ -202,17 +220,34 @@ class ArmBackend final : public PyTorchBackendInterface { // Process input EValue into scratch // Outputs are in the index immediately after inputs auto tensor_out = args[handles.inputs->count + i]->toTensor(); - for (int j = 0; j < tensor_out.numel(); j++) { - if (tensor_out.scalar_type() == ScalarType::Char) { - char* output_address = (char*)output_addr; - tensor_out.mutable_data_ptr()[j] = output_address[j]; - } else { - int* output_address = (int*)output_addr; - tensor_out.mutable_data_ptr()[j] = output_address[j]; + bool permuted_output_shape; + ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( + i, + tensor_out, + &handles.outputs->io[i], + execution_handle->permuted_io_flag, + &permuted_output_shape)); + if (tensor_out.scalar_type() == ScalarType::Char and + permuted_output_shape) { + char* output_address = (char*)output_addr; + permute_HWC_to_CHW( + output_address, + tensor_out.mutable_data_ptr(), + tensor_out.size(1), + tensor_out.size(2), + tensor_out.size(3)); + } else { + for (int j = 0; j < tensor_out.numel(); j++) { + if (tensor_out.scalar_type() == ScalarType::Char) { + char* output_address = (char*)output_addr; + tensor_out.mutable_data_ptr()[j] = output_address[j]; + } else { + int* output_address = (int*)output_addr; + tensor_out.mutable_data_ptr()[j] = output_address[j]; + } } } } - return Error::Ok; } @@ -223,51 +258,71 @@ class ArmBackend final : public PyTorchBackendInterface { private: Error check_requires_permute( int index, - const exec_aten::Tensor tensor_in, - VelaIO* input, + const exec_aten::Tensor tensor, + VelaIO* io, bool permuted_io_flag, bool* is_permuted) const { - bool permuted_input_shape = false; - if (tensor_in.dim() == 4) { + bool permuted_shape = false; + if (tensor.dim() == 4) { // special case for NHWC workaround in AOT; as the compilation has // permuted to channel last in an undetectable way, we assume here - // that 
the application has similarly permuted any input tensors. - permuted_input_shape = tensor_in.size(0) == input->shape[0] && - tensor_in.size(1) == input->shape[3] && - tensor_in.size(2) == input->shape[1] && - tensor_in.size(3) == input->shape[2]; - if (permuted_input_shape) { - ET_LOG(Info, "Tensor input %d will be permuted", index); + // that the application has similarly permuted any input/output tensors. + permuted_shape = tensor.size(0) == io->shape[0] && + tensor.size(1) == io->shape[3] && tensor.size(2) == io->shape[1] && + tensor.size(3) == io->shape[2]; + if (permuted_shape) { + ET_LOG(Info, "Tensor input/output %d will be permuted", index); } - if (permuted_io_flag != permuted_input_shape) { - ET_LOG(Error, "Permute compile flag and permuted input don't agree"); + if (permuted_io_flag != permuted_shape) { + ET_LOG( + Error, + "Permute compile flag and permuted input/output don't agree"); return Error::InvalidProgram; } } - if (!permuted_input_shape) { - // Error check matching shapes in the general case - for (int i = 0; i < tensor_in.dim(); i++) { - if (tensor_in.size(i) != input->shape[i]) { - ET_LOG(Error, "Tensor input %d mismatched shape", index); - ET_LOG( - Error, - "dimension %d mismatch, %d != %d", - index, - tensor_in.size(i), - input->shape[i]); - return Error::InvalidProgram; - } + if (!permuted_shape) { + // Check the number of elements in each tensor match + int tensor_count = 1; + int io_count = 1; + + for (int i = 0; i < tensor.dim(); i++) { + tensor_count = tensor_count * tensor.size(i); + } + + // The VelaIO type has a shape of fixed size 4 + for (int i = 0; i < 4; i++) { + io_count = io_count * io->shape[i]; + } + + if (tensor_count != io_count) { + ET_LOG(Error, "Input tensor sizes do not match"); + ET_LOG( + Error, + "Program expects %d elements but got %d", + io_count, + tensor_count); + return Error::InvalidProgram; } } - *is_permuted = permuted_input_shape; + *is_permuted = permuted_shape; return Error::Ok; } - void permute_CHW_to_HWC(char* input, char* output, int H, int W) const { + void permute_CHW_to_HWC(char* input, char* output, int C, int H, int W) + const { + for (int i = 0; i != H * W; ++i) { + for (int j = 0; j < C; ++j) { + output[i * C + j] = input[i + j * W * H]; + } + } + } + + void permute_HWC_to_CHW(char* input, char* output, int C, int H, int W) + const { for (int i = 0; i != H * W; ++i) { - output[i * 3 + 0] = input[i + 0 * W * H]; - output[i * 3 + 1] = input[i + 1 * W * H]; - output[i * 3 + 2] = input[i + 2 * W * H]; + for (int j = 0; j < C; ++j) { + output[i + j * W * H] = input[i * C + j]; + } } } }; diff --git a/backends/arm/runtime/TARGETS b/backends/arm/runtime/TARGETS new file mode 100644 index 0000000000..67f2bab681 --- /dev/null +++ b/backends/arm/runtime/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/arm/runtime/VelaBinStream.cpp b/backends/arm/runtime/VelaBinStream.cpp index f9f8b0aca1..e2badbbd9f 100644 --- a/backends/arm/runtime/VelaBinStream.cpp +++ b/backends/arm/runtime/VelaBinStream.cpp @@ -12,9 +12,8 @@ #include -#include - -#include +#include "executorch/backends/arm/runtime/VelaBinStream.h" +#include "executorch/runtime/core/error.h" // get next mul of 16 ptr, return n if already aligned static uintptr_t next_mul_16(uintptr_t n) { @@ -25,17 +24,26 @@ bool vela_bin_validate(const char* data, int size) { const char* foot = data + size - sizeof(VelaBinBlock); // Check 16 byte alignment - if ((uintptr_t)data != 
next_mul_16((uintptr_t)data)) - return false; - if ((uintptr_t)foot != next_mul_16((uintptr_t)foot)) - return false; + bool valid = true; + if ((uintptr_t)data != next_mul_16((uintptr_t)data)) { + ET_LOG(Error, "Vela bin ptr not aligned to 16 bytes: %p", (void*)data); + valid = false; + } + if ((uintptr_t)foot != next_mul_16((uintptr_t)foot)) { + ET_LOG(Error, "End of vela bin not aligned to 16 bytes: %p", (void*)foot); + valid = false; + } // Check header and footer blocks are the right format - if (strncmp(data, "vela_bin_stream", strlen("vela_bin_stream")) != 0) - return false; - if (strncmp(foot, "vela_end_stream", strlen("vela_end_stream")) != 0) - return false; + if (strncmp(data, "vela_bin_stream", strlen("vela_bin_stream")) != 0) { + ET_LOG(Error, "Incorrect header in vela_bin_stream"); + valid = false; + } + if (strncmp(foot, "vela_end_stream", strlen("vela_end_stream")) != 0) { + ET_LOG(Error, "Incorrect footer in vela_bin_stream"); + valid = false; + } - return true; + return valid; } bool vela_bin_read(const char* data, VelaHandles* handles, int size) { diff --git a/backends/arm/runtime/VelaBinStream.h b/backends/arm/runtime/VelaBinStream.h index 51e4cd3452..e946078f5a 100644 --- a/backends/arm/runtime/VelaBinStream.h +++ b/backends/arm/runtime/VelaBinStream.h @@ -15,6 +15,7 @@ #pragma once +#include #include // Standard block name size diff --git a/backends/arm/runtime/targets.bzl b/backends/arm/runtime/targets.bzl new file mode 100644 index 0000000000..da401daf0b --- /dev/null +++ b/backends/arm/runtime/targets.bzl @@ -0,0 +1,31 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "vela_bin_stream", + srcs = ["VelaBinStream.cpp"], + exported_headers = ["VelaBinStream.h"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/runtime/core:core", + ], + ) + runtime.cxx_library( + name = "arm_backend", + srcs = ["ArmBackendEthosU.cpp"], + headers = [], + compatible_with = ["ovr_config//cpu:arm32-embedded"], + # arm_executor_runner.cpp needs to compile with executor as whole + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + # Constructor needed for backend registration. + compiler_flags = ["-Wno-global-constructors"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/runtime/backend:interface", + ":vela_bin_stream", + "//executorch/runtime/core:core", + "fbsource//third-party/ethos-u-core-driver:core_driver", + ], + ) diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index f85fd1f2da..0d50f1882d 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -14,6 +14,7 @@ import torch from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder +from executorch.exir.backend.compile_spec_schema import CompileSpec _enabled_options: list[str] = [] @@ -85,7 +86,9 @@ def is_option_enabled(option: str, fail_if_not_enabled: bool = False) -> bool: return False -def get_tosa_compile_spec(permute_memory_to_nhwc=True, custom_path=None): +def get_tosa_compile_spec( + permute_memory_to_nhwc=True, custom_path=None +) -> list[CompileSpec]: """ Default compile spec for TOSA tests. 
""" @@ -112,8 +115,8 @@ def get_tosa_compile_spec_unbuilt( def get_u55_compile_spec( - permute_memory_to_nhwc=False, quantize_io=False, custom_path=None -): + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: """ Default compile spec for Ethos-U55 tests. """ @@ -122,10 +125,21 @@ def get_u55_compile_spec( ).build() +def get_u85_compile_spec( + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: + """ + Default compile spec for Ethos-U85 tests. + """ + return get_u85_compile_spec_unbuilt( + permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path + ).build() + + def get_u55_compile_spec_unbuilt( - permute_memory_to_nhwc=False, quantize_io=False, custom_path=None + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None ) -> ArmCompileSpecBuilder: - """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify + """Get the ArmCompileSpecBuilder for the Ethos-U55 tests, to modify the compile spec before calling .build() to finalize it. """ artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u55_") @@ -137,7 +151,29 @@ def get_u55_compile_spec_unbuilt( "ethos-u55-128", system_config="Ethos_U55_High_End_Embedded", memory_mode="Shared_Sram", - extra_flags=None, + extra_flags="--debug-force-regor --output-format=raw", + ) + .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) + .set_permute_memory_format(permute_memory_to_nhwc) + .dump_intermediate_artifacts_to(artifact_path) + ) + return compile_spec + + +def get_u85_compile_spec_unbuilt( + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: + """Get the ArmCompileSpecBuilder for the Ethos-U85 tests, to modify + the compile spec before calling .build() to finalize it. + """ + artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u85_") + compile_spec = ( + ArmCompileSpecBuilder() + .ethosu_compile_spec( + "ethos-u85-128", + system_config="Ethos_U85_SYS_DRAM_Mid", + memory_mode="Shared_Sram", + extra_flags="--output-format=raw", ) .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) .set_permute_memory_format(permute_memory_to_nhwc) diff --git a/backends/arm/test/misc/test_lifted_tensor.py b/backends/arm/test/misc/test_lifted_tensor.py new file mode 100644 index 0000000000..90aa7e2950 --- /dev/null +++ b/backends/arm/test/misc/test_lifted_tensor.py @@ -0,0 +1,42 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + + +class LiftedTensor(torch.nn.Module): + + def __init__(self): + super().__init__() + self.lifted_tensor = torch.Tensor([[1, 2], [3, 4]]) + + def forward(self, x: torch.Tensor, length) -> torch.Tensor: + sliced = self.lifted_tensor[:, :length] + return sliced + x + + +class TestLiftedTensor(unittest.TestCase): + """Tests the ArmPartitioner with a placeholder of type lifted tensor.""" + + def test_partition_lifted_tensor(self): + tester = ( + ArmTester( + LiftedTensor(), + example_inputs=(torch.ones(2, 2), 2), + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .to_edge() + .dump_artifact() + ) + signature = tester.get_artifact().exported_program().graph_signature + assert len(signature.lifted_tensor_constants) > 0 + tester.partition() + tester.to_executorch() + tester.run_method_and_compare_outputs((torch.ones(2, 2), 2)) diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index 248153a518..bb2c0d103f 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -84,7 +84,7 @@ def test_mv2_tosa_BI(self): ) def test_mv2_u55_BI(self): - ( + tester = ( ArmTester( self.mv2, example_inputs=self.model_inputs, @@ -96,4 +96,9 @@ def test_mv2_u55_BI(self): .check(list(self.operators_after_quantization)) .partition() .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs( + atol=1.0, qtol=1, inputs=self.model_inputs + ) diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 3bd2b2605c..63023327f7 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -37,9 +37,9 @@ class Add2(torch.nn.Module): torch.FloatTensor([1, 2, 3, 5, 7]), (torch.FloatTensor([2, 1, 2, 1, 10])), ), - (torch.ones(1, 1, 4, 4), torch.ones(1, 1, 4, 4)), + (torch.ones(1, 10, 4, 6), torch.ones(1, 10, 4, 6)), (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)), - (torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), + (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)), (10000 * torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), ] @@ -101,7 +101,7 @@ def _test_add_u55_BI_pipeline( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py new file mode 100644 index 0000000000..30f4526124 --- /dev/null +++ b/backends/arm/test/ops/test_bmm.py @@ -0,0 +1,135 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized + +torch.manual_seed(1) + + +class TestBMM(unittest.TestCase): + """Tests Batch MatMul""" + + class BMM(torch.nn.Module): + test_parameters = [ + (torch.rand(5, 3, 5), torch.rand(5, 5, 2)), + (torch.rand(2, 1, 1), torch.rand(2, 1, 1)), + (torch.ones(1, 55, 3), torch.ones(1, 3, 44)), + (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)), + (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)), + ] + + def forward(self, x, y): + return torch.bmm(x, y) + + class BMMSingleInput(torch.nn.Module): + test_parameters = [ + (torch.rand(20, 3, 3),), + (torch.ones(2, 128, 128),), + (10000 * torch.randn(4, 25, 25),), + (5 + 5 * torch.randn(3, 64, 64),), + ] + + def forward(self, x): + return torch.bmm(x, x) + + def _test_bmm_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_bmm_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_bmm_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] 
+ ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_MI_pipeline(self.BMM(), test_data) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_tosa_MI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_tosa_MI_pipeline(self.BMMSingleInput(), test_data) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + + # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy + @parameterized.expand(BMMSingleInput.test_parameters) + @unittest.expectedFailure + def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_u55_BI_pipeline(self.BMMSingleInput(), test_data) diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py new file mode 100644 index 0000000000..a40ae43b67 --- /dev/null +++ b/backends/arm/test/ops/test_cat.py @@ -0,0 +1,133 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
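As a standalone reminder of the operator the new test_bmm.py above exercises (illustrative only, not part of the patch): torch.bmm contracts the inner dimensions batch-wise, and the single-input variant multiplies a square batch with itself.

    import torch

    x = torch.rand(5, 3, 5)          # (batch, n, k) -- first BMM test case above
    y = torch.rand(5, 5, 2)          # (batch, k, m)
    assert torch.bmm(x, y).shape == (5, 3, 2)

    z = torch.rand(20, 3, 3)         # square per-batch matrices
    assert torch.bmm(z, z).shape == (20, 3, 3)   # BMMSingleInput case
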
+ +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized + + +class TestCat(unittest.TestCase): + + class Cat(torch.nn.Module): + test_parameters = [ + ((torch.ones(1), torch.ones(1)), 0), + ((torch.ones(1, 2), torch.randn(1, 5), torch.randn(1, 1)), 1), + ( + ( + torch.ones(1, 2, 5), + torch.randn(1, 2, 4), + torch.randn(1, 2, 2), + torch.randn(1, 2, 1), + ), + -1, + ), + ((torch.randn(2, 2, 4, 4), torch.randn(2, 2, 4, 1)), 3), + ( + ( + 10000 * torch.randn(2, 3, 1, 4), + torch.randn(2, 7, 1, 4), + torch.randn(2, 1, 1, 4), + ), + -3, + ), + ] + + def __init__(self): + super().__init__() + + def forward(self, tensors: tuple[torch.Tensor, ...], dim: int) -> torch.Tensor: + return torch.cat(tensors, dim=dim) + + def _test_cat_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_cat_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_cat_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Cat.test_parameters) + def test_cat_tosa_MI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_tosa_MI_pipeline(self.Cat(), test_data) + + def test_cat_4d_tosa_MI(self): + square = torch.ones((2, 2, 2, 2)) + for dim in range(-3, 3): + test_data = ((square, square), dim) + self._test_cat_tosa_MI_pipeline(self.Cat(), test_data) + + @parameterized.expand(Cat.test_parameters) + def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_tosa_BI_pipeline(self.Cat(), test_data) + + # TODO: Remove @unittest.expectedFailure when this issue is fixed in Regor + @parameterized.expand(Cat.test_parameters) + @unittest.expectedFailure + def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_u55_BI_pipeline(self.Cat(), test_data) diff --git 
a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv.py index 9ebfe77da2..8274879953 100644 --- a/backends/arm/test/ops/test_conv.py +++ b/backends/arm/test/ops/test_conv.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import List, Tuple, Union @@ -15,9 +14,6 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - class Conv2d(torch.nn.Module): """ @@ -159,14 +155,14 @@ def forward(self, x): batches=1, ) -conv2d_2x2_1x1x14x14_st2 = Conv2d( +conv2d_2x2_1x1x14x13_st2 = Conv2d( in_channels=1, out_channels=1, kernel_size=(2, 2), stride=2, padding=0, width=14, - height=14, + height=13, batches=1, ) @@ -192,6 +188,18 @@ def forward(self, x): batches=1, ) +conv2d_5x5_1x3x14x15_st3_pd1 = Conv2d( + in_channels=3, + out_channels=16, + kernel_size=(5, 5), + stride=3, + padding=1, + width=14, + height=15, + batches=1, +) + + two_conv2d_nobias = Conv2d( nbr_conv=2, width=256, @@ -225,7 +233,8 @@ def forward(self, x): ("3x3_1x3x256x256_st1", conv2d_3x3_1x3x256x256_st1), ("3x3_1x3x12x12_st2_pd1", conv2d_3x3_1x3x12x12_st2_pd1), ("1x1_1x2x128x128_st1", conv2d_1x1_1x2x128x128_st1), - ("2x2_1x1x14x14_st2", conv2d_2x2_1x1x14x14_st2), + ("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2), + ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1), ("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1), ("3x3_1x3x224x224_st2_pd1", conv2d_3x3_1x3x224x224_st2_pd1), ("two_conv2d_nobias", two_conv2d_nobias), @@ -240,7 +249,10 @@ def forward(self, x): testsuite_u55.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1)) # Fails when enabling CompileSpec.set_quantize_io(True). MLETORCH-191. 
-testsuite_u55.remove(("2x2_1x1x14x14_st2", conv2d_2x2_1x1x14x14_st2)) +testsuite_u55.remove(("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2)) +testsuite_u55.remove( + ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1) +) class TestConv2D(unittest.TestCase): diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 88006df1a0..31051ef8f7 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -102,7 +102,7 @@ def forward(self, x): return self.adaptive_avg_pool2d(x) -class ComboConvBatchnormRelu(torch.nn.Module): +class ComboConvBatchnormRelu6(torch.nn.Module): edge_op_list = [ "executorch_exir_dialects_edge__ops_aten_convolution_default", "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default", @@ -235,16 +235,16 @@ def test_conv_meandim_u55_BI(self): ############################## ## Conv + batch norm + relu ## ############################## - def test_conv_batchnorm_relu_tosa_MI(self): - model = ComboConvBatchnormRelu() + def test_conv_batchnorm_relu6_tosa_MI(self): + model = ComboConvBatchnormRelu6() self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs()) - def test_conv_batchnorm_relu_tosa_BI(self): - model = ComboConvBatchnormRelu() + def test_conv_batchnorm_relu6_tosa_BI(self): + model = ComboConvBatchnormRelu6() self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs()) - def test_conv_batchnorm_relu_u55_BI(self): - model = ComboConvBatchnormRelu() + def test_conv_batchnorm_relu6_u55_BI(self): + model = ComboConvBatchnormRelu6() self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) ################## diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 9b3f79e6a1..5545a5ce4a 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -130,6 +130,11 @@ # Fails when enabling CompileSpec.set_quantize_io(True). MLETORCH-191. testsuite_u55.remove(("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1)) +# Add failing test (set_quantize_io=True) temporarily to investigate +testsuite_u55.append( + ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1, True) +) + class TestDepthwiseConv2D(unittest.TestCase): """Tests Conv2D where groups == in_channels and out_channels = K * in_channels. 
This @@ -173,13 +178,18 @@ def _test_dw_conv2d_tosa_BI_pipeline( ) def _test_dw_conv2d_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + self, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], + set_quantize_io: bool = False, ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=common.get_u55_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), ) .quantize() .export() @@ -202,5 +212,7 @@ def test_dw_conv2d_tosa_BI(self, test_name, model): @parameterized.expand(testsuite_u55, skip_on_empty=True) @unittest.expectedFailure - def test_dw_conv2d_u55_BI(self, test_name, model): - self._test_dw_conv2d_u55_BI_pipeline(model, model.get_inputs()) + def test_dw_conv2d_u55_BI(self, test_name, model, set_quantize_io=False): + self._test_dw_conv2d_u55_BI_pipeline( + model, model.get_inputs(), set_quantize_io=set_quantize_io + ) diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py new file mode 100644 index 0000000000..4f4935d482 --- /dev/null +++ b/backends/arm/test/ops/test_exp.py @@ -0,0 +1,108 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized + +test_data_suite = [ + # (test_name, test_data) + ("zeros", torch.zeros(1, 10, 10, 10)), + ("ones", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) - 0.5), + ("randn_pos", torch.randn(10) + 10), + ("randn_neg", torch.randn(10) - 10), + ("ramp", torch.arange(-16, 16, 0.2)), +] + + +class TestExp(unittest.TestCase): + """Tests lowering of aten.exp""" + + class Exp(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.exp(x) + + def _test_exp_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.exp.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_exp_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check(["torch.ops.aten.exp.default"]) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_exp_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.exp.default": 1}) + 
.check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_exp_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_exp_tosa_MI_pipeline(self.Exp(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_exp_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_exp_tosa_BI_pipeline(self.Exp(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_exp_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_exp_tosa_u55_BI_pipeline(self.Exp(), (test_data,)) diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py new file mode 100644 index 0000000000..66c081a544 --- /dev/null +++ b/backends/arm/test/ops/test_expand.py @@ -0,0 +1,109 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# +# Tests the expand op which copies the data of the input tensor (possibly with new data format) +# + +import unittest +from typing import Sequence, Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +class TestSimpleExpand(unittest.TestCase): + """Tests the Tensor.expand which should be converted to a repeat op by a pass.""" + + class Expand(torch.nn.Module): + # (input tensor, multiples) + test_parameters = [ + (torch.ones(1), (2,)), + (torch.ones(1, 4), (1, -1)), + (torch.ones(1, 1, 2, 2), (4, 3, -1, 2)), + (torch.ones(1), (2, 2, 4)), + (torch.ones(3, 2, 4, 1), (-1, -1, -1, 3)), + ] + + def forward(self, x: torch.Tensor, multiples: Sequence): + return x.expand(multiples) + + def _test_expand_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.expand.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.expand.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_expand_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.expand.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.expand.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_expand_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + 
compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.expand.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.expand.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Expand.test_parameters) + def test_expand_tosa_MI(self, test_input, multiples): + self._test_expand_tosa_MI_pipeline(self.Expand(), (test_input, multiples)) + + @parameterized.expand(Expand.test_parameters) + def test_expand_tosa_BI(self, test_input, multiples): + self._test_expand_tosa_BI_pipeline(self.Expand(), (test_input, multiples)) + + # Expected failure since tosa.TILE is unsupported by Vela. + @parameterized.expand(Expand.test_parameters) + @unittest.expectedFailure + def test_expand_u55_BI(self, test_input, multiples): + self._test_expand_tosa_u55_pipeline(self.Expand(), (test_input, multiples)) diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 33f62955ec..6fdbb2127e 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -26,17 +26,17 @@ ( "model_linear_rank1_zeros", torch.zeros(10), - 10, + 15, ), ( "model_linear_rank1_ones", torch.ones(10), - 10, + 15, ), ( "model_linear_rank1_negative_ones", torch.ones(10) * (-1), - 10, + 20, ), ( "model_linear_rank1_rand", @@ -46,12 +46,12 @@ ( "model_linear_rank1_negative_large_rand", torch.rand(10) * (-100), - 10, + 30, ), ( "model_linear_rank1_large_randn", - torch.randn(10) * 100, - 10, + torch.randn(15) * 100, + 20, ), ] diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py new file mode 100644 index 0000000000..90066b3a63 --- /dev/null +++ b/backends/arm/test/ops/test_log.py @@ -0,0 +1,108 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
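The new test_expand.py above relies on Tensor.expand being rewritten to a repeat by an Arm pass; a standalone check of the value equivalence that rewrite depends on (illustrative only, not part of the patch):

    import torch

    x = torch.randn(1, 4)
    expanded = x.expand(3, -1)   # broadcast view of the singleton dim, no copy
    repeated = x.repeat(3, 1)    # explicit copy with multiples (3, 1)
    assert torch.equal(expanded, repeated)
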
+ +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized + +test_data_suite = [ + # (test_name, test_data) + ("ones_rank4", torch.ones(1, 10, 10, 10)), + ("ones_rank3", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) + 0.001), + ("randn_pos", torch.randn(10) + 10), + ("randn_spread", torch.max(torch.Tensor([0.0]), torch.randn(10) * 100)), + ("ramp", torch.arange(0.01, 20, 0.2)), +] + + +class TestLog(unittest.TestCase): + """Tests lowering of aten.log""" + + class Log(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.log(x) + + def _test_log_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.log.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_log_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check(["torch.ops.aten.log.default"]) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_log_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.log.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_log_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_log_tosa_MI_pipeline(self.Log(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_log_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_tosa_BI_pipeline(self.Log(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_log_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_tosa_u55_BI_pipeline(self.Log(), (test_data,)) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e0db958f74..e48d749c19 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -106,7 +106,12 @@ def _test_meandim_tosa_u55_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mean_dim", + "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default", + ] + ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) 
.to_executorch() ) diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py new file mode 100644 index 0000000000..9a9b3ef579 --- /dev/null +++ b/backends/arm/test/ops/test_mm.py @@ -0,0 +1,141 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +torch.manual_seed(0) + + +class TestMM(unittest.TestCase): + """Tests MatMul""" + + class MM(torch.nn.Module): + test_parameters = [ + (torch.rand(3, 5), torch.rand(5, 2)), + (torch.rand(1, 1), torch.rand(1, 1)), + (torch.ones(55, 3), torch.ones(3, 44)), + (10000 * torch.randn(1, 10), torch.randn(10, 5)), + (-10 * torch.randn(32, 64), 5 + 5 * torch.randn(64, 32)), + ] + + def forward(self, x, y): + return torch.mm(x, y) + + class MMSingleInput(torch.nn.Module): + test_parameters = [ + (torch.rand(3, 3),), + (torch.ones(128, 128),), + (10000 * torch.randn(25, 25),), + (5 + 5 * torch.randn(64, 64),), + ] + + def forward(self, x): + return torch.mm(x, x) + + def _test_mm_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.mm.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_mm_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_mm_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.mm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_mm_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_mm_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.mm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(MM.test_parameters) + def test_mm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_mm_tosa_MI_pipeline(self.MM(), test_data) + + @parameterized.expand(MMSingleInput.test_parameters) + def test_mm_single_input_tosa_MI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_mm_tosa_MI_pipeline(self.MMSingleInput(), test_data) + + @parameterized.expand(MM.test_parameters) + def test_mm_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data 
= (operand1, operand2) + self._test_mm_tosa_BI_pipeline(self.MM(), test_data) + + @parameterized.expand(MMSingleInput.test_parameters) + def test_mm_single_input_tosa_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_mm_tosa_BI_pipeline(self.MMSingleInput(), test_data) + + # Expected to fail with error: CPU performance estimation for "MatMul" not implemented + @parameterized.expand(MM.test_parameters) + @unittest.expectedFailure + def test_mm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_mm_u55_BI_pipeline(self.MM(), test_data) + + # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy + @parameterized.expand(MMSingleInput.test_parameters) + @unittest.expectedFailure + def test_mm_single_input_u55_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_mm_u55_BI_pipeline(self.MMSingleInput(), test_data) diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py new file mode 100644 index 0000000000..2aac3f22f1 --- /dev/null +++ b/backends/arm/test/ops/test_mul.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized + +test_data_sute = [ + # (test_name, input, other,) See torch.mul() for info + ( + "op_mul_rank1_ones", + torch.ones(5), + torch.ones(5), + ), + ( + "op_mul_rank2_rand", + torch.rand(4, 5), + torch.rand(1, 5), + ), + ( + "op_mul_rank3_randn", + torch.randn(10, 5, 2), + torch.randn(10, 5, 2), + ), + ( + "op_mul_rank4_randn", + torch.randn(5, 10, 25, 20), + torch.randn(5, 10, 25, 20), + ), + ( + "op_mul_rank4_ones_mul_negative", + torch.ones(1, 10, 25, 20), + (-1) * torch.ones(5, 10, 25, 20), + ), + ( + "op_mul_rank4_negative_large_rand", + (-200) * torch.rand(5, 10, 25, 20), + torch.rand(5, 1, 1, 20), + ), + ( + "op_mul_rank4_large_randn", + 200 * torch.randn(5, 10, 25, 20), + torch.rand(5, 10, 25, 1), + ), +] + + +class TestMul(unittest.TestCase): + class Mul(torch.nn.Module): + + def forward( + self, + input_: torch.Tensor, + other_: torch.Tensor, + ): + return input_ * other_ + + def _test_mul_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), + ) + .export() + .check_count({"torch.ops.aten.mul.Tensor": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_mul_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=True), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.mul.Tensor": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + 
.to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1.0) + ) + + def _test_mul_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.mul.Tensor": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_sute) + def test_mul_tosa_MI( + self, + test_name: str, + input_: torch.Tensor, + other_: torch.Tensor, + ): + test_data = (input_, other_) + self._test_mul_tosa_MI_pipeline(self.Mul(), test_data) + + @parameterized.expand(test_data_sute) + def test_mul_tosa_BI( + self, + test_name: str, + input_: torch.Tensor, + other_: torch.Tensor, + ): + + test_data = (input_, other_) + self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) + + @parameterized.expand(test_data_sute) + def test_mul_u55_BI( + self, + test_name: str, + input_: torch.Tensor, + other_: torch.Tensor, + ): + test_data = (input_, other_) + self._test_mul_u55_BI_pipeline(self.Mul(), test_data) diff --git a/backends/arm/test/ops/test_relu.py b/backends/arm/test/ops/test_relu.py new file mode 100644 index 0000000000..d2ca8540f4 --- /dev/null +++ b/backends/arm/test/ops/test_relu.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +test_data_suite = [ + # (test_name, test_data) + ("zeros", torch.zeros(1, 10, 10, 10)), + ("ones", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) - 0.5), + ("randn_pos", torch.randn(10) + 10), + ("randn_neg", torch.randn(10) - 10), + ("ramp", torch.arange(-16, 16, 0.2)), +] + + +class TestRelu(unittest.TestCase): + class Relu(torch.nn.Module): + def __init__(self): + super().__init__() + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + def _test_relu_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.relu.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_relu_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) 
+ .export() + .check_count({"torch.ops.aten.relu.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_relu_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.relu.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_relu_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_relu_tosa_MI_pipeline(self.Relu(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_relu_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_relu_tosa_BI_pipeline(self.Relu(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_relu_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_relu_tosa_u55_BI_pipeline(self.Relu(), (test_data,)) diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py new file mode 100644 index 0000000000..a6fad03345 --- /dev/null +++ b/backends/arm/test/ops/test_repeat.py @@ -0,0 +1,110 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
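The test_repeat.py file that follows exercises Tensor.repeat, including cases with more multiples than input dimensions; a quick standalone reminder of that semantic (not part of the patch):

    import torch

    x = torch.randn(3)
    assert x.repeat(2).shape == (6,)           # same rank: tile along dim 0
    assert x.repeat(2, 2).shape == (2, 6)      # extra leading multiple adds a new dim
    assert x.repeat(1, 2, 3).shape == (1, 2, 9)
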
+ +# +# Tests the repeat op which copies the data of the input tensor (possibly with new data format) +# + +import unittest +from typing import Sequence, Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +class TestSimpleRepeat(unittest.TestCase): + """Tests Tensor.repeat for different ranks and dimensions.""" + + class Repeat(torch.nn.Module): + # (input tensor, multiples) + test_parameters = [ + (torch.randn(3), (2,)), + (torch.randn(3, 4), (2, 1)), + (torch.randn(1, 1, 2, 2), (1, 2, 3, 4)), + (torch.randn(3), (2, 2)), + (torch.randn(3), (1, 2, 3)), + (torch.randn((3, 3)), (2, 2, 2)), + ] + + def forward(self, x: torch.Tensor, multiples: Sequence): + return x.repeat(multiples) + + def _test_repeat_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.repeat.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.repeat.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_repeat_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.repeat.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.repeat.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_repeat_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.repeat.default": 1}) + .to_edge() + .partition() + .check_not(["torch.ops.aten.repeat.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Repeat.test_parameters) + def test_repeat_tosa_MI(self, test_input, multiples): + self._test_repeat_tosa_MI_pipeline(self.Repeat(), (test_input, multiples)) + + @parameterized.expand(Repeat.test_parameters) + def test_repeat_tosa_BI(self, test_input, multiples): + self._test_repeat_tosa_BI_pipeline(self.Repeat(), (test_input, multiples)) + + # Expected failure since tosa.TILE is unsupported by Vela. 
+ @parameterized.expand(Repeat.test_parameters) + @unittest.expectedFailure + def test_repeat_u55_BI(self, test_input, multiples): + self._test_repeat_tosa_u55_pipeline(self.Repeat(), (test_input, multiples)) diff --git a/backends/arm/test/ops/test_sigmoid.py b/backends/arm/test/ops/test_sigmoid.py index 7a0435689f..369019774f 100644 --- a/backends/arm/test/ops/test_sigmoid.py +++ b/backends/arm/test/ops/test_sigmoid.py @@ -145,8 +145,6 @@ def test_sigmoid_add_sigmoid_tosa_BI(self): self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1]) ) - # Fails due to Vela diff from Tosa spec, expected to work with Regor. @parameterized.expand(test_data_suite) - @unittest.expectedFailure def test_sigmoid_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): self._test_sigmoid_tosa_u55_BI_pipeline(self.Sigmoid(), (test_data,)) diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py index a1c1e29cbc..14874df156 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -33,7 +33,7 @@ def forward(self, x: torch.Tensor): elif x.dim() == 3: return x[0:7, 0:1, 0:8] elif x.dim() == 4: - return x[:, 2:5, 3:5, 4:5] + return x[:, 2:5, 3:5, 4:10] def _test_slice_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: torch.Tensor diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index b3b6230daa..20da65b687 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Tuple @@ -15,15 +14,17 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) test_data_suite = [ # (test_name, test_data, dim) - ("zeros", torch.zeros(10, 10, 10, 10), 1), + ("zeros", torch.zeros(10, 10, 10, 10), 0), + ("zeros_neg_dim", torch.zeros(10, 10, 10, 10), -4), ("ones", torch.ones(10, 10, 10, 10), 1), + ("ones_neg_dim", torch.ones(10, 10, 10, 10), -1), ("rand", torch.rand(10, 10, 10, 10), 2), + ("rand_neg_dim", torch.rand(10, 10, 10, 10), -2), ("randn", torch.randn(10, 10, 10, 10), 3), + ("randn_neg_dim", torch.randn(10, 10, 10, 10), -3), ] diff --git a/backends/arm/test/ops/test_split.py b/backends/arm/test/ops/test_split.py new file mode 100644 index 0000000000..bc998179c0 --- /dev/null +++ b/backends/arm/test/ops/test_split.py @@ -0,0 +1,139 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
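The *_neg_dim cases added to test_softmax.py above only change how the axis is addressed; a standalone check that negative and positive dims select the same axis for the rank-4 inputs used there (not part of the patch):

    import torch

    x = torch.randn(10, 10, 10, 10)
    assert torch.equal(torch.softmax(x, dim=-1), torch.softmax(x, dim=3))
    assert torch.equal(torch.softmax(x, dim=-4), torch.softmax(x, dim=0))
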
+ +import unittest + +import torch +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + +test_data_t = tuple[torch.Tensor, int | list[int], int] + + +class TestSimpleSplit(unittest.TestCase): + class Split(torch.nn.Module): + + test_data: list[tuple[test_data_t]] = [ + ((torch.rand(10), 2, 0),), + ((torch.rand(10, 10), 3, 1),), + ((torch.rand(10, 10), 4, -1),), + ((torch.rand(10, 15, 10), [2, 2, 11], 1),), + ((torch.rand(4, 4, 4, 4), 2, 0),), + ((torch.rand(4, 4, 4, 4), [1, 1, 1, 1], -2),), + ] + + def forward( + self, x: torch.Tensor, split_size_or_sections: int | list[int], dim: int + ): + return x.split(split_size=split_size_or_sections, dim=dim) + + class SplitWithSizes(torch.nn.Module): + def forward(self, x: torch.Tensor, split_sizes: list[int], dim: int): + return x.split_with_sizes(split_sizes=split_sizes, dim=dim) + + class SplitSingleOut(torch.nn.Module): + def forward( + self, x: torch.Tensor, split_size_or_sections: int | list[int], dim: int + ): + return x.split(split_size=split_size_or_sections, dim=dim)[1] + + class SplitTwoOut(torch.nn.Module): + def forward( + self, x: torch.Tensor, split_size_or_sections: int | list[int], dim: int + ): + return x.split(split_size=split_size_or_sections, dim=dim)[1:3] + + def _test_split_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: test_data_t + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .to_edge() + .check( + [ + "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default" + ] + ) + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_split_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: test_data_t + ): + + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_split_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: test_data_t + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check(["torch.ops.aten.split.Tensor"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Split.test_data) + def test_split_tosa_MI(self, test_data: test_data_t): + self._test_split_tosa_MI_pipeline(self.Split(), test_data) + + @parameterized.expand([Split.test_data[3], Split.test_data[5]]) + def test_split_with_sizes_tosa_MI(self, test_data: test_data_t): + assert isinstance(test_data[1], list) + self._test_split_tosa_MI_pipeline(self.SplitWithSizes(), test_data) + + @parameterized.expand(Split.test_data) + def 
test_split_n_out_tosa_MI(self, test_data: test_data_t): + self._test_split_tosa_MI_pipeline(self.SplitSingleOut(), test_data) + self._test_split_tosa_MI_pipeline(self.SplitTwoOut(), test_data) + + @parameterized.expand(Split.test_data) + def test_split_tosa_BI(self, test_data: test_data_t): + self._test_split_tosa_BI_pipeline(self.Split(), test_data) + + # Fails during Vela compilation when trying to use a Tuple as a Named tuple, + # Could be Vela Issue, wait until Regor. + @parameterized.expand(Split.test_data) + @unittest.expectedFailure + def test_split_u55_BI(self, test_data: test_data_t): + self._test_split_u55_BI_pipeline(self.Split(), test_data) diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index 2ae7c3ab36..0a9f159f36 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -104,9 +104,7 @@ def test_sub_tosa_BI(self, test_data: torch.Tensor): test_data = (test_data,) self._test_sub_tosa_BI_pipeline(self.Sub(), test_data) - # Expected to fail since RESCALE cannot be fused with SUB in Vela. @parameterized.expand(Sub.test_parameters) - @unittest.expectedFailure def test_sub_u55_BI(self, test_data: torch.Tensor): test_data = (test_data,) self._test_sub_u55_BI_pipeline(self.Sub(), test_data) diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py new file mode 100644 index 0000000000..6da6a196c0 --- /dev/null +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -0,0 +1,103 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# +# Tests the unsqueeze op which copies the data of the input tensor (possibly with new data format) +# + +import unittest +from typing import Sequence, Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +class TestSimpleUnsqueeze(unittest.TestCase): + class Unsqueeze(torch.nn.Module): + shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 5), (5, 5, 5)] + test_parameters: list[tuple[torch.Tensor]] = [(torch.ones(n),) for n in shapes] + + def forward(self, x: torch.Tensor, dim): + return x.unsqueeze(dim) + + def _test_unsqueeze_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_unsqueeze_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) 
+ .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_unsqueeze_tosa_u55_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_tosa_MI(self, test_tensor: torch.Tensor): + for i in range(-test_tensor.dim() - 1, test_tensor.dim() + 1): + self._test_unsqueeze_tosa_MI_pipeline(self.Unsqueeze(), (test_tensor, i)) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_tosa_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_tosa_BI_pipeline(self.Unsqueeze(), (test_tensor, 0)) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_u55_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_tosa_u55_pipeline(self.Unsqueeze(), (test_tensor, 0)) diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py new file mode 100644 index 0000000000..1cd63e6e52 --- /dev/null +++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py @@ -0,0 +1,75 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import RunPasses + + +class MeanDim(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=[-1, -2], keepdim=True) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class MeanDim2(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=1) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class TestMeandimToAveragePool2dPass(unittest.TestCase): + """ + Tests the MeanDimToAveragePool2dPass which converts mean.dim to average_pool2d + for the special case where dim is [-1, -2] and keepdim is True. 
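+
+    Illustrative equivalence exercised by the pass (a sketch only, assuming a
+    4D NCHW tensor `x`; not part of the pass implementation):
+
+        torch.mean(x, dim=[-1, -2], keepdim=True)
+        # produces the same values as
+        torch.nn.functional.avg_pool2d(x, kernel_size=x.shape[-2:])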
+ """ + + def test_tosa_BI_meandim_to_averagepool(self): + module = MeanDim() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) + + def test_tosa_BI_meandim_no_modification(self): + module = MeanDim2() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 58c99a9201..6e8b9b25ed 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -10,6 +10,7 @@ import subprocess import tempfile +from pathlib import Path from typing import Dict, List, Optional, Tuple import numpy as np @@ -202,7 +203,7 @@ def set_timeout(self, timeout: int): def run_corstone300( self, inputs: Tuple[torch.Tensor], - ) -> torch.Tensor: + ) -> list[torch.Tensor]: assert ( self._has_init_run @@ -265,20 +266,23 @@ def run_corstone300( raise RuntimeError( f"Corstone simulation failed, log: \n {result_stdout}\n{result.stderr.decode()}" ) + elif "E [" in result_stdout: + logger.error(result_stdout) tosa_ref_output = np.fromfile(out_path_with_suffix, dtype=np.float32) - tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(inputs[0].shape) - return tosa_ref_output + output_shape = self.output_node.args[0][0].meta["val"].shape + tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(output_shape) + return [tosa_ref_output] def run_tosa_ref_model( self, inputs: Tuple[torch.Tensor], - ) -> torch.Tensor: + ) -> list[torch.Tensor]: """ - Run TOSA reference model using the tosa_refence_model program. + Run TOSA reference model using the tosa_reference_model program. In order to do that we need: - 1. desc.json, which points to files needed by tosa_refence_model. + 1. desc.json, which points to files needed by tosa_reference_model. 2. output.tosa, which is the TOSA buffer that describes the model we're trying to run. @@ -287,12 +291,6 @@ def run_tosa_ref_model( All these files are saved on disk in self.intermediate_path. Args: - params_input (Tuple[List[str], List[QuantizationParams]]): A tuple - containing a list of input node names and a list of their - quantization parameters (if model is quantized). - param_output (Tuple[str, QuantizationParams]): A tuple containing - the output node name and its quantization parameters (if - model is quantized). inputs (Tuple[torch.Tensor]): The input data to run the TOSA Returns: @@ -328,7 +326,18 @@ def run_tosa_ref_model( self._has_init_run ), "RunnerUtil needs to be initialized using init_run() before running tosa reference." - desc_file_path = os.path.join(self.intermediate_path, "desc.json") + all_desc_file_paths = [ + str(path) for path in Path(self.intermediate_path).glob("desc*.json") + ] + assert ( + all_desc_file_paths + ), f"No TOSA description file found in '{self.intermediate_path}'." 
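+        # Note: dbg_tosa_dump() writes one desc<suffix>.json per lowered
+        # partition, so finding several matches here means the graph was split
+        # into more than one TOSA partition.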
+ if len(all_desc_file_paths) != 1: + raise NotImplementedError( + "Graphs with more than one partition are currently not supported." + ) + + desc_file_path = all_desc_file_paths[0] assert os.path.exists( desc_file_path ), f"desc_file_path: {desc_file_path} does not exist" @@ -369,23 +378,26 @@ def run_tosa_ref_model( # Load desc.json, just to get the name of the output file above with open(desc_file_path) as f: desc_json = json.load(f) - ofm_file_npy = os.path.join(self.intermediate_path, desc_json["ofm_file"][0]) - # Load the output file (OFM) and return it as a numpy array - tosa_ref_output = np.load(ofm_file_npy) + tosa_ref_outputs = [] + for ofm_file in desc_json["ofm_file"]: + ofm_file_npy = os.path.join(self.intermediate_path, ofm_file) + + # Load the output file (OFM) and return it as a numpy array + tosa_ref_output = np.load(ofm_file_npy) - if self.is_quantized: - # Need to dequant back to FP32 for comparison with torch output - quant_param = self.qp_output - assert ( - quant_param is not None - ), "There are no quantization parameters, check output parameters" - tosa_ref_output = (tosa_ref_output - quant_param.zp) * quant_param.scale + if self.is_quantized: + # Need to dequant back to FP32 for comparison with torch output + quant_param = self.qp_output + assert ( + quant_param is not None + ), "There are no quantization parameters, check output parameters" + tosa_ref_output = (tosa_ref_output - quant_param.zp) * quant_param.scale - # tosa_output is a numpy array, convert to torch tensor for comparison - tosa_ref_output = torch.from_numpy(tosa_ref_output.astype("float32")) + # tosa_output is a numpy array, convert to torch tensor for comparison + tosa_ref_outputs.append(torch.from_numpy(tosa_ref_output.astype("float32"))) - return tosa_ref_output + return tosa_ref_outputs def prep_data_for_save( @@ -420,7 +432,7 @@ def save_npy( Parameters: path: the directory where to save the data. data: the data to save. - is_quantize: whether to quantize the data before saving it. + is_quantized: whether to quantize the data before saving it. input_name: the name of the file, without file-ending. quant_param: the parameters to use for quantization. Returns: @@ -445,7 +457,7 @@ def save_bytes( Parameters: path: the directory where to save the data. data: the data to save. - is_quantize: whether to quantize the data before saving it. + is_quantized: whether to quantize the data before saving it. input_name: the name of the file, without file-ending. quant_param: the parameters to use for quantization. 
Returns: diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index be5ea7dd71..2fe8c07e7d 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -34,6 +34,7 @@ from executorch.backends.xnnpack.test.tester import Tester from executorch.exir import EdgeCompileConfig from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.lowered_backend_module import LoweredBackendModule from torch.fx import Graph logger = logging.getLogger(__name__) @@ -44,21 +45,42 @@ class Partition(tester.Partition): def dump_artifact(self, path_to_dump: Optional[str]): super().dump_artifact(path_to_dump) - to_print = None - for spec in self.graph_module.lowered_module_0.compile_specs: - if spec.key == "output_format": - if spec.value == b"tosa": - tosa_fb = self.graph_module.lowered_module_0.processed_bytes + def get_output_format(lowered_module) -> str | None: + for spec in lowered_module.compile_specs: + if spec.key == "output_format": + return spec.value.decode() + return None + + output = "" + for node in self.graph_module.graph.nodes: + if node.op == "get_attr" and node.name.startswith("lowered_module_"): + lowered_module = getattr(self.graph_module, node.name) + assert isinstance( + lowered_module, LoweredBackendModule + ), f"Attribute {node.name} must be of type LoweredBackendModule." + + output_format = get_output_format(lowered_module) + if output_format == "tosa": + tosa_fb = lowered_module.processed_bytes to_print = dbg_tosa_fb_to_json(tosa_fb) to_print = pformat(to_print, compact=True, indent=1) - to_print = f"\n TOSA deserialized: \n{to_print}" - elif spec.value == b"vela": - vela_cmd_stream = self.graph_module.lowered_module_0.processed_bytes - to_print = str(vela_cmd_stream) - to_print = f"\n Vela command stream: \n{to_print}" - break - assert to_print is not None, "No TOSA nor Vela compile spec found" - _dump_str(to_print, path_to_dump) + output += f"\nTOSA deserialized {node.name}: \n{to_print}\n" + elif output_format == "vela": + vela_cmd_stream = lowered_module.processed_bytes + output += ( + f"\nVela command stream {node.name}: \n{vela_cmd_stream}\n" + ) + else: + logger.warning( + f"No TOSA nor Vela compile spec found in compile specs of {node.name}." + ) + continue + + if not output: + logger.warning("No output to print generated from artifact.") + return + + _dump_str(output, path_to_dump) class Serialize(tester.Serialize): @@ -242,16 +264,21 @@ def run_method_and_compare_outputs( # Loop inputs and compare reference stage with the compared stage. for run_iteration in range(num_runs): reference_input = inputs if inputs else next(self.generate_random_inputs()) - if is_nhwc: - test_input = self.transpose_data_format(reference_input, "NHWC") - else: - test_input = reference_input # Test parameters can include constants that are used in eager mode but are already set as attributes # in TOSA. Therefore, only accept torch.Tensor inputs. 
- test_input = [ - tensor for tensor in test_input if isinstance(tensor, torch.Tensor) - ] + test_input: list[torch.Tensor] = [] + for arg in reference_input: + if isinstance(arg, torch.Tensor): + test_input.append(arg) + if isinstance(arg, tuple) and isinstance(arg[0], torch.Tensor): + test_input.extend(list(arg)) + + if ( + is_nhwc + and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] + ): + test_input = self.transpose_data_format(test_input, "NHWC") input_shapes = [ generated_input.shape if hasattr(generated_input, "shape") else (1,) @@ -260,8 +287,11 @@ def run_method_and_compare_outputs( print(f"Run {run_iteration} with input shapes: {input_shapes}") reference_output = reference_stage.run_artifact(reference_input) - test_output = (test_stage.run_artifact(test_input),) - if is_nhwc: + test_output = tuple(test_stage.run_artifact(test_input)) + if ( + is_nhwc + and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] + ): test_output = self.transpose_data_format(test_output, "NCHW") self._compare_outputs( diff --git a/backends/arm/tosa_mapping.py b/backends/arm/tosa_mapping.py index 5749d1e204..0baf3e2ec1 100644 --- a/backends/arm/tosa_mapping.py +++ b/backends/arm/tosa_mapping.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # PyTorch to Tosa mapping - simple mapping functions and multi-type extraction # of key information. These are used by the initial compile stage which captures diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py index 55649f4bef..8a90e432a6 100644 --- a/backends/arm/tosa_quant_utils.py +++ b/backends/arm/tosa_quant_utils.py @@ -3,18 +3,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # Utiliy functions for TOSA quantized lowerings import math -from typing import NamedTuple +from typing import NamedTuple, Sequence import numpy as np import serializer.tosa_serializer as ts import torch.fx +import tosa.Op as TosaOp from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg from executorch.exir.dialects._ops import ops as exir_ops -from serializer.tosa_serializer import TosaOp, TosaSerializerTensor +from serializer.tosa_serializer import TosaSerializerTensor from torch.fx import Node q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default @@ -65,6 +68,7 @@ def is_quant_node(node: torch.fx.Node): def get_quant_node_dtype(node: torch.fx.Node): + # pyre-ignore[16]: Undefined attribute. 
if "tosa" in node.target.__name__: return node.meta["val"].dtype @@ -171,7 +175,7 @@ def build_rescale( output_shape, input_zp, output_zp, - is_double_round, + is_double_round=False, ): scale_width = 32 if is_scale32(output_type) else 16 multiplier, shift = compute_multiplier_and_shift(scale, scale_width) @@ -197,7 +201,7 @@ def build_rescale( def build_rescale_to_int32( - tosa_fb, input, input_zp, rescale_scale, is_scale32=True, is_double_round=True + tosa_fb, input, input_zp, rescale_scale, is_scale32=True, is_double_round=False ) -> TosaSerializerTensor: multiplier, shift = compute_multiplier_and_shift(rescale_scale) attr_rescale = ts.TosaSerializerAttribute() @@ -230,8 +234,8 @@ def build_rescale_from_int32( output_zp, rescale_scale, is_scale32=True, - is_double_round=True, -) -> TosaSerializerTensor: + is_double_round=False, +) -> None: multiplier, shift = compute_multiplier_and_shift(rescale_scale) attr_rescale_output = ts.TosaSerializerAttribute() attr_rescale_output.RescaleAttribute( @@ -254,7 +258,7 @@ def build_rescale_from_int32( def rescale_nodes_to_int32( - nodes: list[Node], tosa_graph: ts.TosaSerializer + nodes: Sequence[Node], tosa_graph: ts.TosaSerializer ) -> tuple[list[TosaSerializerTensor], float]: """Rescales all 'nodes' to int32, adding suitable RESCALE ops to 'tosa_graph'. The scales are adjusted using the smallest scale of all 'nodes'. @@ -329,9 +333,6 @@ def build_rescale_conv_output( output_scale, output_zp, ): - # Only use double round if we are doing 32 bit scaling - double_round = is_scale32(output_type) - # TODO add check to verify if this is a Per-channel quantization. post_conv2d_scale = (input_scale.number * weight_scale.number) / output_scale.number @@ -345,6 +346,5 @@ def build_rescale_conv_output( op.shape, 0, output_zp.number, - double_round, ) return diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index 4dc0204516..cfafac1676 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -3,9 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import logging import os -from typing import Dict +from typing import Any, cast, Dict import numpy as np import serializer.tosa_serializer as ts @@ -48,10 +50,10 @@ def dbg_node(node): # Output TOSA flatbuffer and test harness file -def dbg_tosa_dump(tosa_graph, path): - filename = "output.tosa" +def dbg_tosa_dump(tosa_graph: ts.TosaSerializer, path: str, suffix: str = ""): + filename = f"output{suffix}.tosa" - logger.info(f"Emitting debug output to {path}") + logger.info(f"Emitting debug output to: {path=}, {suffix=}") os.makedirs(path, exist_ok=True) @@ -63,7 +65,7 @@ def dbg_tosa_dump(tosa_graph, path): f.write(fb) assert os.path.exists(filepath_tosa_fb), "Failed to write TOSA flatbuffer" - filepath_desc_json = os.path.join(path, "desc.json") + filepath_desc_json = os.path.join(path, f"desc{suffix}.json") with open(filepath_desc_json, "w") as f: f.write(js) assert os.path.exists(filepath_desc_json), "Failed to write TOSA JSON" @@ -74,7 +76,7 @@ def dbg_fail(node, tosa_graph, path): logger.warn("Internal error due to poorly handled node:") dbg_node(node) logger.warn(f"Debug output captured in '{path}'.") - raise RuntimeError("TOSA Internal Error on node, enable logging for further info") + raise RuntimeError("TOSA Internal Error on node, enable logging for further info.") # Helper function to match TOSA's broadcasting rank requirement @@ -235,7 +237,7 @@ def build_avg_pool_2d_common( output_zp = 0 if is_quant_node: - input_zp = get_quant_node_args(node.args[0]).zp + input_zp = get_quant_node_args(cast(torch.fx.Node, node.args[0])).zp output_zp = get_quant_node_args(list(node.users)[0]).zp attr = ts.TosaSerializerAttribute() @@ -306,7 +308,9 @@ def process_call_function( ) # Visiting each Node + # pyre-ignore[16]: Undefined attribute. if node.target.__name__ in node_visitors: + # pyre-ignore[16]: Undefined attribute. node_visitors[node.target.__name__].define_node( node, tosa_graph, @@ -316,3 +320,32 @@ def process_call_function( ) else: raise RuntimeError(f"Unknown operator {node.target}") + + +def expand_dims( + tosa_graph: ts.TosaSerializer, + input_node: TosaArg, + dtype: int, + dim: int, +) -> Any: + """Inserts TOSA operators into the tosa_graph, that perform the equivalent + of the expand_dims (a.k.a unsqueeze) operation. A new axis is created at the + dim location. + + Args: + tosa_graph (ts.TosaSerializer): The TOSA graph to manipulate. + input_node (TosaArg): The parent node of the expand dim operations. + dtype (ts.DType): The data type expand dims operations. + dim (int): The dimension to expand. + + Returns: + Any: The output tensor of the inserted operation in the TOSA graph. + """ + new_shape = list(input_node.shape) + new_shape.insert(dim, 1) + + intermediate = tosa_graph.addIntermediate(new_shape, dtype) + + build_reshape(tosa_graph, input_node.name, new_shape, intermediate.name) + + return intermediate diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index 5ad0192d92..d786142f08 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -20,15 +20,64 @@ if(NOT EXECUTORCH_ROOT) endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/build/Codegen.cmake) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
set(TARGET_DIR reference) if(EXECUTORCH_NNLIB_OPT) -set(TARGET_DIR hifi) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) + set(TARGET_DIR hifi) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) endif() +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +if(NOT PYTHON_EXECUTABLE) + resolve_python_executable() +endif() + +set(_common_compile_options -Wno-deprecated-declarations -fPIC) + +# Find prebuilt libraries. executorch package should contain portable_ops_lib, +# etdump, bundled_program. +find_package(executorch CONFIG REQUIRED) +target_link_options_shared_lib(executorch) +target_link_options_shared_lib(portable_ops_lib) + +target_include_directories(executorch INTERFACE ${_common_include_directories}) + +find_package( + gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../third-party +) + +add_executable(cadence_runner cadence_runner/cadence_runner.cpp) +target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) + +target_include_directories( + etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../sdk/include + ${EXECUTORCH_ROOT}/third-party/flatcc/include +) + +target_include_directories( + cadence_runner PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) + +target_link_libraries( + cadence_runner + executorch + gflags + etdump + extension_data_loader + bundled_program + cadence_ops_lib + flatccrt +) diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 79646c1293..08093efe31 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -4,7 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+load("@fbcode_macros//build_defs:export_files.bzl", "export_file") load("@fbcode_macros//build_defs:python_library.bzl", "python_library") +load( + "@fbsource//tools/build_defs:default_platform_defs.bzl", + "CXX", +) +load("@fbsource//xplat/executorch/codegen:codegen.bzl", "executorch_generated_lib") oncall("odai_jarvis") @@ -53,3 +59,30 @@ python_library( "//executorch/exir/passes:spec_prop_pass", ], ) + +python_library( + name = "ops_registrations", + srcs = [ + "ops_registrations.py", + ], + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/backends/cadence/aot:utils", + ], +) + +export_file(name = "functions.yaml") + +executorch_generated_lib( + name = "cadence_aot_lib", + custom_ops_yaml_target = "//executorch/kernels/portable:custom_ops.yaml", + functions_yaml_target = ":functions.yaml", + platforms = CXX, + visibility = ["PUBLIC"], + deps = [ + "//executorch/backends/cadence/reference/kernels:cadence_kernels", + "//executorch/backends/cadence/reference/operators:cadence_cpu_ops", + "//executorch/kernels/portable:executorch_all_ops", + "//executorch/kernels/portable:operators", + ], +) diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 39511ae917..e1494f8d20 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -7,6 +7,7 @@ # pyre-strict import logging +from typing import Optional import torch @@ -17,12 +18,13 @@ ReplaceLogicalNotBooleanWhereWithWherePass, ReplacePT2DequantWithCadenceDequantPass, ReplacePT2QuantWithCadenceQuantPass, + ReplaceSafeSoftmaxWithSoftmax, ReplaceScalarTensorWithFullPass, ReplaceSqueezeAndUnsqueezeWithViewPass, ) from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer -from executorch.backends.cadence.aot.utils import model_is_quantized +from executorch.backends.cadence.aot.utils import model_gm_has_SDPA, model_is_quantized from executorch.backends.transforms.decompose_sdpa import ( DecomposeScaledDotProductAttention, ) @@ -36,25 +38,40 @@ from torch.export.exported_program import ExportedProgram -def quantize_pt2( +# Note: this is not meant as a primary API since it can create inconsistencies +# if the quantizer here is different from the quantizer used to convert. It is +# however useful for unit tests to separate the converted model from the fused +# model, to be able to get reference numerics. +# If this does not apply, please use quantize_and_fuse_pt2 instead. +def convert_pt2( model: torch.nn.Module, inputs: tuple[object, ...], + quantizer: CadenceQuantizer, ) -> torch.fx.GraphModule: """ - Instantiate the CadenceQuantizer (PTQ), prepare, convert and fuse the model. - Returns a GraphModule with the quantized model. + Prepare and convert a model using the given quantizer. + The quantizer must be supplied and be the same as the one used to + fuse the model later, if applicable. If you do not expect that behavior, + please use quantize_and_fuse_pt2 instead, which will instantiate a + default quantizer for you if needed. + Returns a GraphModule with the converted model. 
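+
+    Illustrative usage (a sketch only; `model` and `example_inputs` are
+    placeholders, not part of this change):
+
+        quantizer = CadenceQuantizer()
+        converted_gm = convert_pt2(model, example_inputs, quantizer)
+        ref_outputs = converted_gm(*example_inputs)  # reference numerics
+        fused_gm = fuse_pt2(converted_gm, quantizer)  # reuse the same quantizer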
""" - # Quantizer - quantizer = CadenceQuantizer() # Export with dynamo - model_exp = capture_pre_autograd_graph(model, inputs) + model_gm = capture_pre_autograd_graph(model, inputs) - # Decompose SDPA - DecomposeScaledDotProductAttention(False)(model_exp) + if model_gm_has_SDPA(model_gm): # pyre-fixme[6] + # Decompose SDPA + DecomposeScaledDotProductAttention(False)(model_gm) # pyre-fixme[6] + + # Swap _safe_softmax with _softmax (see https://github.com/pytorch/pytorch/pull/133882 + # for details). + result = ReplaceSafeSoftmaxWithSoftmax()(model_gm) # pyre-fixme[6] + assert result is not None + model_gm = result.graph_module # Prepare - prepared_model = prepare_pt2e(model_exp, quantizer) + prepared_model = prepare_pt2e(model_gm, quantizer) # Calibrate prepared_model(*inputs) @@ -62,12 +79,54 @@ def quantize_pt2( # Convert converted_model = convert_pt2e(prepared_model) + return converted_model + + +# Note: this is not meant as a primary API since it can create inconsistencies +# if the quantizer here is different from the quantizer used to convert. It is +# however useful for unit tests to separate the converted model from the fused +# model, to be able to get reference numerics. +# If this does not apply, please use quantize_and_fuse_pt2 instead. +def fuse_pt2( + converted_graph_module: torch.fx.GraphModule, + quantizer: CadenceQuantizer, +) -> torch.fx.GraphModule: + """ + Fuse a converted graph module using the given quantizer. + The quantizer must be the same as the one used to convert the model. + If you do not expect that behavior, please use quantize_and_fuse_pt2 instead, + which will instantiate a default quantizer for you if needed. + Returns a GraphModule with the fused model. + """ # Get patterns and apply fusion of dq -> op -> q to qop # pyre-ignore[16]: no attribute patterns = [q.pattern for q in quantizer.quantizers] - QuantFusion(patterns)(converted_model) + QuantFusion(patterns)(converted_graph_module) - return converted_model + return converted_graph_module + + +# Note: this is the one-liner API to quantize and fuse a model. +def quantize_pt2( + model: torch.nn.Module, + inputs: tuple[object, ...], + quantizer: Optional[CadenceQuantizer] = None, +) -> torch.fx.GraphModule: + """ + Prepare, convert and fuse the model using the given quantizer. + Returns a GraphModule with the quantized model. 
+ """ + # Quantizer + if not quantizer: + quantizer = CadenceQuantizer() + + # Get converted graph module + converted_gm = convert_pt2(model, inputs, quantizer) + + # Get fused model + fused_gm = fuse_pt2(converted_gm, quantizer) + + return fused_gm # Export the model and lower it to an ExportedProgram (in aten IR) diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py index c0e1727ec9..f7920f0b8f 100644 --- a/backends/cadence/aot/export_example.py +++ b/backends/cadence/aot/export_example.py @@ -7,17 +7,21 @@ # Example script for exporting simple models to flatbuffer import logging +import tempfile from executorch.backends.cadence.aot.ops_registrations import * # noqa - import os from typing import Any, Tuple from executorch.backends.cadence.aot.compiler import ( + convert_pt2, export_to_cadence, export_to_edge, quantize_pt2, ) +from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer +from executorch.backends.cadence.runtime import runtime +from executorch.backends.cadence.runtime.executor import BundledProgramManager from executorch.exir import ExecutorchProgramManager from torch import nn @@ -44,23 +48,50 @@ def _save_pte_program( logging.error(f"Error while saving to {filename}: {e}") +def _save_bpte_program( + buffer: bytes, + model_name: str, + output_dir: str = "", +) -> None: + if model_name.endswith(".bpte"): + filename = model_name + else: + filename = os.path.join(output_dir, f"{model_name}.bpte") + try: + with open(filename, "wb") as f: + f.write(buffer) + logging.info(f"Saved exported program to {filename}") + except Exception as e: + logging.error(f"Error while saving to {output_dir}: {e}") + + def export_model( model: nn.Module, example_inputs: Tuple[Any, ...], file_name: str = "CadenceDemoModel", ): + # create work directory for outputs and model binary + working_dir = tempfile.mkdtemp(dir="/tmp") + logging.debug(f"Created work directory {working_dir}") + + # convert the model (also called in quantize_pt2) + converted_model = convert_pt2(model, example_inputs, CadenceQuantizer()) + + # Get reference outputs from quantized_model + ref_outputs = converted_model(*example_inputs) + # Quantize the model quantized_model = quantize_pt2(model, example_inputs) - # Get edge program + # Get edge program (also called in export_to_cadence) edge_prog_manager = export_to_edge(quantized_model, example_inputs) # Get edge program after Cadence specific passes cadence_prog_manager = export_to_cadence(quantized_model, example_inputs) - exec_prog = cadence_prog_manager.to_executorch() + exec_prog: ExecutorchProgramManager = cadence_prog_manager.to_executorch() - logging.info("Final exported graph:") + logging.info("Final exported graph:\n") exec_prog.exported_program().graph_module.graph.print_tabular() # Print some information to terminal @@ -69,5 +100,28 @@ def export_model( cadence_prog_manager.exported_program().graph_module, ) - # Save the program as (default name is CadenceDemoModel.pte) - _save_pte_program(exec_prog, file_name) + forward_test_data = BundledProgramManager.bundled_program_test_data_gen( + method="forward", inputs=example_inputs, expected_outputs=ref_outputs + ) + bundled_program_manager = BundledProgramManager([forward_test_data]) + buffer = bundled_program_manager._serialize( + exec_prog, + bundled_program_manager.get_method_test_suites(), + forward_test_data, + ) + # Save the program as pte (default name is CadenceDemoModel.pte) + _save_pte_program(exec_prog, file_name, working_dir) + # Save the program as 
btpe (default name is CadenceDemoModel.bpte) + _save_bpte_program(buffer, file_name, working_dir) + + logging.debug( + f"Executorch bundled program buffer saved to {file_name} is {len(buffer)} total bytes" + ) + + # TODO: move to test infra + runtime.run_and_compare( + executorch_prog=exec_prog, + inputs=example_inputs, + ref_outputs=ref_outputs, + working_dir=working_dir, + ) diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index dbfe1e3639..71246df868 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -57,11 +57,26 @@ - arg_meta: null kernel_name: torch::executor::embedding_out +- op: empty.out + kernels: + - arg_meta: null + kernel_name: torch::executor::empty_out + +- op: expand_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::expand_copy_out + - op: full.out kernels: - arg_meta: null kernel_name: torch::executor::full_out +- op: gelu.out + kernels: + - arg_meta: null + kernel_name: torch::executor::gelu_out + - op: mean.out kernels: - arg_meta: null @@ -145,12 +160,12 @@ - arg_meta: null kernel_name: impl::reference::quantized_linear_out -- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null kernel_name: impl::reference::quantized_relu_out -func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null kernel_name: impl::reference::quantized_matmul_out diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index adcf086873..e73de6ab7c 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -4,12 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-strict + from math import prod from typing import Optional, Tuple import torch -from executorch.exir.scalar_type import ScalarType -from torch.library import impl, Library +from torch.library import Library, register_fake from .utils import get_conv1d_output_size, get_conv2d_output_size @@ -43,9 +44,11 @@ "quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)" ) -lib.define("quantized_relu(Tensor X, Tensor X_zero_point) -> (Tensor Y)") lib.define( - "quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor (a!)" + "quantized_relu(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Y)" +) +lib.define( + "quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) 
out) -> Tensor (a!)" ) lib.define( @@ -65,31 +68,31 @@ m = Library("cadence", "IMPL", "Meta") -@impl(m, "quantize_per_tensor") +@register_fake("cadence::quantize_per_tensor") def quantize_per_tensor_meta( input: torch.Tensor, scale: float, zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=dtype) -@impl(m, "dequantize_per_tensor") +@register_fake("cadence::dequantize_per_tensor") def dequantize_per_tensor_meta( input: torch.Tensor, scale: float, zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.float) -@impl(m, "quantized_linear") +@register_fake("cadence::quantized_linear") def quantized_linear_meta( src: torch.Tensor, weight: torch.Tensor, @@ -100,7 +103,7 @@ def quantized_linear_meta( out_shift: torch.Tensor, out_zero_point: int, offset: Optional[torch.Tensor], -): +) -> torch.Tensor: # src comes in shape [leading_dims, in_dim] # weight comes in shape [out_dim, in_dim] # output comes in empty with shape [leading_dims, out_dim] @@ -111,7 +114,7 @@ def quantized_linear_meta( return src.new_empty(out_size, dtype=torch.uint8) -@impl(m, "quantized_conv") +@register_fake("cadence::quantized_conv") def quantized_conv_meta( input: torch.Tensor, weight: torch.Tensor, @@ -149,7 +152,7 @@ def quantized_conv_meta( return input.new_empty(output_size, dtype=input.dtype) -@impl(m, "quantized_layer_norm") +@register_fake("cadence::quantized_layer_norm") def quantized_layer_norm_meta( input: torch.Tensor, X_scale: torch.Tensor, @@ -160,19 +163,22 @@ def quantized_layer_norm_meta( eps: float, output_scale: float, output_zero_point: int, -): +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.uint8) -@impl(m, "quantized_relu") +@register_fake("cadence::quantized_relu") def quantized_relu_meta( X: torch.Tensor, X_zero_point: torch.Tensor, -): + out_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: return X.new_empty(X.size(), dtype=torch.uint8) -@impl(m, "quantized_matmul") +@register_fake("cadence::quantized_matmul") def quantized_matmul_meta( X: torch.Tensor, X_zero_point: int, diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py index db419bfb5e..83ef43d151 100644 --- a/backends/cadence/aot/passes.py +++ b/backends/cadence/aot/passes.py @@ -266,3 +266,29 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: result = SpecPropPass()(graph_module) assert result is not None return result + + +class ReplaceSafeSoftmaxWithSoftmax(ExportPass): + """ + Replace _safe_softmax with _softmax + """ + + def call_operator( + self, + op, # pyre-ignore + args: tuple[Argument, ...], + kwargs: dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op != torch.ops.aten._safe_softmax.default: + return super().call_operator(op, args, kwargs, meta) + + # Add False for the half_to_float argument of softmax + softmax_args = list(args) + [False] + + return super().call_operator( + torch.ops.aten._softmax.default, + tuple(softmax_args), + kwargs, + meta, + ) diff --git a/backends/cadence/aot/quantizer/fusion_pass.py b/backends/cadence/aot/quantizer/fusion_pass.py index 4c43172a92..7c05e9b867 100644 --- a/backends/cadence/aot/quantizer/fusion_pass.py +++ b/backends/cadence/aot/quantizer/fusion_pass.py @@ -287,7 +287,15 @@ def get_args_and_kwargs_relu( graph_module: GraphModule, inputs_inputs: 
List[fx.Node], dequants_inputs: List[fx.Node], + quant_node: fx.Node, ) -> Tuple[Tuple[ArgsType], Dict[str, ArgsType]]: + input_scale = dequants_inputs[0].args[1] + # pyre-fixme[58]: Unsupported operand types + requantize_scale = input_scale / quant_node.args[1] + requantize_scale_t = torch.tensor([requantize_scale]) + + (out_multiplier, out_shift) = quantize_tensor_multiplier(requantize_scale_t) + # Make the args and kwargs for the replacement op args = tuple(inputs_inputs) @@ -296,9 +304,22 @@ def get_args_and_kwargs_relu( ([1], dequants_inputs[0].args[2]), {"dtype": torch.int32}, ) + out_multiplier_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], out_multiplier[0].item()), + {"dtype": torch.int32}, + ) + out_shift_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], out_shift[0].item()), + {"dtype": torch.int32}, + ) kwargs = { "X_zero_point": X_zero_point, + "out_zero_point": quant_node.args[2], + "out_multiplier": out_multiplier_, + "out_shift": out_shift_, } return args, kwargs @@ -420,6 +441,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 graph_module, inputs_inputs, dequants_inputs, + quant_node, ) fused = graph_module.graph.call_function( pattern.replacement_op(), diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 7043bae571..c5eb3b964d 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -303,9 +303,7 @@ def get_anchors( inputs=[(relu_node, 0)], weights=[], biases=[], - output=[ - (relu_node, SharedQuantizationSpec((relu_node.args[0], relu_node))) - ], + output=[(relu_node,)], ) def replacement_op(self) -> OpOverload: diff --git a/backends/cadence/aot/quantizer/utils.py b/backends/cadence/aot/quantizer/utils.py index 2afe5aba32..0f9c939978 100644 --- a/backends/cadence/aot/quantizer/utils.py +++ b/backends/cadence/aot/quantizer/utils.py @@ -145,7 +145,7 @@ def get_aten_node_target_partitions( """ Args: graph: The graph we want to partition - wanted_sources: List of orginal_aten ops (OpOverload) + wanted_original_aten_op: List of original_aten ops (OpOverload) Returns: Dictionary mapping aten ops that were given to a list of SourcePartitions diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py index f0c294260a..f081036ccc 100644 --- a/backends/cadence/aot/utils.py +++ b/backends/cadence/aot/utils.py @@ -104,11 +104,11 @@ def get_ops_count(graph_module: torch.fx.GraphModule) -> Dict[str, int]: ): continue # If the op is already present, increment the count - if get_edge_overload_packet(node.target).__name__ in freq: - freq[get_edge_overload_packet(node.target).__name__] += 1 + if node.target._name in freq: + freq[node.target._name] += 1 # else, add a new entry else: - freq[get_edge_overload_packet(node.target).__name__] = 1 + freq[node.target._name] = 1 return freq @@ -177,3 +177,11 @@ def print_ops_info( tablefmt="outline", ) ) + + +def model_gm_has_SDPA(model_gm: torch.fx.GraphModule) -> bool: + for node in model_gm.graph.nodes: + if node.op == "call_function": + if node.target == torch.ops.aten.scaled_dot_product_attention.default: + return True + return False diff --git a/backends/cadence/build_cadence_runner.sh b/backends/cadence/build_cadence_runner.sh new file mode 100755 index 0000000000..693a320bdf --- /dev/null +++ b/backends/cadence/build_cadence_runner.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Builds cadence_runner and prints its path. + +set -euo pipefail + +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +readonly SCRIPT_DIR + +readonly EXECUTORCH_ROOT="${SCRIPT_DIR}/../.." + +# Allow overriding the number of build jobs. Default to 9. +export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-9}" + +main() { + cd "${EXECUTORCH_ROOT}" + + rm -rf cmake-out + cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -Bcmake-out . + cmake --build cmake-out --target install --config Release + + local example_dir=backends/cadence + local build_dir="cmake-out/${example_dir}" + local cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + rm -rf ${build_dir} + cmake -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \ + -DCMAKE_BUILD_TYPE=Release \ + -B"${build_dir}" \ + "${example_dir}" + cmake --build "${build_dir}" --config Release + + local runner="${PWD}/${build_dir}/cadence_runner" + if [[ ! -f "${runner}" ]]; then + echo "ERROR: Failed to build ${build_dir}/cadence_runner" >&2 + exit 1 + else + echo "Built ${build_dir}/cadence_runner" + fi +} + +main "$@" diff --git a/backends/cadence/build_cadence_xtensa.sh b/backends/cadence/build_cadence_xtensa.sh new file mode 100644 index 0000000000..f96436e65d --- /dev/null +++ b/backends/cadence/build_cadence_xtensa.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -euo pipefail + +unset CMAKE_PREFIX_PATH +git submodule sync +git submodule update --init +./install_requirements.sh + +rm -rf cmake-out + +STEPWISE_BUILD=false + +if $STEPWISE_BUILD; then + echo "Building ExecuTorch" + cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=OFF \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -Bcmake-out . 
+ + echo "Building any Cadence-specific binaries on top" + cmake -DBUCK2="$BUCK" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=ON \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_NNLIB_OPT=ON \ + -DEXECUTORCH_BUILD_GFLAGS=ON \ + -DHAVE_FNMATCH_H=OFF \ + -Bcmake-out/backends/cadence \ + backends/cadence + cmake --build cmake-out/backends/cadence -j16 +else + echo "Building Cadence toolchain with ExecuTorch packages" + cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + cmake -DBUCK2="$BUCK" \ + -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=OFF \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_NNLIB_OPT=ON \ + -DEXECUTORCH_BUILD_GFLAGS=ON \ + -DHAVE_FNMATCH_H=OFF \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -Bcmake-out + cmake --build cmake-out --target install --config Release -j16 +fi + +echo "Run simple model to verify cmake build" +python3 -m examples.portable.scripts.export --model_name="add" +xt-run --turbo cmake-out/executor_runner --model_path=add.pte diff --git a/backends/cadence/cadence_runner/TARGETS b/backends/cadence/cadence_runner/TARGETS new file mode 100644 index 0000000000..21f36a9bae --- /dev/null +++ b/backends/cadence/cadence_runner/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/cadence_runner/cadence_runner.cpp b/backends/cadence/cadence_runner/cadence_runner.cpp new file mode 100644 index 0000000000..a269ed5a8e --- /dev/null +++ b/backends/cadence/cadence_runner/cadence_runner.cpp @@ -0,0 +1,298 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * @file + * + * This tool can run ExecuTorch model files that only use operators that + * are covered by the portable kernels, with possible delegate to the + * test_backend_compiler_lib. + * + * It sets all input tensor data to ones, and assumes that the outputs are + * all fp32 tensors. 
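+ *
+ * Example invocation (illustrative only; paths and flag values are
+ * placeholders, not part of this change):
+ *
+ *   cadence_runner \
+ *     --bundled_program_path=CadenceDemoModel.bpte \
+ *     --etdump_path=etdump.etdp \
+ *     --output_verification=true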
+ */ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB + +DEFINE_string( + bundled_program_path, + "CadenceDemoModel.bpte", + "Model serialized in flatbuffer format."); + +DEFINE_int32( + testset_idx, + 0, + "Index of bundled verification set to be run " + "by bundled model for verification"); + +DEFINE_string( + etdump_path, + "etdump.etdp", + "If etdump generation is enabled an etdump will be written out to this path"); + +DEFINE_bool( + output_verification, + false, + "Comapre the model output to the reference outputs present in the BundledProgram."); + +DEFINE_bool( + print_output, + false, + "Print the output of the ET model to stdout, if needs."); + +DEFINE_bool(dump_outputs, true, "Dump outputs to etdump file"); + +DEFINE_bool( + dump_intermediate_outputs, + false, + "Dump intermediate outputs to etdump file."); + +DEFINE_string( + debug_output_path, + "debug_output.bin", + "Path to dump debug outputs to."); + +DEFINE_int32( + debug_buffer_size, + 262144, // 256 KB + "Size of the debug buffer in bytes to allocate for intermediate outputs and program outputs logging."); + +using namespace torch::executor; + +std::vector load_file_or_die(const char* path) { + std::ifstream file(path, std::ios::binary | std::ios::ate); + const size_t nbytes = file.tellg(); + file.seekg(0, std::ios::beg); + auto file_data = std::vector(nbytes); + ET_CHECK_MSG( + file.read(reinterpret_cast(file_data.data()), nbytes), + "Could not load contents of file '%s'", + path); + return file_data; +} + +int main(int argc, char** argv) { + runtime_init(); + + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (argc != 1) { + std::string msg = "Extra commandline args:"; + for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) { + msg += std::string(" ") + argv[i]; + } + ET_LOG(Error, "%s", msg.c_str()); + return 1; + } + + // Read in the entire file. + const char* bundled_program_path = FLAGS_bundled_program_path.c_str(); + std::vector file_data = load_file_or_die(bundled_program_path); + + // Find the offset to the embedded Program. + const void* program_data; + size_t program_data_len; + Error status = torch::executor::bundled_program::GetProgramData( + reinterpret_cast(file_data.data()), + file_data.size(), + &program_data, + &program_data_len); + ET_CHECK_MSG( + status == Error::Ok, + "GetProgramData() failed on file '%s': 0x%x", + bundled_program_path, + (unsigned int)status); + + auto buffer_data_loader = + util::BufferDataLoader(program_data, program_data_len); + + // Parse the program file. This is immutable, and can also be reused + // between multiple execution invocations across multiple threads. + Result program = Program::load(&buffer_data_loader); + if (!program.ok()) { + ET_LOG(Error, "Failed to parse model file %s", bundled_program_path); + return 1; + } + ET_LOG(Info, "Model file %s is loaded.", bundled_program_path); + + // Use the first method in the program. + const char* method_name = nullptr; + { + const auto method_name_result = program->get_method_name(0); + ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); + method_name = *method_name_result; + } + ET_LOG(Info, "Running method %s", method_name); + + // MethodMeta describes the memory requirements of the method. 
+  Result<MethodMeta> method_meta = program->method_meta(method_name);
+  ET_CHECK_MSG(
+      method_meta.ok(),
+      "Failed to get method_meta for %s: 0x%x",
+      method_name,
+      (unsigned int)method_meta.error());
+
+  //
+  // The runtime does not use malloc/new; it allocates all memory using the
+  // MemoryManager provided by the client. Clients are responsible for allocating
+  // the memory ahead of time, or providing MemoryAllocator subclasses that can
+  // do it dynamically.
+  //
+
+  // The method allocator is used to allocate all dynamic C++ metadata/objects
+  // used to represent the loaded method. This allocator is only used during
+  // loading a method of the program, which will return an error if there was
+  // not enough memory.
+  //
+  // The amount of memory required depends on the loaded method and the runtime
+  // code itself. The amount of memory here is usually determined by running the
+  // method and seeing how much memory is actually used, though it's possible to
+  // subclass MemoryAllocator so that it calls malloc() under the hood (see
+  // MallocMemoryAllocator).
+  //
+  // In this example we use a statically allocated memory pool.
+  MemoryAllocator method_allocator{
+      MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};
+
+  // The memory-planned buffers will back the mutable tensors used by the
+  // method. The sizes of these buffers were determined ahead of time during the
+  // memory-planning passes.
+  //
+  // Each buffer typically corresponds to a different hardware memory bank. Most
+  // mobile environments will only have a single buffer. Some embedded
+  // environments may have more than one for, e.g., slow/large DRAM and
+  // fast/small SRAM, or for memory associated with particular cores.
+  std::vector<std::unique_ptr<uint8_t[]>> planned_buffers; // Owns the memory
+  std::vector<Span<uint8_t>> planned_spans; // Passed to the allocator
+  size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
+  for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
+    // .get() will always succeed because id < num_memory_planned_buffers.
+    size_t buffer_size =
+        static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());
+    ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size);
+    planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
+    planned_spans.push_back({planned_buffers.back().get(), buffer_size});
+  }
+  HierarchicalAllocator planned_memory(
+      {planned_spans.data(), planned_spans.size()});
+
+  // Assemble all of the allocators into the MemoryManager that the Executor
+  // will use.
+  MemoryManager memory_manager(&method_allocator, &planned_memory);
+
+  //
+  // Load the method from the program, using the provided allocators. Running
+  // the method can mutate the memory-planned buffers, so the method should only
+  // be used by a single thread at a time, but it can be reused.
+ // + torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + Result method = + program->load_method(method_name, &memory_manager, &etdump_gen); + ET_CHECK_MSG( + method.ok(), + "Loading of method %s failed with status 0x%" PRIx32, + method_name, + method.error()); + ET_LOG(Info, "Method loaded."); + + void* debug_buffer = malloc(FLAGS_debug_buffer_size); + if (FLAGS_dump_intermediate_outputs) { + Span buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size); + etdump_gen.set_debug_buffer(buffer); + etdump_gen.set_event_tracer_debug_level( + EventTracerDebugLogLevel::kIntermediateOutputs); + } else if (FLAGS_dump_outputs) { + Span buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size); + etdump_gen.set_debug_buffer(buffer); + etdump_gen.set_event_tracer_debug_level( + EventTracerDebugLogLevel::kProgramOutputs); + } + // Use the inputs embedded in the bundled program. + status = torch::executor::bundled_program::LoadBundledInput( + *method, file_data.data(), FLAGS_testset_idx); + ET_CHECK_MSG( + status == Error::Ok, + "LoadBundledInput failed with status 0x%" PRIx32, + status); + + ET_LOG(Info, "Inputs prepared."); + + // Run the model. + status = method->execute(); + ET_CHECK_MSG( + status == Error::Ok, + "Execution of method %s failed with status 0x%" PRIx32, + method_name, + status); + ET_LOG(Info, "Model executed successfully."); + + // Print the outputs. + if (FLAGS_print_output) { + std::vector outputs(method->outputs_size()); + status = method->get_outputs(outputs.data(), outputs.size()); + ET_CHECK(status == Error::Ok); + for (EValue& output : outputs) { + // TODO(T159700776): This assumes that all outputs are fp32 tensors. Add + // support for other EValues and Tensor dtypes, and print tensors in a + // more readable way. + auto output_tensor = output.toTensor(); + auto data_output = output_tensor.const_data_ptr(); + for (size_t j = 0; j < output_tensor.numel(); ++j) { + ET_LOG(Info, "%f", data_output[j]); + } + } + } + + // Dump the etdump data containing profiling/debugging data to the specified + // file. + etdump_result result = etdump_gen.get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); + fwrite((uint8_t*)result.buf, 1, result.size, f); + fclose(f); + free(result.buf); + } + + if (FLAGS_output_verification) { + // Verify the outputs. 
+ status = + torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput( + *method, + file_data.data(), + FLAGS_testset_idx, + 1e-3, // rtol + 1e-5 // atol + ); + ET_CHECK_MSG( + status == Error::Ok, + "Bundle verification failed with status 0x%" PRIx32, + status); + ET_LOG(Info, "Model verified successfully."); + } + + if (FLAGS_dump_outputs || FLAGS_dump_intermediate_outputs) { + FILE* f = fopen(FLAGS_debug_output_path.c_str(), "w+"); + fwrite((uint8_t*)debug_buffer, 1, FLAGS_debug_buffer_size, f); + fclose(f); + } + free(debug_buffer); + + return 0; +} diff --git a/backends/cadence/cadence_runner/targets.bzl b/backends/cadence/cadence_runner/targets.bzl new file mode 100644 index 0000000000..b59a98cd75 --- /dev/null +++ b/backends/cadence/cadence_runner/targets.bzl @@ -0,0 +1,29 @@ +load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native") +load("@fbsource//tools/build_defs:fb_xplat_cxx_binary.bzl", "fb_xplat_cxx_binary") +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") + +def define_common_targets(): + fb_native.export_file( + name = "cadence_runner.cpp", + src = "cadence_runner.cpp", + visibility = [ + "PUBLIC", + ], + ) + + fb_xplat_cxx_binary( + name = "cadence_runner", + srcs = ["cadence_runner.cpp"], + headers = [], + platforms = CXX, + visibility = ["PUBLIC"], + deps = [ + "fbsource//arvr/third-party/gflags:gflags", + "fbsource//xplat/executorch/devtools/etdump:etdump_flatcc", + "fbsource//xplat/executorch/devtools/bundled_program:runtime", + "fbsource//xplat/executorch/extension/data_loader:file_data_loader", + "fbsource//xplat/executorch/extension/data_loader:buffer_data_loader", + "fbsource//xplat/executorch/kernels/portable:generated_lib", + "fbsource//xplat/executorch/runtime/executor:program", + ], + ) diff --git a/backends/cadence/executor_runner.cpp b/backends/cadence/executor_runner.cpp index 0769aeccb7..dd24105179 100644 --- a/backends/cadence/executor_runner.cpp +++ b/backends/cadence/executor_runner.cpp @@ -83,10 +83,10 @@ void et_pal_emit_log_message( et_timestamp_t timestamp, et_pal_log_level_t level, const char* filename, - __ET_UNUSED const char* function, + ET_UNUSED const char* function, size_t line, const char* message, - __ET_UNUSED size_t length) { + ET_UNUSED size_t length) { PRINTF("\r%s\n", message); } diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index e46aa745df..0ff3d1fde6 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -17,11 +17,12 @@ add_library( target_include_directories( cadence_kernels - PUBLIC . - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/common/include/ - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/ + PUBLIC + . 
+ ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/common/include/ + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/ ) target_link_libraries(cadence_kernels PRIVATE xa_nnlib) diff --git a/backends/cadence/hifi/kernels/TARGETS b/backends/cadence/hifi/kernels/TARGETS new file mode 100644 index 0000000000..67f2bab681 --- /dev/null +++ b/backends/cadence/hifi/kernels/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index 47a5c1cfc0..4d9183e4cc 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. */ -#include "kernels.h" -#include "xa_nnlib_common.h" -#include "xa_nnlib_common_macros.h" +#include +#include +#include namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 209bc192c8..8faf06713b 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -8,9 +8,9 @@ #pragma once -#include "inttypes.h" -#include "stddef.h" -#include "xa_type_def.h" +#include +#include +#include /* For NNLIB APIs */ #include "xa_nnlib_kernels_api.h" diff --git a/backends/cadence/hifi/kernels/targets.bzl b/backends/cadence/hifi/kernels/targets.bzl new file mode 100644 index 0000000000..acdc39dd16 --- /dev/null +++ b/backends/cadence/hifi/kernels/targets.bzl @@ -0,0 +1,18 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "kernels", + srcs = ["kernels.cpp"], + exported_headers = [ + "kernels.h", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + exported_deps = [ + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common", + ], + platforms = CXX, + ) diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 74b64b18ff..d56d19fc37 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -53,18 +53,20 @@ target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) -target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) # Custom ops that are needed to run the test model. add_library( - custom_ops "quantized_linear_out.cpp" - "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp") -target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) + custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp" + "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" +) +target_include_directories( + custom_ops PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} + ${_common_include_directories} +) target_link_libraries(custom_ops PUBLIC executorch) target_link_libraries(custom_ops PRIVATE cadence_kernels) @@ -76,12 +78,11 @@ gen_selected_ops( "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions_hifi.yaml" "" "" ) generate_bindings_for_kernels( - LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML - FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_hifi.yaml + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_hifi.yaml ) message("Generated files ${gen_command_sources}") gen_operators_lib( - LIB_NAME "cadence_ops_lib" - KERNEL_LIBS custom_ops - DEPS aten_ops_cadence) + LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence +) diff --git a/backends/cadence/hifi/operators/TARGETS b/backends/cadence/hifi/operators/TARGETS new file mode 100644 index 0000000000..67f2bab681 --- /dev/null +++ b/backends/cadence/hifi/operators/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp index 8a296307ee..79645f5381 100644 --- a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp @@ -6,19 +6,20 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" +#include namespace impl { namespace HiFi { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; void dequantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/hifi/operators/quantize_per_tensor.cpp b/backends/cadence/hifi/operators/quantize_per_tensor.cpp index aea6c1b943..e280f6bcff 100644 --- a/backends/cadence/hifi/operators/quantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/quantize_per_tensor.cpp @@ -6,21 +6,22 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" +#include namespace impl { namespace HiFi { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; // Quantize the input tensor (PT2 version). Note that quant_ are not // used in any computation. void quantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/hifi/operators/quantized_layer_norm.cpp b/backends/cadence/hifi/operators/quantized_layer_norm.cpp index 27d86e5622..3974d6ee5e 100644 --- a/backends/cadence/hifi/operators/quantized_layer_norm.cpp +++ b/backends/cadence/hifi/operators/quantized_layer_norm.cpp @@ -6,15 +6,14 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include -#include "kernels.h" - #include #include #include using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; namespace impl { namespace HiFi { @@ -76,9 +75,11 @@ void quantized_layer_norm_( for (size_t j = 0; j < last_dim; ++j) { // Since X is quantized, we dequantize it, compute fp32 result, and // quantize the result to an int8/uint8 value. - float val = kernels::dequantize(x[j], input_scale, input_zero_point); + float val = impl::HiFi::kernels::dequantize( + x[j], input_scale, input_zero_point); val = (val - mean) * inv_std * weight_data[j] + bias_data[j]; - y[j] = kernels::quantize(val, output_inv_scale, output_zero_point); + y[j] = impl::HiFi::kernels::quantize( + val, output_inv_scale, output_zero_point); } } } @@ -114,7 +115,7 @@ void quantized_layer_norm_( } void quantized_layer_norm_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& in_scale, const Tensor& in_zero_point, diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index 2fdd900008..fb186abbb1 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -6,8 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include "kernels.h" - +#include #include #include #include @@ -17,10 +16,10 @@ namespace HiFi { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; void quantized_linear_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& src, const Tensor& weight, const Tensor& bias, diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl new file mode 100644 index 0000000000..c7b24d790f --- /dev/null +++ b/backends/cadence/hifi/operators/targets.bzl @@ -0,0 +1,30 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + # Define build targets for all operators registered in the tables above. 
+ + runtime.cxx_library( + name = "cadence_hifi_ops", + srcs = glob([ + "*.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib", + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common", + "//executorch/backends/cadence/hifi/kernels:kernels", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + ) diff --git a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt index e93e0759d2..90eca6b47e 100644 --- a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt +++ b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt @@ -1,30 +1,19 @@ - cmake_minimum_required(VERSION 3.10.0) project(cadence_nnlib) - -add_custom_target( nnlib_target ALL COMMAND - make install_nnlib -f makefile -C ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build - OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/obj - LIBDIR=${CMAKE_CURRENT_BINARY_DIR}/lib - -j8 ) +add_custom_target( + nnlib_target ALL + COMMAND + make install_nnlib -f makefile -C + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build + OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/obj + LIBDIR=${CMAKE_CURRENT_BINARY_DIR}/lib -j8 +) add_library(xa_nnlib STATIC IMPORTED GLOBAL) add_dependencies(xa_nnlib nnlib_target) set_property( - TARGET xa_nnlib - PROPERTY - IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/lib/xa_nnlib.a" + TARGET xa_nnlib PROPERTY IMPORTED_LOCATION + "${CMAKE_CURRENT_BINARY_DIR}/lib/xa_nnlib.a" ) - - - - - - - - - - - diff --git a/backends/cadence/reference/kernels/CMakeLists.txt b/backends/cadence/reference/kernels/CMakeLists.txt index eadb01f54d..fba66e9b27 100644 --- a/backends/cadence/reference/kernels/CMakeLists.txt +++ b/backends/cadence/reference/kernels/CMakeLists.txt @@ -5,12 +5,6 @@ # LICENSE file in the root directory of this source tree. # lint_cmake: -linelength -add_library( - cadence_kernels - kernels.cpp -) +add_library(cadence_kernels kernels.cpp) -target_include_directories( - cadence_kernels - PUBLIC . -) +target_include_directories(cadence_kernels PUBLIC .) diff --git a/backends/cadence/reference/kernels/TARGETS b/backends/cadence/reference/kernels/TARGETS new file mode 100644 index 0000000000..67f2bab681 --- /dev/null +++ b/backends/cadence/reference/kernels/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/reference/kernels/kernels.cpp b/backends/cadence/reference/kernels/kernels.cpp index 735d390bc7..ae3e1bad2d 100644 --- a/backends/cadence/reference/kernels/kernels.cpp +++ b/backends/cadence/reference/kernels/kernels.cpp @@ -6,10 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "kernels.h" - +#include #include -#include +#include +#include namespace impl { namespace reference { diff --git a/backends/cadence/reference/kernels/targets.bzl b/backends/cadence/reference/kernels/targets.bzl new file mode 100644 index 0000000000..d3fe3fa39d --- /dev/null +++ b/backends/cadence/reference/kernels/targets.bzl @@ -0,0 +1,15 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "cadence_kernels", + srcs = ["kernels.cpp"], + exported_headers = [ + "kernels.h", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + platforms = CXX, + ) diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index c81e934850..605c43ef71 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -47,7 +47,11 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp") + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_expand_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp" +) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) @@ -55,19 +59,26 @@ target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) -target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) # Custom ops that are needed to run the test model. add_library( - custom_ops "quantized_linear_out.cpp" "quantized_conv_out.cpp" - "quantized_relu_out.cpp" "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" - "quantized_matmul_out.cpp") -target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) + custom_ops + "quantized_linear_out.cpp" + "quantized_conv_out.cpp" + "quantized_relu_out.cpp" + "quantized_layer_norm.cpp" + "quantize_per_tensor.cpp" + "dequantize_per_tensor.cpp" + "quantized_matmul_out.cpp" +) +target_include_directories( + custom_ops PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} + ${_common_include_directories} +) target_link_libraries(custom_ops PUBLIC executorch) target_link_libraries(custom_ops PRIVATE cadence_kernels) @@ -79,12 +90,11 @@ gen_selected_ops( "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions.yaml" "" "" ) generate_bindings_for_kernels( - LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML - FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions.yaml + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions.yaml ) -message("Generated files ${gen_command_sources}") +message("Generated cadence x86 files ${gen_command_sources}") gen_operators_lib( - LIB_NAME "cadence_ops_lib" - KERNEL_LIBS custom_ops - DEPS aten_ops_cadence) + LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence +) diff --git a/backends/cadence/reference/operators/TARGETS b/backends/cadence/reference/operators/TARGETS new file mode 100644 index 0000000000..67f2bab681 --- /dev/null +++ b/backends/cadence/reference/operators/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/reference/operators/dequantize_per_tensor.cpp b/backends/cadence/reference/operators/dequantize_per_tensor.cpp index 4d6a618034..9c6cf6ecc5 100644 --- a/backends/cadence/reference/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/reference/operators/dequantize_per_tensor.cpp @@ -6,19 +6,19 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" namespace impl { namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; void dequantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/reference/operators/op_add.cpp b/backends/cadence/reference/operators/op_add.cpp index 946a1ee858..89b6746760 100644 --- a/backends/cadence/reference/operators/op_add.cpp +++ b/backends/cadence/reference/operators/op_add.cpp @@ -8,7 +8,6 @@ #include #include -#include #include #include @@ -17,7 +16,7 @@ namespace executor { namespace native { Tensor& add_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& a, const Tensor& b, const Scalar& alpha, diff --git a/backends/cadence/reference/operators/op_embedding.cpp b/backends/cadence/reference/operators/op_embedding.cpp index f0b625c963..e1e4984b56 100644 --- a/backends/cadence/reference/operators/op_embedding.cpp +++ b/backends/cadence/reference/operators/op_embedding.cpp @@ -13,10 +13,10 @@ namespace executor { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; void embedding_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& weight, const Tensor& indices, int64_t padding_idx, diff --git a/backends/cadence/reference/operators/op_full.cpp b/backends/cadence/reference/operators/op_full.cpp index 75d1d51901..00be188965 100644 --- a/backends/cadence/reference/operators/op_full.cpp +++ b/backends/cadence/reference/operators/op_full.cpp @@ -17,7 +17,7 @@ using Tensor = exec_aten::Tensor; using ScalarType = exec_aten::ScalarType; Tensor& full_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const IntArrayRef sizes, const Scalar& 
fill_value, Tensor& out) { diff --git a/backends/cadence/reference/operators/op_view_copy.cpp b/backends/cadence/reference/operators/op_view_copy.cpp index a363125c37..ac0a859849 100644 --- a/backends/cadence/reference/operators/op_view_copy.cpp +++ b/backends/cadence/reference/operators/op_view_copy.cpp @@ -13,10 +13,10 @@ namespace executor { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; Tensor& view_copy_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const IntArrayRef size, Tensor& out) { diff --git a/backends/cadence/reference/operators/quantize_per_tensor.cpp b/backends/cadence/reference/operators/quantize_per_tensor.cpp index 8e25b58a07..bc200fd376 100644 --- a/backends/cadence/reference/operators/quantize_per_tensor.cpp +++ b/backends/cadence/reference/operators/quantize_per_tensor.cpp @@ -6,21 +6,21 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" namespace impl { namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; // Quantize the input tensor (PT2 version). Note that quant_ are not // used in any computation. void quantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/reference/operators/quantized_conv_out.cpp b/backends/cadence/reference/operators/quantized_conv_out.cpp index 95236b4397..47234a7cd9 100644 --- a/backends/cadence/reference/operators/quantized_conv_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_out.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include "kernels.h" +#include #include #include @@ -17,7 +17,7 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; // This implements a generic 2d conv kernel that operates on raw pointers. // The version handles both quantized and fp32 convolutions. @@ -156,7 +156,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( // quantized::conv1d or quantized::conv2d based on the dimensionality of // activation tensor. void quantized_conv_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, const Tensor& bias, diff --git a/backends/cadence/reference/operators/quantized_layer_norm.cpp b/backends/cadence/reference/operators/quantized_layer_norm.cpp index 22075f632e..a2dd644a97 100644 --- a/backends/cadence/reference/operators/quantized_layer_norm.cpp +++ b/backends/cadence/reference/operators/quantized_layer_norm.cpp @@ -6,15 +6,15 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include -#include "kernels.h" #include #include #include using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; namespace impl { namespace reference { @@ -25,7 +25,7 @@ namespace native { template void quantized_layer_norm_( const Tensor& input, - float input_scale, + double input_scale, int64_t input_zero_point, const Tensor& weight, const Tensor& bias, @@ -39,7 +39,7 @@ void quantized_layer_norm_( const float* __restrict__ weight_data = weight.const_data_ptr(); const float* __restrict__ bias_data = bias.const_data_ptr(); - float output_inv_scale = XT_RECIP_S(output_scale); + float output_inv_scale = 1.0f / output_scale; size_t last_dim = input.size(input.dim() - 1); size_t leading_dims = getLeadingDims(input, input.dim() - 1); @@ -47,15 +47,14 @@ void quantized_layer_norm_( // Visualize the input tensor as a set of 1d vectors, and compute the // layer_norm for each vector. for (size_t i = 0; i < leading_dims; ++i) { - const T* __restrict__ x = in_data + i * last_dim; - T* __restrict__ y = out_data + i * last_dim; + const T* x = in_data + i * last_dim; + T* y = out_data + i * last_dim; // compute sum and squared sum. The fp32 sum can be approximated as: // (X_1 - in_zero_point) * in_scale + (X_2 - in_zero_point) * in_scale + ... // (X_N - in_zero_point) * in_scale. int32_t sum = 0; int32_t sq_sum = last_dim * input_zero_point * input_zero_point; -#pragma simd for (size_t j = 0; j < last_dim; ++j) { int32_t val = x[j]; sum += val; @@ -64,19 +63,18 @@ void quantized_layer_norm_( sq_sum -= (2 * sum * input_zero_point); sum -= (last_dim * input_zero_point); - float mean = XT_DIV_S(XT_MUL_S(input_scale, sum), last_dim); + float mean = (input_scale * sum) / last_dim; float variance = - XT_DIV_S( - XT_MUL_S(sq_sum, XT_MUL_S(input_scale, input_scale)), last_dim) - - XT_MUL_S(mean, mean); - float inv_std = XT_RECIP_S(XT_SQRT_S(XT_ADD_S(variance, (float)eps))); + (sq_sum * input_scale * input_scale) / last_dim - mean * mean; + float inv_std = 1.0f / std::sqrt(variance + eps); // y = (x - mean) / std * kGamma + kBeta -#pragma simd - for (size_t j = 0; j < last_dim; ++j) { + for (int j = 0; j < last_dim; ++j) { + // y[j] = (x[j] - mean) / std * kGamma + kBeta; // Since X is quantized, we dequantize it, compute fp32 result, and // quantize the result to an int8/uint8 value. float val = kernels::dequantize(x[j], input_scale, input_zero_point); + val = (val - mean) * inv_std * weight_data[j] + bias_data[j]; y[j] = kernels::quantize(val, output_inv_scale, output_zero_point); } @@ -114,7 +112,7 @@ void quantized_layer_norm_( } void quantized_layer_norm_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& in_scale, const Tensor& in_zero_point, diff --git a/backends/cadence/reference/operators/quantized_linear_out.cpp b/backends/cadence/reference/operators/quantized_linear_out.cpp index fa40f16427..300158d8e5 100644 --- a/backends/cadence/reference/operators/quantized_linear_out.cpp +++ b/backends/cadence/reference/operators/quantized_linear_out.cpp @@ -6,18 +6,18 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include -#include "kernels.h" namespace impl { namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; void quantized_linear_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& src, const Tensor& weight, const Tensor& bias, diff --git a/backends/cadence/reference/operators/quantized_matmul_out.cpp b/backends/cadence/reference/operators/quantized_matmul_out.cpp index 49dd222a96..b381a8ee39 100644 --- a/backends/cadence/reference/operators/quantized_matmul_out.cpp +++ b/backends/cadence/reference/operators/quantized_matmul_out.cpp @@ -6,15 +6,15 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" namespace impl { namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; // The quantized matmul. The quantized matmul accumulates in a wider register, // whose type is TA. @@ -105,11 +105,10 @@ void inline _typed_quantized_matmul( out_dim); } } - break; } void quantized_matmul_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& X, int64_t X_zero_point, const Tensor& Y, @@ -120,7 +119,7 @@ void quantized_matmul_out( int64_t out_zero_point, bool transposed, Tensor& out) { - if (out.scalar_type() == at::ScalarType::Byte) { + if (out.scalar_type() == exec_aten::ScalarType::Byte) { _typed_quantized_matmul( X, X_zero_point, @@ -132,7 +131,7 @@ void quantized_matmul_out( out_zero_point, transposed, out); - } else if (out.scalar_type() == at::ScalarType::Char) { + } else if (out.scalar_type() == exec_aten::ScalarType::Char) { _typed_quantized_matmul( X, X_zero_point, diff --git a/backends/cadence/reference/operators/quantized_relu_out.cpp b/backends/cadence/reference/operators/quantized_relu_out.cpp index 54f6b723c6..04cb2c8833 100644 --- a/backends/cadence/reference/operators/quantized_relu_out.cpp +++ b/backends/cadence/reference/operators/quantized_relu_out.cpp @@ -6,41 +6,67 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include "kernels.h" namespace impl { namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; -// Note: this kernel assumes that the input and output share quantization -// parameters. If that is not the case, it will produce incorrect results. template void quantized_relu_( const Tensor& input, const Tensor& in_zero_point, + const int64_t out_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, Tensor& output) { T q_zero_point = in_zero_point.const_data_ptr()[0]; const T* __restrict__ in = input.const_data_ptr(); T* __restrict__ out = output.mutable_data_ptr(); + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = + -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]); + for (size_t i = 0, e = input.numel(); i < e; ++i) { - out[i] = in[i] > q_zero_point ? in[i] : q_zero_point; + const T temp = in[i] > q_zero_point ? 
(in[i] - q_zero_point) : 0; + out[i] = kernels::quantize(temp, out_scale, out_zero_point); } } void quantized_relu_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& in_zero_point, + const int64_t out_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, Tensor& output) { if (input.scalar_type() == exec_aten::ScalarType::Byte) { - quantized_relu_(input, in_zero_point, output); + quantized_relu_( + input, + in_zero_point, + out_zero_point, + out_multiplier, + out_shift, + output); } else if (input.scalar_type() == exec_aten::ScalarType::Char) { - quantized_relu_(input, in_zero_point, output); + quantized_relu_( + input, + in_zero_point, + out_zero_point, + out_multiplier, + out_shift, + output); } else { ET_CHECK_MSG(false, "Unhandled input dtype %hhd", input.scalar_type()); } diff --git a/backends/cadence/reference/operators/targets.bzl b/backends/cadence/reference/operators/targets.bzl new file mode 100644 index 0000000000..347d476239 --- /dev/null +++ b/backends/cadence/reference/operators/targets.bzl @@ -0,0 +1,20 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "cadence_cpu_ops", + srcs = glob([ + "*.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/reference/kernels:cadence_kernels", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + ) diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS new file mode 100644 index 0000000000..1b55a7d541 --- /dev/null +++ b/backends/cadence/runtime/TARGETS @@ -0,0 +1,21 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +oncall("odai_jarvis") + +python_library( + name = "runtime", + srcs = [ + "__init__.py", + "executor.py", + ] + glob([ + "xtsc-cfg/**/*", + ]), + typing = True, + deps = [ + "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/exir:lib", + ], +) diff --git a/backends/cadence/runtime/__init__.py b/backends/cadence/runtime/__init__.py new file mode 100644 index 0000000000..802e218f0d --- /dev/null +++ b/backends/cadence/runtime/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +from .executor import ( # noqa: F401 + BundledProgramManager, + BundledProgramTestData, + Executor, +) diff --git a/backends/cadence/runtime/executor.py b/backends/cadence/runtime/executor.py new file mode 100644 index 0000000000..d07b1b6a52 --- /dev/null +++ b/backends/cadence/runtime/executor.py @@ -0,0 +1,201 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
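For reviewers, a minimal usage sketch of the classes added in this file (`Executor`, `BundledProgramManager`, `BundledProgramTestData`). It is not part of the diff: `exec_prog`, `example_inputs`, and `expected_outputs` are placeholders supplied by the caller, and calling the private `_serialize` directly is only for illustration.

```python
import os

from executorch.backends.cadence.runtime.executor import (
    BundledProgramManager,
    BundledProgramTestData,
    Executor,
)


def run_bundled(exec_prog, example_inputs, expected_outputs, working_dir: str) -> None:
    # Wrap one test case for the "forward" method of the exported program.
    bptd = BundledProgramTestData(
        method="forward", inputs=example_inputs, expected_outputs=expected_outputs
    )
    manager = BundledProgramManager([bptd])
    suites = manager.get_method_test_suites()

    # Serialize the bundled program to the .bpte file that Executor.__call__
    # expects to find in its working directory.
    buffer = manager._serialize(exec_prog, suites, bptd)
    with open(os.path.join(working_dir, "CadenceDemoModel.bpte"), "wb") as f:
        f.write(buffer)

    # Builds cadence_runner via build_cadence_runner.sh, then runs it on the
    # bundled program, writing etdump.etdp and debug_output.bin to working_dir.
    Executor(working_dir)()
```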
+
+# pyre-strict
+
+
+import logging
+import os
+import selectors
+import subprocess
+import sys
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import torch
+
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+from executorch.devtools.bundled_program.core import BundledProgram
+
+from executorch.devtools.bundled_program.serialize import (
+    serialize_from_bundled_program_to_flatbuffer,
+)
+from executorch.exir import ExecutorchProgram, ExecutorchProgramManager
+
+# If quiet is True, suppress the printing of stdout and stderr output.
+quiet = False
+
+
+def _execute_subprocess(cmd: List[str], cwd: Optional[str] = None) -> Tuple[str, str]:
+    """
+    `subprocess.run(cmd, capture_output=True)` captures stdout/stderr and only
+    returns it at the end. This function not only does that, but also prints
+    stdout/stderr non-blockingly while the command runs.
+    """
+    logging.debug(f"cmd = \33[33m{cmd}\33[0m, cwd = {cwd}")
+    stdout = ""
+    stderr = ""
+
+    PIPE = subprocess.PIPE
+    with subprocess.Popen(cmd, stdout=PIPE, stderr=PIPE, cwd=cwd) as p:
+        sel = selectors.DefaultSelector()
+        # pyre-fixme[6]: For 1st argument expected `Union[HasFileno, int]` but got
+        #  `Optional[IO[bytes]]`.
+        sel.register(p.stdout, selectors.EVENT_READ)
+        # pyre-fixme[6]: For 1st argument expected `Union[HasFileno, int]` but got
+        #  `Optional[IO[bytes]]`.
+        sel.register(p.stderr, selectors.EVENT_READ)
+
+        done = False
+        while not done:
+            for key, _ in sel.select():
+                # pyre-fixme[16]: Item `HasFileno` of `Union[HasFileno, int]` has no
+                #  attribute `read1`.
+                data = key.fileobj.read1().decode()
+                if not data:
+                    done = True
+                    break
+
+                if key.fileobj is p.stdout:
+                    if not quiet:
+                        print(data, end="")
+                    stdout += data
+                else:
+                    if not quiet:
+                        print(data, end="", file=sys.stderr)
+                    stderr += data
+
+        # flush stdout and stderr in case there's no newline character at the end
+        # from the subprocess
+        sys.stdout.flush()
+        sys.stderr.flush()
+
+    if p.returncode != 0:
+        raise subprocess.CalledProcessError(p.returncode, p.args, stdout, stderr)
+
+    return stdout, stderr
+
+
+def execute(args: List[str]) -> Tuple[str, str]:
+    """
+    Run the command described by args (the same way subprocess.run does), either
+    as a local execution (through subprocess) or a remote execution (in Hargow).
+    Ex: to run "ls -al", pass args = ["ls", "-al"].
+    """
+    # `import torch` will mess up PYTHONPATH. Delete the messed-up PYTHONPATH.
+    if "PYTHONPATH" in os.environ:
+        del os.environ["PYTHONPATH"]
+
+    try:
+        return _execute_subprocess(args)
+    except subprocess.CalledProcessError as e:
+        fdb_cmd = f"fdb {' '.join(e.cmd)}"
+        raise RuntimeError(
+            f"Failed to execute. Use the following to debug:\n{fdb_cmd}"
+        ) from e
+
+
+class Executor:
+    # pyre-fixme[3]: Return type must be annotated.
+ def __init__( + self, + working_dir: str = "", + ): + self.working_dir = working_dir + self.executor_builder = "./backends/cadence/build_cadence_runner.sh" + self.execute_runner = "./cmake-out/backends/cadence/cadence_runner" + self.bundled_program_path: str = "CadenceDemoModel.bpte" + + def __call__(self) -> None: + # build executor + args = self.get_bash_command(self.executor_builder) + logging.info(f"\33[33m{' '.join(args)}\33[0m") + execute(args) + + # run executor + cmd_args = { + "bundled_program_path": os.path.join( + self.working_dir, self.bundled_program_path + ), + "etdump_path": os.path.join(self.working_dir, "etdump.etdp"), + "debug_output_path": os.path.join(self.working_dir, "debug_output.bin"), + } + args = self.get_bash_command(self.execute_runner, cmd_args) + logging.info(f"\33[33m{' '.join(args)}\33[0m") + execute(args) + + @staticmethod + def get_bash_command( + executable: str, + cmd_args: Optional[Dict[str, str]] = None, + ) -> List[str]: + # go through buck config and turn the dict into a list of "{key}=={value}" + if cmd_args is None: + cmd_args = {} + + cmd_args_strs = [] + for key, value in cmd_args.items(): + cmd_args_strs.extend([f"--{key}={value}"]) + + return [executable] + cmd_args_strs + + +@dataclass +class BundledProgramTestData: + method: str + inputs: Sequence[Union[bool, float, int, torch.Tensor]] + expected_outputs: Sequence[torch.Tensor] + testset_idx: int = 0 # There is only one testset in the bundled program + + +class BundledProgramManager: + """ + Stateful bundled program object + Takes a BundledProgramTestData and generates a bundled program + """ + + def __init__(self, bundled_program_test_data: List[BundledProgramTestData]) -> None: + self.bundled_program_test_data: List[BundledProgramTestData] = ( + bundled_program_test_data + ) + + @staticmethod + # pyre-fixme[2]: Parameter `**args` has no type specified. + def bundled_program_test_data_gen(**args) -> BundledProgramTestData: + return BundledProgramTestData(**args) + + def get_method_test_suites(self) -> List[MethodTestSuite]: + return [ + self._gen_method_test_suite(bptd) for bptd in self.bundled_program_test_data + ] + + def _gen_method_test_suite(self, bptd: BundledProgramTestData) -> MethodTestSuite: + method_test_case = MethodTestCase( + inputs=bptd.inputs, + expected_outputs=bptd.expected_outputs, + ) + return MethodTestSuite( + method_name=bptd.method, + test_cases=[method_test_case], + ) + + def _serialize( + self, + executorch_program: Union[ + ExecutorchProgram, + ExecutorchProgramManager, + ], + method_test_suites: Sequence[MethodTestSuite], + bptd: BundledProgramTestData, + ) -> bytes: + bundled_program = BundledProgram( + executorch_program=executorch_program, method_test_suites=method_test_suites + ) + bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( + bundled_program + ) + return bundled_program_buffer diff --git a/backends/cadence/runtime/executor_main.sh b/backends/cadence/runtime/executor_main.sh new file mode 100644 index 0000000000..7d6cba09b8 --- /dev/null +++ b/backends/cadence/runtime/executor_main.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Test the end-to-end flow of building devtools/example_runner and use it to run +# an actual model. 
+ + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../../.ci/scripts/utils.sh" + +cmake_install_executorch_devtools_lib() { + echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" + rm -rf cmake-out + + retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config Release +} + +test_cmake_devtools_example_runner() { + local example_dir=examples/devtools + local build_dir=cmake-out/${example_dir} + CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + rm -rf ${build_dir} + retry cmake \ + -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ + -DCMAKE_BUILD_TYPE=Release \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -B${build_dir} \ + ${example_dir} + + echo "Building ${example_dir}" + cmake --build ${build_dir} -j9 --config Release + + echo 'Running devtools/example_runner' + ${build_dir}/example_runner --bundled_program_path="./CadenceDemoModel.bpte" +} + +if [[ -z $PYTHON_EXECUTABLE ]]; +then + PYTHON_EXECUTABLE=python3 +fi + +if [[ -z $BUCK ]]; +then + BUCK=buck2 +fi + +cmake_install_executorch_devtools_lib +test_cmake_devtools_example_runner diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py new file mode 100644 index 0000000000..33bb20719c --- /dev/null +++ b/backends/cadence/runtime/runtime.py @@ -0,0 +1,241 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
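A short, hypothetical end-to-end sketch of the `run()`/`compare()`/`run_and_compare()` helpers defined later in this file. `executorch_prog` is assumed to be an `ExecutorchProgramManager`, `eager_module` and `inputs` are placeholders, and producing reference outputs by calling the eager module is the caller's choice, not something mandated by this diff.

```python
import torch

from executorch.backends.cadence.runtime import runtime


def check_against_eager(eager_module: torch.nn.Module, executorch_prog, inputs) -> None:
    # Reference outputs come from running the original eager module on the
    # same inputs; keep the same output structure (tensor vs. tuple) so that
    # compare() can walk both pytrees in lockstep.
    ref_outputs = eager_module(*inputs)

    # Runs the program on the Cadence runner, parses the resulting etdump,
    # and raises if the RMS error against ref_outputs exceeds eps_error.
    runtime.run_and_compare(
        executorch_prog,
        inputs,
        ref_outputs=ref_outputs,
        eps_error=1e-1,
        eps_warn=1e-5,
    )
```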
+ + +import logging +import numbers +import os +import tempfile +from typing import Any, Optional, Sequence, Tuple, Union + +import executorch.exir.schema as et_schema + +import numpy as np +import torch + +from executorch.backends.cadence.runtime import utils +from executorch.backends.cadence.runtime.executor import Executor +from executorch.devtools import Inspector +from executorch.exir import ExecutorchProgramManager +from executorch.exir._serialize._program import deserialize_pte_binary +from executorch.exir.schema import DataLocation + +from numpy import ndarray + +from torch.utils._pytree import TreeSpec + + +class JarvisETDump: + def __init__(self, output_dir: str) -> None: + self.tensor_dump_dir: str = os.path.join(output_dir, "tensors") + self.etdump_path: str = os.path.join(output_dir, "etdump.etdp") + self.etrecord_path: Optional[str] = os.path.join(output_dir, "etrecord.bin") + self.debug_buffer_path: Optional[str] = os.path.join( + output_dir, "debug_output.bin" + ) + + if not os.path.exists(self.etdump_path): + raise RuntimeError(f"{self.etdump_path} does not exist") + # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] + if not os.path.exists(self.etrecord_path): + logging.warning( + "ETRecord not found, intermediate tensors will not be dumped" + ) + self.etrecord_path = None + # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] + if not os.path.exists(self.debug_buffer_path): + logging.warning( + "Debug buffer not found, intermediate tensors will not be dumped" + ) + self.debug_buffer_path = None + + self.et_inspector: Inspector = Inspector( + etdump_path=self.etdump_path, + debug_buffer_path=self.debug_buffer_path, + etrecord=self.etrecord_path, + ) + + def get_outputs(self, log_to_stdout: bool = False) -> Tuple[torch.Tensor]: + output = [ + event_block.run_output + for event_block in self.et_inspector.event_blocks + if event_block.name == "Execute" + ] + logging.debug(f"[Jarvis][ETdump] output: {output}") + return output[0] + + def print_event_block(self) -> None: + logging.debug("[Jarvis][ETdump] data tabular:") + if logging.getLogger().level <= logging.DEBUG: + self.et_inspector.print_data_tabular() + + def print_event_data(self) -> None: + logging.debug("[Jarvis][ETdump] event data ") + for event_block in self.et_inspector.event_blocks: + for event in event_block.events: + logging.debug(event) + + def dump_intermediate_tensors(self) -> None: + if self.etrecord_path is None: + logging.info("[Jarvis][ETdump] Intermediate tensors not available") + return + + logging.info( + f"[Jarvis][ETdump] Dumping intermediate tensors to {self.tensor_dump_dir}" + ) + os.makedirs(self.tensor_dump_dir, exist_ok=True) + exec_blocks = [ + eb for eb in self.et_inspector.event_blocks if eb.name == "Execute" + ] + if len(exec_blocks) > 1: + logging.warning( + f'Found {len(exec_blocks)} "Execute" blocks, using the first one and ignoring the rest.' + ) + block = exec_blocks[0] + + # OPERATOR_CALL events are duplicates that contain framework tax data. 
We don't need them + op_events = [e for e in block.events if e.name != "OPERATOR_CALL"] + torch.set_printoptions(profile="full") + + for event in op_events: + instr_id = event._instruction_id + if not event.debug_data: + logging.debug( + f"Missing intermediate tensor data for {event.name} ({instr_id=})" + ) + continue + + with open(f"{self.tensor_dump_dir}/{instr_id}.txt", "w") as f: + for dd in event.debug_data: + f.write(f"{str(dd)}\n\n") + torch.set_printoptions(profile="default") + + +def get_op_names(program: et_schema.Program, execution_plan_id: int = 0) -> set[str]: + """ + Get the list of operators from a Program + """ + + op_names = { + f"{op.name}.{op.overload}" + for op in program.execution_plan[execution_plan_id].operators + } + for delegate in program.execution_plan[execution_plan_id].delegates: + logging.debug(f"Delegate: {delegate.id}") + if delegate.id == "CadenceExecutorchBackend": + assert delegate.processed.location == DataLocation.INLINE + op_names |= get_op_names( + deserialize_pte_binary( + program.backend_delegate_data[delegate.processed.index].data + ) + ) + return op_names + + +# Run an ExecutorchProgram using the specified inputs and backend +def run( + executorch_prog: ExecutorchProgramManager, + inputs: Any, + ref_outputs: Optional[Sequence[torch.Tensor]] = None, + working_dir: Optional[str] = None, +) -> Any: + # Get the Program + program = executorch_prog.executorch_program + out_spec = executorch_prog.exported_program().call_spec.out_spec + # Run the program and return the outputs + assert isinstance( + program, et_schema.Program + ), f"program must be Program. Got {type(program)} instead." + + if working_dir is None: + working_dir = tempfile.mkdtemp(dir="/tmp") + + # initialize Jarvis e2e Executor with executorch_cfg. + executor = Executor(working_dir) + + # run Executor + executor() + + etdump = JarvisETDump(output_dir=working_dir) + outputs = etdump.get_outputs() + + assert isinstance(out_spec, TreeSpec) + outputs = torch.utils._pytree.tree_unflatten(outputs, out_spec) + + return outputs + + +def compare( + # pyre-fixme[2]: Parameter annotation cannot be `Any`. + outputs: Any, + # pyre-fixme[2]: Parameter annotation cannot be `Any`. + ref_outputs: Any, + name: str = "", + eps_error: float = 1e-1, + eps_warn: float = 1e-5, +) -> None: + if isinstance(ref_outputs, dict): + for k, v in outputs.items(): + compare(v, ref_outputs[k], f"{name}/{k}", eps_error, eps_warn) + return + + if isinstance(ref_outputs, (list, tuple)): + for i, (output, ref_output) in enumerate(zip(outputs, ref_outputs)): + compare(output, ref_output, f"{name}/{i}", eps_error, eps_warn) + return + + assert isinstance(ref_outputs, torch.Tensor), f"Got {type(ref_outputs)} instead." + + ref_outputs = to_nd_array(ref_outputs) + outputs = to_nd_array(outputs) + + # compare + rms = utils.rms(outputs, ref_outputs) + norm_rms = utils.normalized_rms(outputs, ref_outputs) + max_abs_diff = utils.max_abs_diff(outputs, ref_outputs) + max_rel_diff = utils.max_rel_diff(outputs, ref_outputs) + stats = ( + f"{rms = }, {norm_rms = }, {max_abs_diff = }, {max_rel_diff = :.2f}%, " + f"{outputs.shape = }[{outputs.dtype}], {ref_outputs.shape = }[{ref_outputs.dtype}]" + ) + + if np.isnan(rms) or rms > eps_error: + logging.error(f"\33[31m[Error]\33[0m Output {name} mismatched! {stats}") + logging.error(f"Expected: {ref_outputs}\n") + logging.error(f"Got instead: {outputs}\n") + raise RuntimeError(f"\33[31m[Error]\33[0m Output {name} mismatched! 
{stats}") + elif rms > eps_warn: + logging.warning(f"\33[33m[Warning]\33[0m Output {name} mismatched!. {stats}") + else: + logging.info(f"\33[32m[Passed]\33[0m Output {name} matched. {stats}") + + +def run_and_compare( + executorch_prog: ExecutorchProgramManager, + inputs: Any, + ref_outputs: Optional[Sequence[torch.Tensor]] = None, + working_dir: Optional[str] = None, + eps_error: float = 1e-1, + eps_warn: float = 1e-5, +) -> Any: + outputs = run(executorch_prog, inputs, ref_outputs, working_dir) + compare(outputs, ref_outputs, eps_error=eps_error, eps_warn=eps_warn) + + +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def to_nd_array(v: Union[bool, numbers.Number, ndarray, torch.Tensor]) -> np.ndarray: + if isinstance(v, np.ndarray): + return v + + if isinstance(v, torch.Tensor): + # If v was quantized, we compare its int representation. + v = v.int_repr() if v.is_quantized else v + return v.cpu().detach().numpy() + + if isinstance(v, (numbers.Number, bool)): + return np.array([v]) + + raise RuntimeError(f"Unknown type {type(v)}") diff --git a/backends/cadence/runtime/utils.py b/backends/cadence/runtime/utils.py new file mode 100644 index 0000000000..b3ed622e8b --- /dev/null +++ b/backends/cadence/runtime/utils.py @@ -0,0 +1,108 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import logging +import typing +from typing import Callable, Union + +import numpy as np +import torch + + +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def distance(fn: Callable[[np.ndarray, np.ndarray], float]) -> Callable[ + [ + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + typing.Union[np.ndarray, torch._tensor.Tensor], + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + typing.Union[np.ndarray, torch._tensor.Tensor], + ], + float, +]: + # A distance decorator that performs all the necessary checkes before calculating + # the distance between two N-D tensors given a function. This can be a RMS + # function, maximum abs diff, or any kind of distance function. + def wrapper( + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + a: Union[np.ndarray, torch.Tensor], + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + b: Union[np.ndarray, torch.Tensor], + ) -> float: + # convert a and b to np.ndarray type fp64 + a = to_np_arr_fp64(a) + b = to_np_arr_fp64(b) + + # return NaN if shape mismatches + if a.shape != b.shape: + return np.nan + + # After we make sure shape matches, check if it's empty. If yes, return 0 + if a.size == 0: + return 0 + + # np.isinf and np.isnan returns a Boolean mask. Check if Inf or NaN occur at + # the same places in a and b. If not, return NaN + if np.any(np.isinf(a) != np.isinf(b)) or np.any(np.isnan(a) != np.isnan(b)): + return np.nan + + # mask out all the values that are either Inf or NaN + mask = np.isinf(a) | np.isnan(a) + if np.any(mask): + logging.warning("Found inf/nan in tensor when calculating the distance") + + a_masked = a[~mask] + b_masked = b[~mask] + + # after masking, the resulting tensor might be empty. 
If yes, return 0 + if a_masked.size == 0: + return 0 + + # only compare the rest (those that are actually numbers) using the metric + return fn(a_masked, b_masked) + + return wrapper + + +@distance +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def rms(a: np.ndarray, b: np.ndarray) -> float: + return ((a - b) ** 2).mean() ** 0.5 + + +@distance +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def max_abs_diff(a: np.ndarray, b: np.ndarray) -> float: + return np.abs(a - b).max() + + +@distance +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def max_rel_diff(x: np.ndarray, x_ref: np.ndarray) -> float: + return np.abs((x - x_ref) / x_ref).max() + + +# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. +def to_np_arr_fp64(x: Union[np.ndarray, torch.Tensor]) -> np.ndarray: + if isinstance(x, torch.Tensor): + x = x.detach().cpu().numpy() + if isinstance(x, np.ndarray): + x = x.astype(np.float64) + return x + + +# pyre-fixme[3]: Return type must be annotated. +def normalized_rms( + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + predicted: Union[np.ndarray, torch.Tensor], + # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. + ground_truth: Union[np.ndarray, torch.Tensor], +): + num = rms(predicted, ground_truth) + if num == 0: + return 0 + den = np.linalg.norm(to_np_arr_fp64(ground_truth)) + return np.float64(num) / np.float64(den) diff --git a/backends/example/test_example_delegate.py b/backends/example/test_example_delegate.py index 973b457bad..d830c1bb31 100644 --- a/backends/example/test_example_delegate.py +++ b/backends/example/test_example_delegate.py @@ -46,7 +46,7 @@ def get_example_inputs(): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() # print("original model:", m) quantizer = ExampleQuantizer() # quantizer = XNNPACKQuantizer() @@ -82,7 +82,7 @@ def test_delegate_mobilenet_v2(self): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() quantizer = ExampleQuantizer() m = prepare_pt2e(m, quantizer) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt new file mode 100644 index 0000000000..744b1193d5 --- /dev/null +++ b/backends/mediatek/CMakeLists.txt @@ -0,0 +1,49 @@ +#[[ +/* +* Copyright (c) 2024 MediaTek Inc. +* +* Licensed under the BSD License (the "License"); you may not use this file +* except in compliance with the License. See the license file in the root +* directory of this source tree for more details. +*/ +]] + +# Let include directory as "executorch/..." +set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
+set(NEURON_BUFFER_ALLOCATOR_LIB + "" + CACHE PATH "Path to Neuron Buffer Allocator library" +) +message( + STATUS "Looking for neuron_buffer_allocator in ${NEURON_BUFFER_ALLOCATOR_LIB}" +) + +include_directories(BEFORE ${_common_include_directories}) + +# shortcut include directory for neuron headers +include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) + +# targets +add_library(neuron_backend SHARED) +target_link_libraries(neuron_backend + PRIVATE + executorch_no_prim_ops + portable_ops_lib + android + log + ${NEURON_BUFFER_ALLOCATOR_LIB} +) +target_sources( + neuron_backend + INTERFACE ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBackend.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBufferAllocator.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronExecutor.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronLog.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/APUWareUtilsLib.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/NeuronAdapterShim.h + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronBackend.cpp + ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronExecutor.cpp +) +target_link_options_shared_lib(neuron_backend) + +install(TARGETS neuron_backend DESTINATION lib) diff --git a/backends/mediatek/__init__.py b/backends/mediatek/__init__.py new file mode 100644 index 0000000000..d95b85eabc --- /dev/null +++ b/backends/mediatek/__init__.py @@ -0,0 +1,5 @@ +from .partitioner import NeuropilotPartitioner +from .preprocess import NeuropilotBackend +from .quantizer import NeuropilotQuantizer, Precision + +__all__ = [NeuropilotBackend, NeuropilotPartitioner, NeuropilotQuantizer, Precision] diff --git a/backends/mediatek/partitioner.py b/backends/mediatek/partitioner.py new file mode 100644 index 0000000000..056336d606 --- /dev/null +++ b/backends/mediatek/partitioner.py @@ -0,0 +1,101 @@ +# Copyright (c) 2024 MediaTek Inc. +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file in the root +# directory of this source tree for more details. + +from typing import Callable, final, List, Optional, Tuple + +import torch +from executorch.backends.mediatek.preprocess import NeuropilotBackend +from executorch.exir.backend.backend_details import CompileSpec +from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, +) +from executorch.exir.backend.utils import tag_constant_data + +from mtk_converter.python.converters.pytorch import importer_v2 +from torch.export.exported_program import ExportedProgram +from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner +from torch.fx.passes.operator_support import OperatorSupportBase + + +class NeuropilotOperatorsSupport(OperatorSupportBase): + + def __init__( + self, + op_types_to_skip: Optional[set] = None, + op_names_to_skip: Optional[set] = None, + ) -> None: + if op_types_to_skip is None: + op_types_to_skip = set() + if op_names_to_skip is None: + op_names_to_skip = set() + + self._op_types_to_skip = op_types_to_skip + self._op_names_to_skip = op_names_to_skip + + def is_node_supported(self, _, node: torch.fx.Node) -> bool: + # Handle 'call_function' only cause 'placeholder' and 'output' cannot be tagged. 
+ # Ref: https://github.com/pytorch/executorch/pull/1398 + if node.op != "call_function": + return False + + op_type = node.target.__name__ + if op_type in self._op_types_to_skip or node.name in self._op_names_to_skip: + print( + f"[Neuropilot Backend] The {op_type} operator with name '{node.name}' is skipped." + ) + return False + + return importer_v2.is_fx_node_supported(node) + + +@final +class NeuropilotPartitioner(Partitioner): + + def __init__( + self, + compile_spec: List[CompileSpec], + op_types_to_skip: Optional[set] = None, + op_names_to_skip: Optional[set] = None, + ) -> None: + self.delegation_spec = DelegationSpec(NeuropilotBackend.__name__, compile_spec) + self._op_types_to_skip = op_types_to_skip + self._op_names_to_skip = op_names_to_skip + + def ops_to_not_decompose( + self, + ep: ExportedProgram, + ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: + ops_not_decompose = [ + torch.ops.aten.pixel_shuffle.default, + torch.ops.aten.upsample_bilinear2d.default, + torch.ops.aten.upsample_bilinear2d.vec, + torch.ops.aten.upsample_nearest2d.default, + torch.ops.aten.upsample_nearest2d.vec, + ] + return (ops_not_decompose, None) + + def partition(self, exported_program: ExportedProgram) -> PartitionResult: + capability_partitioner = CapabilityBasedPartitioner( + exported_program.graph_module, + NeuropilotOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip), + allows_single_node_partition=True, + ) + partition_list = capability_partitioner.propose_partitions() + + partition_tags = {} + for partition in partition_list: + for node in partition.nodes: + tag = f"tag{partition.id}" + node.meta["delegation_tag"] = tag + partition_tags[tag] = self.delegation_spec + + tag_constant_data(exported_program) + + return PartitionResult( + tagged_exported_program=exported_program, partition_tags=partition_tags + ) diff --git a/backends/mediatek/passes/__init__.py b/backends/mediatek/passes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/backends/mediatek/passes/decompose_scaled_dot_product_attention.py b/backends/mediatek/passes/decompose_scaled_dot_product_attention.py new file mode 100644 index 0000000000..9ce9e8faa6 --- /dev/null +++ b/backends/mediatek/passes/decompose_scaled_dot_product_attention.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024 MediaTek Inc. +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file in the root +# directory of this source tree for more details. 
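A hypothetical sketch of applying the pass defined below to a captured module. `model` and `example_inputs` are placeholders; capturing via `torch.export.export_for_training` mirrors the capture API used elsewhere in this PR and calling `.call()` directly (rather than going through a pass manager) is only for illustration.

```python
import torch

from executorch.backends.mediatek.passes.decompose_scaled_dot_product_attention import (
    DecomposeScaledDotProductAttention,
)


def decompose_sdpa(model: torch.nn.Module, example_inputs: tuple) -> torch.fx.GraphModule:
    # Capture the model into an FX graph, then rewrite every
    # aten.scaled_dot_product_attention node into its decomposed subgraph.
    gm = torch.export.export_for_training(model, example_inputs).module()
    result = DecomposeScaledDotProductAttention().call(gm)
    return result.graph_module
```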
+ +import torch + +from executorch.exir.pass_base import ExportPass, PassResult +from torch._decomp import get_decompositions +from torch.fx import Graph +from torch.fx.experimental.proxy_tensor import make_fx + + +def _get_input_node_names(graph: Graph): + input_names = [] + for node in graph.nodes: + if node.op == "placeholder": + input_names.append(node.name) + return input_names + + +class DecomposeScaledDotProductAttention(ExportPass): + """Decompose the single SDPA operator.""" + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + for node in graph.nodes: + if node.target != torch.ops.aten.scaled_dot_product_attention.default: + continue + + decom_mappings = get_decompositions( + [torch.ops.aten._scaled_dot_product_flash_attention_for_cpu.default] + ) + input_tensors = (arg.meta["val"] for arg in node.args) + decomposed_module = make_fx(node.target, decom_mappings, "fake", True)( + *input_tensors + ) + decomposed_input_names = _get_input_node_names(decomposed_module.graph) + with graph.inserting_before(node): + name_to_input_tensor_map = {} + for idx, arg in enumerate(node.args): + name_to_input_tensor_map[decomposed_input_names[idx]] = arg + + decomposed_node_to_subgraph_node = {} + for decomposed_node in decomposed_module.graph.nodes: + if decomposed_node.op == "placeholder": + decomposed_node_to_subgraph_node[decomposed_node] = ( + name_to_input_tensor_map[decomposed_node.name] + ) + + # Copy node from decompose graph module + for decomposed_node in decomposed_module.graph.nodes: + if decomposed_node.op == "placeholder": + continue + if decomposed_node.op == "output": + for user in node.users.copy(): + new_node = decomposed_node_to_subgraph_node[ + decomposed_node.args[0] + ] + user.replace_input_with(node, new_node) + continue + + subgraph_node = graph.node_copy( + decomposed_node, + arg_transform=lambda x, d=decomposed_node_to_subgraph_node: d[ + x + ], + ) + subgraph_node.meta["source_fn_stack"] = [ + (subgraph_node, subgraph_node.target) + ] + decomposed_node_to_subgraph_node[decomposed_node] = subgraph_node + + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/mediatek/preprocess.py b/backends/mediatek/preprocess.py new file mode 100644 index 0000000000..92a1e196ad --- /dev/null +++ b/backends/mediatek/preprocess.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024 MediaTek Inc. +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file in the root +# directory of this source tree for more details. 
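As a quick illustration of the DecomposeScaledDotProductAttention pass just added (it is normally invoked by NeuropilotQuantizer.transform_for_annotation further below), here is a hedged sketch of running it standalone; the toy attention module and tensor shapes are assumptions made for this example, not part of the change.

import torch
from executorch.backends.mediatek.passes.decompose_scaled_dot_product_attention import (
    DecomposeScaledDotProductAttention,
)

class Attention(torch.nn.Module):
    def forward(self, q, k, v):
        return torch.nn.functional.scaled_dot_product_attention(q, k, v)

qkv = tuple(torch.randn(1, 4, 16, 8) for _ in range(3))
graph_module = torch.export.export(Attention(), qkv).module()

# The pass is a regular ExportPass: calling it returns a PassResult whose
# graph_module has any single SDPA node replaced by its decomposed subgraph.
graph_module = DecomposeScaledDotProductAttention()(graph_module).graph_module
assert not any(
    node.target == torch.ops.aten.scaled_dot_product_attention.default
    for node in graph_module.graph.nodes
)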
+ +import contextlib +import struct + +from typing import final, List + +import mtk_converter +import mtk_neuron +import torch +from executorch.exir.backend.backend_details import ( + BackendDetails, + ExportedProgram, + PreprocessResult, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec + +SKIP_COMPILE_SPEC_KEYS = {"ImportForever"} + + +@final +class NeuropilotBackend(BackendDetails): + + @classmethod + def preprocess( + cls, edge_program: ExportedProgram, module_compile_spec: List[CompileSpec] + ) -> PreprocessResult: + + name_to_node_mappings = {node.name: node for node in edge_program.graph.nodes} + input_names = edge_program.graph_signature.user_inputs + output_names = edge_program.graph_signature.user_outputs + fp_input_indices = [ + idx + for idx, name in enumerate(input_names) + if name_to_node_mappings[name].meta["val"].dtype == torch.float32 + ] + fp_output_indices = [ + idx + for idx, name in enumerate(output_names) + if name_to_node_mappings[name].meta["val"].dtype == torch.float32 + ] + + # This default compile options are only for mt6989 SOC + compile_options = ["--arch=mdla5.1,edpa1.0", "--relax-fp32", "--opt=3"] + for spec in module_compile_spec: + if spec.key in SKIP_COMPILE_SPEC_KEYS: + continue + if spec.value == b"": + compile_options.append(f"--{spec.key}") + else: + value = spec.value.decode("utf-8") + compile_options.append(f"--{spec.key}={value}") + + converter = mtk_converter.PyTorchV2Converter.from_exported_program(edge_program) + converter.quantize = True + converter.input_quantization_bitwidths = None + converter.allow_missing_quantization_ranges = True + converter.prepend_input_quantize_ops = True + converter.prepend_input_quantize_ops_indices = fp_input_indices + converter.append_output_dequantize_ops = True + converter.append_output_dequantize_ops_indices = fp_output_indices + with contextlib.redirect_stdout(None): + mlir_str = converter.convert_to_mlir() + model_bytes = mtk_neuron.compile(mlir_str, " ".join(compile_options)) + + num_inputs = len(input_names) + num_outputs = len(output_names) + header = struct.pack(" None: + # Pattern annotation + _annotate_rmsnorm_pattern(graph, quant_config) + _annotate_fused_activation_pattern(graph, quant_config) + + # Per-op annotation + for node in graph.nodes: + if node.op == "placeholder": + annotate_placeholder(node, quant_config) + elif node.op == "call_function": + annotate_func = OP_TO_ANNOTATOR.get(node.target, None) + if annotate_func is not None: + annotate_func(node, quant_config) + + +def register_annotator(ops: List[OpOverload]): + + def decorator(annotator_fn: Callable): + for op in ops: + OP_TO_ANNOTATOR[op] = annotator_fn + + return decorator + + +def _is_annotated(node: Node): + """ + Given a list of nodes (that represents an operator pattern), + return True if any of the node + is annotated, otherwise return False + """ + KEY = "quantization_annotation" + return KEY in node.meta and node.meta[KEY]._annotated + + +def _mark_as_annotated(nodes: List[Node]): + KEY = "quantization_annotation" + for node in nodes: + if KEY not in node.meta: + node.meta[KEY] = QuantizationAnnotation() + node.meta[KEY]._annotated = True + + +def _is_float_activation_tensor(node: Node): + if not isinstance(node, Node): + return False + if "val" not in node.meta: + return False + if not isinstance(node.meta["val"], FakeTensor): + return False + return node.meta["val"].dtype == torch.float32 + + +def _annotate_fused_activation_pattern( + graph: Graph, quant_config: QuantizationConfig +) -> None: + for 
relu_node in graph.nodes: + # Check relu/relu6 node + if relu_node.op != "call_function": + continue + if relu_node.target not in [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + torch.ops.aten.relu6.default, + ]: + continue + + producer_node = relu_node.args[0] + if not isinstance(producer_node, Node): + continue + if producer_node.op != "call_function": + continue + if len(producer_node.users) != 1: + continue + + # Handle affine + relu fusion + if producer_node.target in [ + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + torch.ops.aten.linear.default, + ]: + weight_node = producer_node.args[1] + _annotate_input_qspec_map( + producer_node, + weight_node, + quant_config.weight, + ) + _annotate_output_qspec(relu_node, quant_config.activation) + _mark_as_annotated([producer_node, weight_node, relu_node]) + continue + + # Handle arithmetic + relu fusion + if producer_node.target in [ + torch.ops.aten.add.Scalar, + torch.ops.aten.add.Tensor, + torch.ops.aten.add_.Scalar, + torch.ops.aten.add_.Tensor, + torch.ops.aten.div.Scalar, + torch.ops.aten.div.Tensor, + torch.ops.aten.div_.Scalar, + torch.ops.aten.div_.Tensor, + torch.ops.aten.divide.Scalar, + torch.ops.aten.divide.Tensor, + torch.ops.aten.mul.Scalar, + torch.ops.aten.mul.Tensor, + torch.ops.aten.mul_.Scalar, + torch.ops.aten.mul_.Tensor, + torch.ops.aten.rsub.Scalar, + torch.ops.aten.rsub.Tensor, + torch.ops.aten.sub.Scalar, + torch.ops.aten.sub.Tensor, + torch.ops.aten.sub_.Scalar, + torch.ops.aten.sub_.Tensor, + ]: + _annotate_output_qspec(relu_node, quant_config.activation) + _mark_as_annotated([producer_node, relu_node]) + continue + + +def _annotate_rmsnorm_pattern(graph: Graph, quant_config: QuantizationConfig) -> None: + + class ExecuTorchPattern(torch.nn.Module): + def forward(self, x): + norm = x * torch.rsqrt((x * x).mean(-1, keepdim=True) + 1e-6) + return norm, {} + + class MTKPattern(torch.nn.Module): + def forward(self, x): + norm = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6) + return norm, {} + + for pattern_cls in (ExecuTorchPattern, MTKPattern): + pattern_gm = capture_pre_autograd_graph(pattern_cls(), (torch.randn(3, 3),)) + matcher = SubgraphMatcherWithNameNodeMap( + pattern_gm, ignore_literals=True, remove_overlapping_matches=False + ) + matches = matcher.match(graph) + for match in matches: + target_nodes = [] + for node in match.nodes_map.values(): + if node in match.placeholder_nodes: + continue + if node.op == "call_function" and node.target in OP_TO_ANNOTATOR: + target_nodes.append(node) + + if any(_is_annotated(node) for node in target_nodes): + continue + _mark_as_annotated(target_nodes) + for node in match.returning_nodes: + _annotate_output_qspec(node, quant_config.activation) + + +def annotate_placeholder(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated(node): + return + + if _is_float_activation_tensor(node): + _annotate_output_qspec(node, quant_config.activation) + + _mark_as_annotated([node]) + + +@register_annotator( + [ + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + torch.ops.aten.linear.default, + ] +) +def annotate_affine_ops(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated(node): + return + + weight_node = node.args[1] + _annotate_input_qspec_map( + node, + weight_node, + quant_config.weight, + ) + _annotate_output_qspec(node, quant_config.activation) + + # Make weight as annotated because it is a constant node + _mark_as_annotated([node, weight_node]) + + +@register_annotator( + [ 
+ torch.ops.aten.add.Scalar, + torch.ops.aten.add.Tensor, + torch.ops.aten.add_.Scalar, + torch.ops.aten.add_.Tensor, + torch.ops.aten.bmm.default, + torch.ops.aten.div.Scalar, + torch.ops.aten.div.Tensor, + torch.ops.aten.div_.Scalar, + torch.ops.aten.div_.Tensor, + torch.ops.aten.divide.Scalar, + torch.ops.aten.divide.Tensor, + torch.ops.aten.gelu.default, + torch.ops.aten.group_norm.default, + torch.ops.aten.layer_norm.default, + torch.ops.aten.leaky_relu.default, + torch.ops.aten.matmul.default, + torch.ops.aten.mul.Scalar, + torch.ops.aten.mul.Tensor, + torch.ops.aten.mul_.Scalar, + torch.ops.aten.mul_.Tensor, + torch.ops.aten.pow.Scalar, + torch.ops.aten.pow.Tensor_Scalar, + torch.ops.aten.pow.Tensor_Tensor, + torch.ops.aten.prelu.default, + torch.ops.aten.rsub.Scalar, + torch.ops.aten.rsub.Tensor, + torch.ops.aten.silu.default, + torch.ops.aten.sub.Scalar, + torch.ops.aten.sub.Tensor, + torch.ops.aten.sub_.Scalar, + torch.ops.aten.sub_.Tensor, + ] +) +def annotate_output_qspec(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated(node): + return + _annotate_output_qspec(node, quant_config.activation) + _mark_as_annotated([node]) + + +@register_annotator([torch.ops.aten.embedding.default]) +def annotate_embedding_op(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated(node): + return + + wgt_node = node.args[0] + _annotate_input_qspec_map(node, wgt_node, quant_config.activation) + _mark_as_annotated([node]) diff --git a/backends/mediatek/quantizer/qconfig.py b/backends/mediatek/quantizer/qconfig.py new file mode 100644 index 0000000000..e16f5e936c --- /dev/null +++ b/backends/mediatek/quantizer/qconfig.py @@ -0,0 +1,171 @@ +# Copyright (c) 2024 MediaTek Inc. +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file in the root +# directory of this source tree for more details. 
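The annotator above only attaches quantization_annotation metadata to nodes; the QuantizationConfig it consumes comes from the qconfig module that follows. Below is a hedged sketch of driving it directly on a pre-autograd capture, the same kind of graph NeuropilotQuantizer hands it; the toy conv+relu module is illustrative, and the capture_pre_autograd_graph import path reflects the pinned PyTorch commit and may differ in newer releases.

import torch
from torch._export import capture_pre_autograd_graph  # assumed import path

from executorch.backends.mediatek.quantizer.annotator import annotate
from executorch.backends.mediatek.quantizer.qconfig import get_quant_config, Precision

class ConvRelu(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, 3)

    def forward(self, x):
        return torch.relu(self.conv(x))

graph_module = capture_pre_autograd_graph(ConvRelu(), (torch.randn(1, 3, 16, 16),))
annotate(graph_module.graph, get_quant_config(Precision.A8W8, is_per_channel=True))

# The float input placeholder plus the nodes matched by the fused conv+relu
# pattern now carry annotations that prepare_pt2e later turns into observers.
for node in graph_module.graph.nodes:
    if "quantization_annotation" in node.meta:
        print(node.op, node.target)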
+ +import copy + +from enum import IntEnum, unique + +import torch + +from torch.ao.quantization.fake_quantize import FakeQuantize +from torch.ao.quantization.observer import MinMaxObserver, PerChannelMinMaxObserver +from torch.ao.quantization.quantizer import QuantizationSpec + + +@unique +class Precision(IntEnum): + A16W16 = 0 + A16W8 = 1 + A16W4 = 2 + A8W8 = 3 + A8W4 = 4 + + +class QuantizationConfig: + + def __init__( + self, activation_spec: QuantizationSpec, weight_spec: QuantizationSpec + ): + self._activation_spec = activation_spec + self._weight_spec = weight_spec + + @property + def activation(self): + return copy.deepcopy(self._activation_spec) + + @property + def weight(self): + return copy.deepcopy(self._weight_spec) + + +def get_quant_config( + precision: Precision, + is_per_channel: bool = False, + is_qat: bool = False, +) -> QuantizationConfig: + + precision_mappings = { + Precision.A16W16: get_a16w16_quant_config, + Precision.A16W8: get_a16w8_quant_config, + Precision.A16W4: get_a16w4_quant_config, + Precision.A8W8: get_a8w8_quant_config, + Precision.A8W4: get_a8w4_quant_config, + } + if precision not in precision_mappings: + raise RuntimeError("Unrecognized precision setting.") + + qconfig_fn = precision_mappings[precision] + return qconfig_fn(is_per_channel, is_qat) + + +def _get_activation_qspec( + dtype, + is_symmetric, + is_qat, + observer_cls=MinMaxObserver, + quant_min=None, + quant_max=None, +): + if quant_max is None: + quant_max = torch.iinfo(dtype).max + if quant_min is None: + # quant_min = torch.iinfo(dtype).min + 1 if is_symmetric else torch.iinfo(dtype).min + quant_min = torch.iinfo(dtype).min + + qscheme = torch.per_tensor_symmetric if is_symmetric else torch.per_tensor_affine + if is_qat: + observer_or_fake_quant = FakeQuantize.with_args(observer=observer_cls, eps=1e-6) + else: + observer_or_fake_quant = observer_cls.with_args(eps=1e-6) + + return QuantizationSpec( + dtype=dtype, + quant_min=quant_min, + quant_max=quant_max, + qscheme=qscheme, + observer_or_fake_quant_ctr=observer_or_fake_quant, + ) + + +def _get_weight_qspec( + dtype, is_symmetric, is_per_channel, is_qat, quant_min=None, quant_max=None +): + if not is_per_channel: + return _get_activation_qspec( + dtype, is_symmetric, is_qat, observer_cls=MinMaxObserver + ) + + if quant_max is None: + quant_max = torch.iinfo(dtype).max + if quant_min is None: + # quant_min = torch.iinfo(dtype).min + 1 if is_symmetric else torch.iinfo(dtype).min + quant_min = torch.iinfo(dtype).min + + qscheme = torch.per_channel_symmetric if is_symmetric else torch.per_channel_affine + if is_qat: + observer_or_fake_quant = FakeQuantize.with_args( + observer=PerChannelMinMaxObserver, eps=1e-6 + ) + else: + observer_or_fake_quant = PerChannelMinMaxObserver.with_args(eps=1e-6) + + return QuantizationSpec( + dtype=dtype, + quant_min=quant_min, + quant_max=quant_max, + qscheme=qscheme, + ch_axis=0, + observer_or_fake_quant_ctr=observer_or_fake_quant, + ) + + +def get_a16w16_quant_config(is_per_channel, is_qat) -> QuantizationConfig: + act_quantization_spec = _get_activation_qspec(torch.int16, True, is_qat) + wgt_quantization_spec = _get_weight_qspec(torch.int16, True, is_per_channel, is_qat) + quantization_config = QuantizationConfig( + act_quantization_spec, wgt_quantization_spec + ) + return quantization_config + + +def get_a16w8_quant_config(is_per_channel, is_qat) -> QuantizationConfig: + act_quantization_spec = _get_activation_qspec(torch.int16, True, is_qat) + wgt_quantization_spec = _get_weight_qspec(torch.int8, True, 
is_per_channel, is_qat) + quantization_config = QuantizationConfig( + act_quantization_spec, wgt_quantization_spec + ) + return quantization_config + + +def get_a16w4_quant_config(is_per_channel, is_qat) -> QuantizationConfig: + act_quantization_spec = _get_activation_qspec(torch.int16, True, is_qat) + wgt_quantization_spec = _get_weight_qspec( + torch.int8, False, is_per_channel, is_qat, quant_min=-8, quant_max=7 + ) + quantization_config = QuantizationConfig( + act_quantization_spec, wgt_quantization_spec + ) + return quantization_config + + +def get_a8w8_quant_config(is_per_channel, is_qat) -> QuantizationConfig: + act_quantization_spec = _get_activation_qspec(torch.int8, False, is_qat) + wgt_quantization_spec = _get_weight_qspec(torch.int8, False, is_per_channel, is_qat) + quantization_config = QuantizationConfig( + act_quantization_spec, wgt_quantization_spec + ) + return quantization_config + + +def get_a8w4_quant_config(is_per_channel, is_qat) -> QuantizationConfig: + act_quantization_spec = _get_activation_qspec(torch.int8, False, is_qat) + wgt_quantization_spec = _get_weight_qspec( + torch.int8, False, is_per_channel, is_qat, quant_min=-8, quant_max=7 + ) + quantization_config = QuantizationConfig( + act_quantization_spec, wgt_quantization_spec + ) + return quantization_config diff --git a/backends/mediatek/quantizer/quantizer.py b/backends/mediatek/quantizer/quantizer.py new file mode 100644 index 0000000000..44e35ef54e --- /dev/null +++ b/backends/mediatek/quantizer/quantizer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2024 MediaTek Inc. +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file in the root +# directory of this source tree for more details. + +from torch.ao.quantization.quantizer import Quantizer +from torch.fx import GraphModule + +from ..passes.decompose_scaled_dot_product_attention import ( + DecomposeScaledDotProductAttention, +) +from .annotator import annotate +from .qconfig import get_quant_config, Precision + + +class NeuropilotQuantizer(Quantizer): + + def __init__(self): + super().__init__() + + # TODO: Provide setter functions for those attributes + self._precision = Precision.A8W8 + self._is_per_channel = True + self._is_qat = False + + def setup_precision(self, precision: Precision) -> None: + self._precision = precision + + def transform_for_annotation(self, model: GraphModule) -> GraphModule: + model = DecomposeScaledDotProductAttention()(model).graph_module + return model + + def annotate(self, model: GraphModule) -> GraphModule: + self._annotate(model) + return model + + def validate(self, model: GraphModule) -> None: + pass + + def _annotate(self, gm: GraphModule) -> None: + quant_config = get_quant_config( + self._precision, self._is_per_channel, self._is_qat + ) + annotate(gm.graph, quant_config) diff --git a/backends/mediatek/runtime/NeuronBackend.cpp b/backends/mediatek/runtime/NeuronBackend.cpp new file mode 100644 index 0000000000..e2ac31cec5 --- /dev/null +++ b/backends/mediatek/runtime/NeuronBackend.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2024 MediaTek Inc. + * + * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. 
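Before the runtime sources, here is a hedged sketch of the PT2E post-training quantization flow that NeuropilotQuantizer above plugs into, assuming the stock torch.ao.quantization.quantize_pt2e APIs and the capture_pre_autograd_graph entry point at the pinned commits; the model, shapes, and single-batch calibration are illustrative.

import torch
from torch._export import capture_pre_autograd_graph  # assumed import path
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

from executorch.backends.mediatek import NeuropilotQuantizer, Precision

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU()).eval()
example_inputs = (torch.randn(1, 16),)

quantizer = NeuropilotQuantizer()
quantizer.setup_precision(Precision.A8W8)  # A16W16/A16W8/A16W4/A8W4 also defined

# Capture, insert observers according to the annotator, calibrate, then fold the
# observed ranges into quantize/dequantize pairs.
captured = capture_pre_autograd_graph(model, example_inputs)
prepared = prepare_pt2e(captured, quantizer)
prepared(*example_inputs)  # run representative data through the observers
quantized = convert_pt2e(prepared)

The converted module can then be exported and lowered with NeuropilotPartitioner exactly as in the earlier lowering sketch.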
+ */ + +#include "NeuronBackend.h" +#include "NeuronBufferAllocator.h" +#include "NeuronLog.h" +#include "NeuronPayloadHeader.h" +#include "api/NeuronAdapter.h" + +#include "executorch/runtime/core/error.h" + +#include +#include +#include +#include + +namespace torch { +namespace executor { + +const char kHighAddrKey[] = "HighAddr"; +const char kImportForeverKey[] = "ImportForever"; + +Result NeuronBackend::init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const { + NeuronDelegateSetting setting; + for (auto& compile_spec : compile_specs) { + if (std::strcmp(compile_spec.key, kHighAddrKey) == 0) { + setting.mHighAddr = *static_cast(compile_spec.value.buffer); + LogInfo("NeuronBackend", "IsHighAddr Enable : %d", setting.mHighAddr); + } else if (std::strcmp(compile_spec.key, kImportForeverKey) == 0) { + setting.mImportForever = *static_cast(compile_spec.value.buffer); + LogInfo( + "NeuronBackend", + "IsImportForever Enable : %d", + setting.mImportForever); + } else { + LogWarn("NeuronBackend", "unknown compile spec: %s", compile_spec.key); + } + } + auto Payload = NeuronPayload(processed->data(), processed->size()); + LogInfo( + "NeuronBackend", + "version %u, input %u, output %u, length %u, payload size: %zu", + Payload.Header.Version, + Payload.Header.InputCount, + Payload.Header.OutputCount, + Payload.Header.DataLen, + processed->size()); + + MemoryAllocator* runtime_allocator = context.get_runtime_allocator(); + NeuronExecuTorchDelegate* delegate = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR( + runtime_allocator, NeuronExecuTorchDelegate); + new (delegate) NeuronExecuTorchDelegate(); + + if (delegate == nullptr) { + return nullptr; + } + auto res = delegate->LoadCompiledNetwork(Payload, setting); + return res == NEURON_NO_ERROR ? delegate : nullptr; +} + +Error NeuronBackend::execute( + ET_UNUSED BackendExecutionContext& context, + DelegateHandle* handle, + EValue** args) const { + NeuronExecuTorchDelegate* delegate = + reinterpret_cast(handle); + return delegate->execute(context, args); +} + +void NeuronBackend::destroy(DelegateHandle* handle) const { + if (handle != nullptr) { + NeuronExecuTorchDelegate* delegate = + reinterpret_cast(handle); + delegate->~NeuronExecuTorchDelegate(); + } +} + +bool NeuronBackend::is_available() const { + return true; +} + +Error NeuronExecuTorchDelegate::execute( + BackendExecutionContext& context, + EValue** args) const { + if (HintNeuronBackend(args) != NEURON_NO_ERROR) { + return Error::InvalidState; + }; + + auto allocator = + dynamic_cast(context.get_temp_allocator()); + size_t inputCount = mInputSizes.size(), outputCount = mOutputSizes.size(); + + for (int i = 0; i < inputCount; i++) { + auto data_ptr = args[i]->toTensor().data_ptr(); + auto data_size = args[i]->toTensor().nbytes(); + if (IsCached(i, data_ptr)) { + continue; + }; + auto unit = allocator != nullptr ? allocator->Find(data_ptr) : nullptr; + if (unit) { + UpdateCache(i, data_ptr); + size_t offset = (char*)data_ptr - (char*)unit->GetAddress(); + mExecutor.SetInputOutputFromMemory( + i, unit->GetNeuronMemory(), offset, data_size); + } else { + mExecutor.SetInputOutput(i, data_ptr, data_size); + } + } + + for (int o = inputCount; o < inputCount + outputCount; o++) { + auto data_ptr = args[o]->toTensor().data_ptr(); + auto data_size = args[o]->toTensor().nbytes(); + auto output_index = o - inputCount; + if (IsCached(output_index, data_ptr)) { + continue; + }; + auto unit = allocator != nullptr ? 
allocator->Find(data_ptr) : nullptr; + if (unit) { + UpdateCache(output_index, data_ptr); + size_t offset = (char*)data_ptr - (char*)unit->GetAddress(); + mExecutor.SetInputOutputFromMemory( + output_index, unit->GetNeuronMemory(), offset, data_size); + } else { + mExecutor.SetInputOutput( + output_index, data_ptr, data_size); + } + } + + return mExecutor.Compute() == NEURON_NO_ERROR ? Error::Ok + : Error::InvalidState; +}; + +int NeuronExecuTorchDelegate::HintNeuronBackend(EValue** args) const { + auto HintImportForever = [this](EValue** args) -> int { + auto& allocator = GET_NEURON_ALLOCATOR; + size_t inputCount = mInputSizes.size(), outputCount = mOutputSizes.size(); + for (int i = 0; i < inputCount; i++) { + auto data_ptr = args[i]->toTensor().data_ptr(); + if (mHasImported.count(data_ptr)) { + continue; + } + auto unit = allocator.Find(data_ptr); + if (unit) { + mExecutor.SetInputOutputFromMemory( + i, unit->GetNeuronMemory(), 0, unit->GetSize()); + mHasImported.insert(data_ptr); + } + } + for (int o = inputCount; o < inputCount + outputCount; o++) { + auto data_ptr = args[o]->toTensor().data_ptr(); + if (mHasImported.count(data_ptr)) { + continue; + } + auto output_index = o - inputCount; + auto unit = allocator.Find(data_ptr); + if (unit) { + mExecutor.SetInputOutputFromMemory( + output_index, unit->GetNeuronMemory(), 0, unit->GetSize()); + mHasImported.insert(data_ptr); + } + } + return NEURON_NO_ERROR; + }; + if (mSettings.mImportForever) { + CHECK_NO_ERROR(HintImportForever(args)); + } + return NEURON_NO_ERROR; +} + +} // namespace executor +} // namespace torch + +namespace { +auto cls = torch::executor::NeuronBackend(); +torch::executor::Backend backend{"NeuropilotBackend", &cls}; +static auto success_with_compiler = register_backend(backend); +} // namespace diff --git a/backends/mediatek/runtime/NeuronExecutor.cpp b/backends/mediatek/runtime/NeuronExecutor.cpp new file mode 100644 index 0000000000..39d1bf22d6 --- /dev/null +++ b/backends/mediatek/runtime/NeuronExecutor.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2024 MediaTek Inc. + * + * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. + */ + +#include "NeuronExecutor.h" +#include "NeuronLog.h" +#include "api/NeuronAdapter.h" + +#include +#include + +#define RESTORE_DLA_EXTENSION_OPERAND_TYPE 0x0100 +#define RESTORE_DLA_EXTENSION_OPERATION_TYPE 0x0000 +#define RESTORE_DLA_EXTENSION_NAME "com.mediatek.compiled_network" + +namespace torch { +namespace executor { +namespace neuron { + +NeuronExecutor::NeuronExecutor(){}; + +int NeuronExecutor::LoadFromCompiledNetwork( + const void* buffer, + size_t size, + int inputCount, + int outputCount, + std::string& runtimeOption) { + NeuronModel* model = nullptr; + NeuronCompilation* compilation = nullptr; + NeuronExecution* execution = nullptr; + + std::vector mInputOperand; + std::vector mOutputOperand; + // ---------------------------Model------------------------------------ + int err = NEURON_NO_ERROR; + err |= NeuronModel_create(&model); + CHECK_NO_ERROR(err); + + mModel = std::unique_ptr(model); + + std::vector input_op_number; + // fake input, the real outputs are loaded by compiled network. 
+ NeuronOperandType fakeInputOperandType{ + .type = NEURON_TENSOR_FLOAT32, + .dimensionCount = 0, + .scale = 0.0f, + .zeroPoint = 0, + }; + + for (int i = 0; i < inputCount; i++) { + mInputOperand.push_back(fakeInputOperandType); + } + for (int i = 0; i < mInputOperand.size(); i++) { + err |= NeuronModel_addOperand(model, &mInputOperand[i]); + input_op_number.emplace_back(i); + } + + int32_t operandType = 0; + const uint16_t network_operand_restore_data = + RESTORE_DLA_EXTENSION_OPERAND_TYPE; + const char* extensionRestoreCompiledNetwork = RESTORE_DLA_EXTENSION_NAME; + err |= NeuronModel_getExtensionOperandType( + model, + extensionRestoreCompiledNetwork, + network_operand_restore_data, + &operandType); + CHECK_NO_ERROR(err); + + NeuronOperandType extenOperandType{ + .type = operandType, + .dimensionCount = 0, + .scale = 0.0f, + .zeroPoint = 0, + }; + + err |= NeuronModel_addOperand(model, &extenOperandType); + CHECK_NO_ERROR(err); + input_op_number.emplace_back(input_op_number.size()); + + // fake output, the real outputs are loaded by compiled network. + NeuronOperandType fakeOutputOperandType{ + .type = NEURON_TENSOR_FLOAT32, + .dimensionCount = 0, + .scale = 0.0f, + .zeroPoint = 0, + }; + + for (int i = 0; i < outputCount; i++) { + mOutputOperand.push_back(fakeOutputOperandType); + } + + std::vector output_op_number; + for (int i = 0; i < mOutputOperand.size(); i++) { + err |= NeuronModel_addOperand(model, &mOutputOperand[i]); + output_op_number.emplace_back(i + input_op_number.size()); + } + + CHECK_NO_ERROR(err); + + err |= + NeuronModel_setOperandValue(model, input_op_number.back(), buffer, size); + + int32_t operationType = 0; + const uint16_t network_operation_type_restore = + RESTORE_DLA_EXTENSION_OPERATION_TYPE; + err |= NeuronModel_getExtensionOperationType( + model, + extensionRestoreCompiledNetwork, + network_operation_type_restore, + &operationType); + + CHECK_NO_ERROR(err); + + // Add extension operation + err |= NeuronModel_addOperation( + model, + (NeuronOperationType)operationType, + input_op_number.size(), + input_op_number.data(), + output_op_number.size(), + output_op_number.data()); + + CHECK_NO_ERROR(err); + + // Identify input and output + err |= NeuronModel_identifyInputsAndOutputs( + model, + input_op_number.size() - 1, + input_op_number.data(), + output_op_number.size(), + output_op_number.data()); + + CHECK_NO_ERROR(err); + + err |= NeuronModel_finish(model); + CHECK_NO_ERROR(err); + // ---------------------------Compilation------------------------------------ + // err = NeuronCompilation_e(model, &compilation) != NEURON_NO_ERROR; + err = NeuronCompilation_createWithOptions( + model, &compilation, runtimeOption.c_str()); + CHECK_NO_ERROR(err); + + mCompilation = std::unique_ptr(compilation); + + err |= + NeuronCompilation_setPreference(compilation, NEURON_PREFER_TURBO_BOOST); + err |= NeuronCompilation_setPriority(compilation, NEURON_PRIORITY_HIGH); + CHECK_NO_ERROR(err); + + err = NeuronCompilation_finish(compilation); + CHECK_NO_ERROR(err); + + // ---------------------------Execution------------------------------------ + // Create Neuron executor instance. 
+ err = NeuronExecution_create(compilation, &execution); + CHECK_NO_ERROR(err); + mExecution = std::unique_ptr(execution); + + return NEURON_NO_ERROR; +} + +} // namespace neuron +} // namespace executor +} // namespace torch diff --git a/backends/mediatek/runtime/include/NeuronBackend.h b/backends/mediatek/runtime/include/NeuronBackend.h new file mode 100644 index 0000000000..7a22956de6 --- /dev/null +++ b/backends/mediatek/runtime/include/NeuronBackend.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2024 MediaTek Inc. + * + * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. + */ + +#pragma once + +#include "NeuronBufferAllocator.h" +#include "NeuronExecutor.h" +#include "NeuronLog.h" +#include "NeuronPayloadHeader.h" +#include "api/APUWareUtilsLib.h" +#include "api/NeuronAdapter.h" + +#include +#include +#include + +#include +#include +#include + +namespace torch { +namespace executor { + +class NeuronBackend final : public ::executorch::runtime::BackendInterface { + public: + Result init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const override; + + Error execute( + ET_UNUSED BackendExecutionContext& context, + DelegateHandle* handle, + EValue** args) const override; + + void destroy(DelegateHandle* handle) const override; + + bool is_available() const override; +}; + +extern const char kHighAddrKey[]; +extern const char kImportForeverKey[]; + +struct NeuronDelegateSetting { + bool mHighAddr = false; + + bool mImportForever = false; + + std::string ToRuntimeOption() { + if (mHighAddr && mImportForever) { + return "--apusys-config \"{ \\\"high_addr\\\": true, \\\"import_forever\\\": true }\""; + } else if (mHighAddr) { + return "--apusys-config \"{ \\\"high_addr\\\": true }\""; + } else if (mImportForever) { + return "--apusys-config \"{ \\\"import_forever\\\": true }\""; + } else { + return ""; + } + } +}; + +class NeuronExecuTorchDelegate { + public: + class MemoryCache { + public: + template + bool IsCached(int i, void* ptr) { + const auto& cache = isInput ? mInputCache : mOutputCache; + auto it = cache.find(i); + return (it != cache.end()) && (ptr == it->second); + } + + template + void UpdateCache(int i, void* ptr) { + (isInput ? 
mInputCache[i] : mOutputCache[i]) = ptr; + return; + } + + private: + std::unordered_map mInputCache; + + std::unordered_map mOutputCache; + }; + + NeuronExecuTorchDelegate() {} + + ~NeuronExecuTorchDelegate() { + mPLock->Stop(); + } + + int LoadCompiledNetwork( + NeuronPayload payload, + NeuronDelegateSetting options) { + mSettings = options; + auto runtimeOption = mSettings.ToRuntimeOption(); + auto res = mExecutor.LoadFromCompiledNetwork( + payload.CompiledNetwork, + payload.Header.DataLen, + payload.Header.InputCount, + payload.Header.OutputCount, + runtimeOption); + CHECK_NO_ERROR(res); + CHECK_TRUE(mExecutor.IsValid()); + SummaryIoCounts(); + mPLock = std::unique_ptr(new ScopePerformancer); + return NEURON_NO_ERROR; + } + + Error execute(ET_UNUSED BackendExecutionContext& context, EValue** args) + const; + + private: + template + bool IsCached(int index, void* ptr) const { + return mCache.IsCached(index, ptr); + } + + template + void UpdateCache(int index, void* ptr) const { + mCache.UpdateCache(index, ptr); + } + + int SummaryIoCounts() { + for (int i = 0;; i++) { + size_t size = mExecutor.GetInputOutputPaddedSize(i); + if (size == 0) { + break; + } + LogInfo("NeuronBackend", "Model input:%d size: %lu", i, size); + mInputSizes.push_back(size); + } + for (int o = 0;; o++) { + size_t size = mExecutor.GetInputOutputPaddedSize(o); + if (size == 0) { + break; + } + LogInfo("NeuronBackend", "Model output:%d size: %lu", o, size); + mOutputSizes.push_back(size); + } + return NEURON_NO_ERROR; + } + + int HintNeuronBackend(EValue** args) const; + + private: + std::vector mInputSizes; + + std::vector mOutputSizes; + + mutable MemoryCache mCache; + + std::unique_ptr mPLock; + + neuron::NeuronExecutor mExecutor; + + NeuronDelegateSetting mSettings; + + mutable std::unordered_set mHasImported; + + private: + NeuronExecuTorchDelegate(const NeuronExecuTorchDelegate&); + + NeuronExecuTorchDelegate operator=(const NeuronExecuTorchDelegate&); +}; + +} // namespace executor +} // namespace torch diff --git a/backends/mediatek/runtime/include/NeuronBufferAllocator.h b/backends/mediatek/runtime/include/NeuronBufferAllocator.h new file mode 100644 index 0000000000..58439dea41 --- /dev/null +++ b/backends/mediatek/runtime/include/NeuronBufferAllocator.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2024 MediaTek Inc. + * + * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. + */ + +#pragma once + +#include "NeuronExecutor.h" +#include "NeuronLog.h" +#include "api/NeuronAdapter.h" + +#include + +#include + +#include +#include +#include +#include + +#define GET_NEURON_ALLOCATOR \ + ::torch::executor::neuron::BufferAllocator::GetInstance() + +namespace torch { +namespace executor { +namespace neuron { + +struct BufferDeleter { + void operator()(AHardwareBuffer* buffer) { + if (buffer != nullptr) { + AHardwareBuffer_unlock(buffer, nullptr); + AHardwareBuffer_release(buffer); + } + } +}; + +class MemoryUnit { + public: + static std::unique_ptr Create(size_t size) { + auto obj = std::unique_ptr(new (std::nothrow) MemoryUnit(size)); + return (obj && (obj->Allocate() == NEURON_NO_ERROR)) ? 
std::move(obj) + : nullptr; + } + + ~MemoryUnit() { + mNeuronMemory.reset(); + mAhwb.reset(); + } + + size_t GetSize() const { + return mSize; + } + + void* GetAddress() const { + return mAddress; + } + + NeuronMemory* GetNeuronMemory() const { + return mNeuronMemory.get(); + } + + private: + explicit MemoryUnit(size_t size) : mSize(size) {} + + int Allocate() { + AHardwareBuffer_Desc iDesc{ + .width = static_cast(mSize), + .height = 1, + .layers = 1, + .format = AHARDWAREBUFFER_FORMAT_BLOB, + .usage = mAhwbType, + .stride = static_cast(mSize), + }; + AHardwareBuffer* Abuffer = nullptr; + AHardwareBuffer_allocate(&iDesc, &Abuffer); + CHECK_VALID_PTR(Abuffer); + mAhwb = std::unique_ptr(Abuffer); + + NeuronMemory* memory = nullptr; + NeuronMemory_createFromAHardwareBuffer(Abuffer, &memory); + CHECK_VALID_PTR(memory); + mNeuronMemory = std::unique_ptr(memory); + + AHardwareBuffer_lock(Abuffer, mAhwbType, -1, nullptr, &mAddress); + CHECK_VALID_PTR(mAddress); + return NEURON_NO_ERROR; + } + + private: + std::unique_ptr mNeuronMemory; + + std::unique_ptr mAhwb; + + uint64_t mAhwbType = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | + AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN; + + void* mAddress = nullptr; + + size_t mSize = 0; +}; + +class BufferAllocator : public MemoryAllocator { + public: + static BufferAllocator& GetInstance(); + + void* Allocate(size_t size); + + void* allocate(size_t size, size_t alignment = kDefaultAlignment) override { + return Allocate(size); + } + + bool RemoveBuffer(void* address); + + const MemoryUnit* Find(void* address); + + void Clear(); + + private: + BufferAllocator() : MemoryAllocator(0, nullptr) {} + + BufferAllocator(const BufferAllocator&) = delete; + + BufferAllocator& operator=(const BufferAllocator&) = delete; + + ~BufferAllocator() override { + Clear(); + } + + private: + std::map> mPool; + + std::mutex mMutex; +}; + +} // namespace neuron +} // namespace executor +} // namespace torch diff --git a/backends/mediatek/runtime/include/NeuronExecutor.h b/backends/mediatek/runtime/include/NeuronExecutor.h new file mode 100644 index 0000000000..d0d38757aa --- /dev/null +++ b/backends/mediatek/runtime/include/NeuronExecutor.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2024 MediaTek Inc. + * + * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. + */ + +#pragma once + +#include "NeuronLog.h" +#include "api/NeuronAdapter.h" +#include "api/NeuronAdapterShim.h" + +#include +#include +#include +#include +#include + +namespace torch { +namespace executor { +namespace neuron { + +struct NeuronDeleter { + void operator()(NeuronModel* model) { + if (model != nullptr) { + NeuronModel_free(model); + } + } + void operator()(NeuronCompilation* compilation) { + if (compilation != nullptr) { + NeuronCompilation_free(compilation); + } + } + void operator()(NeuronExecution* execution) { + if (execution != nullptr) { + NeuronExecution_free(execution); + } + } + void operator()(NeuronMemory* memory) { + if (memory != nullptr) { + NeuronMemory_free(memory); + } + } +}; + +class NeuronExecutor { + public: + explicit NeuronExecutor(); + + int LoadFromCompiledNetwork( + const void* buffer, + size_t size, + int inputCount, + int outputCount, + std::string& runtimeOption); + + template + int SetInputOutput(uint32_t index, void* buffer, size_t length) const { + CHECK_VALID_PTR(buffer); + CHECK_VALID_PTR(mExecution); + return isInput ? 
NeuronExecution_setInput( + mExecution.get(), index, nullptr, buffer, length) + : NeuronExecution_setOutput( + mExecution.get(), index, nullptr, buffer, length); + } + + template + int SetInputOutputFromMemory( + uint32_t index, + const NeuronMemory* memory, + size_t offset, + size_t length) const { + CHECK_VALID_PTR(memory); + CHECK_VALID_PTR(mExecution); + return isInput + ? NeuronExecution_setInputFromMemory( + mExecution.get(), index, nullptr, memory, offset, length) + : NeuronExecution_setOutputFromMemory( + mExecution.get(), index, nullptr, memory, offset, length); + } + + template + size_t GetInputOutputPaddedSize(int32_t index) const { + CHECK_VALID_PTR(mCompilation); + size_t size = 0; + auto res = isInput + ? NeuronCompilation_getInputPaddedSize(mCompilation.get(), index, &size) + : NeuronCompilation_getOutputPaddedSize( + mCompilation.get(), index, &size); + return res == NEURON_NO_ERROR ? size : 0; + } + + int Compute() const { + CHECK_VALID_PTR(mExecution); + return NeuronExecution_compute(mExecution.get()); + } + + bool IsValid() const { + return mExecution != nullptr; + } + + private: + std::unique_ptr mModel; + + std::unique_ptr mCompilation; + + std::unique_ptr mExecution; + + std::vector mInputSizes; + + std::vector mOutputSizes; + + private: + NeuronExecutor(const NeuronExecutor&); + + NeuronExecutor operator=(const NeuronExecutor&); +}; + +} // namespace neuron +} // namespace executor +} // namespace torch diff --git a/backends/mediatek/runtime/include/NeuronLog.h b/backends/mediatek/runtime/include/NeuronLog.h new file mode 100644 index 0000000000..ccf8b24870 --- /dev/null +++ b/backends/mediatek/runtime/include/NeuronLog.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024 MediaTek Inc. + * + * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. + */ + +#pragma once + +#include + +#include +#include + +#include +#include + +namespace torch { +namespace executor { +namespace neuron { + +#define AndroidLog(priority, tag, format, ...) \ + __android_log_print(priority, tag, format, ##__VA_ARGS__) + +#define LogError(tag, format, ...) \ + AndroidLog(ANDROID_LOG_ERROR, tag, format, ##__VA_ARGS__) + +#define LogWarn(tag, format, ...) \ + AndroidLog(ANDROID_LOG_WARN, tag, format, ##__VA_ARGS__) + +#define LogInfo(tag, format, ...) 
\ + AndroidLog(ANDROID_LOG_INFO, tag, format, ##__VA_ARGS__) + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + +#define CHECK_VALID_PTR(ptr) \ + do { \ + if (__builtin_expect(ptr == nullptr, 0)) { \ + LogError( \ + "NeuronBackend", \ + "Check fail: " #ptr \ + " == nullptr at line " TOSTRING(__LINE__) " at file " __FILE__); \ + return NEURON_UNEXPECTED_NULL; \ + } \ + } while (0) + +#define CHECK_NO_ERROR(value) \ + do { \ + if (__builtin_expect(value != NEURON_NO_ERROR, 0)) { \ + LogError( \ + "NeuronBackend", \ + "Check fail: " #value " != NEURON_NO_ERROR at line " TOSTRING( \ + __LINE__) " at file " __FILE__); \ + return value; \ + } \ + } while (0) + +#define CHECK_TRUE(value) \ + do { \ + if (__builtin_expect(value != true, 0)) { \ + LogError( \ + "NeuronBackend", \ + "Check fail: " #value \ + " != true at line " TOSTRING(__LINE__) " at file " __FILE__); \ + return NEURON_BAD_STATE; \ + } \ + } while (0) + +inline int ReadSystemProperty(std::string& property) { + char property_value[PROP_VALUE_MAX]; + if (__system_property_get(property.c_str(), property_value)) { + LogInfo("Get System Property %s : %s", property.c_str(), property_value); + try { + int value = std::stoi(property_value); + return value; + } catch (...) { + return -1; + } + } + return -1; +} + +} // namespace neuron +} // namespace executor +} // namespace torch diff --git a/backends/mediatek/runtime/include/NeuronPayloadHeader.h b/backends/mediatek/runtime/include/NeuronPayloadHeader.h new file mode 100644 index 0000000000..786b4e957f --- /dev/null +++ b/backends/mediatek/runtime/include/NeuronPayloadHeader.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2024 MediaTek Inc. + * + * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. + */ + +#pragma once + +#include + +struct __attribute__((packed)) NeuronPayloadHeader { + unsigned char Version; + + uint32_t InputCount; + + uint32_t OutputCount; + + uint32_t DataLen; +}; + +struct NeuronPayload { + NeuronPayload(const void* payload, size_t size) + : Header(*(struct NeuronPayloadHeader*)payload), + CompiledNetwork((char*)payload + sizeof(struct NeuronPayloadHeader)) {} + + NeuronPayloadHeader Header; + + void* CompiledNetwork = nullptr; +}; diff --git a/backends/mediatek/runtime/include/api/APUWareUtilsLib.h b/backends/mediatek/runtime/include/api/APUWareUtilsLib.h new file mode 100644 index 0000000000..8ade536e25 --- /dev/null +++ b/backends/mediatek/runtime/include/api/APUWareUtilsLib.h @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2023 MediaTek Inc., this file is modified on 02/26/2021 + * by MediaTek Inc. based on MIT License . + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the ""Software""), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED ""AS IS"", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +using namespace std; + +typedef enum { + LOW_POWER_MODE = 0, // For model execution preference + FAST_SINGLE_ANSWER_MODE, // For model execution preference + SUSTAINED_SPEED_MODE, // For model execution preference + FAST_COMPILE_MODE, // For model compile preference + PERFORMANCE_MODE_MAX, +} PERFORMANCE_MODE_E; + +//------------------------------------- ------------------------------------- +#define APUWARE_LOG_D(format, ...) \ + __android_log_print( \ + ANDROID_LOG_DEBUG, "APUWARELIB", format "\n", ##__VA_ARGS__); + +#define APUWARE_LOG_E(format, ...) \ + __android_log_print( \ + ANDROID_LOG_ERROR, "APUWARELIB", format "\n", ##__VA_ARGS__); + +inline void* voidFunction() { + return nullptr; +} + +// ApuWareUtils library construct +struct ApuWareUtilsLib { + static struct ApuWareUtilsLib& GetInstance() { + static struct ApuWareUtilsLib instance; + return instance; + } + + ApuWareUtilsLib() { + load(); + } + + using AcquirePerformanceLockPtr = + std::add_pointer::type; + using AcquirePerfParamsLockPtr = + std::add_pointer::type; + using ReleasePerformanceLockPtr = std::add_pointer::type; + + // Open a given library and load symbols + bool load() { + void* handle = nullptr; + const std::string libraries[] = { + "libapuwareutils_v2.mtk.so", "libapuwareutils.mtk.so"}; + for (const auto& lib : libraries) { + handle = dlopen(lib.c_str(), RTLD_LAZY | RTLD_LOCAL); + if (handle) { + APUWARE_LOG_D("dlopen %s", lib.c_str()); + acquirePerformanceLock = + reinterpret_cast( + dlsym(handle, "acquirePerformanceLockInternal")); + acquirePerfParamsLock = + reinterpret_cast( + dlsym(handle, "acquirePerfParamsLockInternal")); + releasePerformanceLock = + reinterpret_cast( + dlsym(handle, "releasePerformanceLockInternal")); + return mEnable = acquirePerformanceLock && releasePerformanceLock && + acquirePerfParamsLock; + } else { + APUWARE_LOG_E("unable to open library %s", lib.c_str()); + } + } + return false; + } + + bool mEnable = false; + + AcquirePerformanceLockPtr acquirePerformanceLock = + reinterpret_cast(voidFunction); + AcquirePerfParamsLockPtr acquirePerfParamsLock = + reinterpret_cast(voidFunction); + ReleasePerformanceLockPtr releasePerformanceLock = + reinterpret_cast(voidFunction); +}; + +class ScopePerformancer { + public: + ScopePerformancer(uint32_t ms = 2000) + : mLib(ApuWareUtilsLib::GetInstance()), mMs(ms) { + mLock = mLib.mEnable; + if (mLock) { + APUWARE_LOG_D("Powerhal Up"); + mRunning.store(true); + mThread = std::thread(&ScopePerformancer::acquireLockRepeatedly, this); + } + }; + + void Stop() { + if (mRunning.load()) { + mRunning.store(false); + mCond.notify_one(); + } + } + + ~ScopePerformancer() { + Stop(); + if (mThread.joinable()) { + mThread.join(); + } + if (mHalHandle != 0 && mLock) { + APUWARE_LOG_D("Powerhal Free"); + mLib.releasePerformanceLock(mHalHandle); + mHalHandle = 0; + } + } + + private: + void acquireLockRepeatedly() { + std::unique_lock lock(mMutex); + while (mRunning.load()) { + 
mHalHandle = + mLib.acquirePerformanceLock(mHalHandle, FAST_SINGLE_ANSWER_MODE, mMs); + mCond.wait_for(lock, std::chrono::milliseconds(1000), [this] { + return !mRunning.load(); + }); + } + } + + struct ApuWareUtilsLib mLib; + + bool mLock = false; + + int mHalHandle = 0; + + uint32_t mMs; + + std::atomic mRunning{false}; + + std::thread mThread; + + std::mutex mMutex; + + std::condition_variable mCond; +}; \ No newline at end of file diff --git a/backends/mediatek/runtime/include/api/NeuronAdapter.h b/backends/mediatek/runtime/include/api/NeuronAdapter.h new file mode 100644 index 0000000000..3a4af8299b --- /dev/null +++ b/backends/mediatek/runtime/include/api/NeuronAdapter.h @@ -0,0 +1,2385 @@ +/* Copyright Statement: + * + * This software/firmware and related documentation ("MediaTek Software") are + * protected under relevant copyright laws. The information contained herein + * is confidential and proprietary to MediaTek Inc. and/or its licensors. + * Without the prior written permission of MediaTek inc. and/or its licensors, + * any reproduction, modification, use or disclosure of MediaTek Software, + * and information contained herein, in whole or in part, shall be strictly + * prohibited. + */ +/* MediaTek Inc. (C) 2020. All rights reserved. + * + * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES + * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") + * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON + * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. + * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE + * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR + * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH + * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY + * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY + * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK + * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO + * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN + * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND + * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER + * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT + * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER + * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. + * + * The following software/firmware and/or related documentation ("MediaTek + * Software") have been modified by MediaTek Inc. All revisions are subject to + * any receiver's applicable license agreements with MediaTek Inc. + */ + +/** + * @file NeuronAdapter.h + */ + +#pragma once + +#ifdef __ANDROID__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wnullability-extension" +#include +#pragma clang diagnostic pop +#endif + +#include +#include +#include + +__BEGIN_DECLS + +/** + * NeuronModel is an opaque type that contains a description of the mathematical + * operations that constitute the model. + */ +typedef struct NeuronModel NeuronModel; + +/** + * NeuronCompilation is an opaque type that can be used to compile a machine + * learning model. 
+ */ +typedef struct NeuronCompilation NeuronCompilation; + +/** + * NeuronExecution is an opaque type that can be used to apply a machine + * learning model to a set of inputs. + */ +typedef struct NeuronExecution NeuronExecution; + +/** + * NeuronDevice is an opaque type that represents a device. + * + * This type is used to query basic properties and supported operations of the + * corresponding device, and control which device(s) a model is to be run on. + * + * Available since 4.1.0 + */ +typedef struct NeuronDevice NeuronDevice; + +/** + * This type is used to represent shared memory, memory mapped files, and + * similar memories. + * + * It is the application's responsibility to ensure that there are no uses of + * the memory after calling NeuronMemory_free. This includes the execution which + * references this memory because of a call to + * NeuronExecution_setInputFromMemory or NeuronExecution_setOutputFromMemory. + * + * Available since 4.1.0 + */ +typedef struct NeuronMemory NeuronMemory; + +/** + * NeuronEvent is an opaque type that represents an event + * that will be signaled once an execution completes. + * + * Available since 5.0.0 + */ +typedef struct NeuronEvent NeuronEvent; + +/** + * Result codes. + */ +typedef enum { + NEURON_NO_ERROR = 0, + NEURON_OUT_OF_MEMORY = 1, + NEURON_INCOMPLETE = 2, + NEURON_UNEXPECTED_NULL = 3, + NEURON_BAD_DATA = 4, + NEURON_OP_FAILED = 5, + NEURON_UNMAPPABLE = 6, + NEURON_BAD_STATE = 7, + NEURON_BAD_VERSION = 8, + + // Available since 5.0.0 + NEURON_OUTPUT_INSUFFICIENT_SIZE = 9, + NEURON_UNAVAILABLE_DEVICE = 10, + NEURON_MISSED_DEADLINE_TRANSIENT = 11, + NEURON_MISSED_DEADLINE_PERSISTENT = 12, + NEURON_RESOURCE_EXHAUSTED_TRANSIENT = 13, + NEURON_RESOURCE_EXHAUSTED_PERSISTENT = 14, + NEURON_DEAD_OBJECT = 15, +} NeuronAdapterResultCode; + +/** + * Operand values with size in bytes that are smaller or equal to this will be + * immediately copied into the model. + */ +enum { NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES = 128 }; + +/** + * Size of the cache token, in bytes, required from the application. + */ +enum { NEURON_BYTE_SIZE_OF_CACHE_TOKEN = 32 }; + +/** + * Operand types. + * The type of operands that can be added to a model. + * + * Some notes on quantized tensors + * + *

+ * NEURON_TENSOR_QUANT8_ASYMM
+ * Attached to this tensor are two numbers that can be used to convert the 8
+ * bit integer to the real value and vice versa. These two numbers are:
+ * - scale: a 32 bit floating point value greater than zero.
+ * - zeroPoint: a 32 bit integer, in range [0, 255].
+ * The formula is: real_value = (integer_value - zero_value) * scale.
+ *
+ * NEURON_TENSOR_QUANT16_SYMM
+ * Attached to this tensor is a number representing real value scale that is
+ * used to convert the 16 bit number to a real value in the following way:
+ * realValue = integerValue * scale. scale is a 32 bit floating point with value
+ * greater than zero.
+ *
+ * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL
+ * This tensor is associated with additional fields that can be used to
+ * convert the 8 bit signed integer to the real value and vice versa. These
+ * fields are:
+ * - channelDim: a 32 bit unsigned integer indicating channel dimension.
+ * - scales: an array of positive 32 bit floating point values.
+ * The size of the scales array must be equal to dimensions[channelDim].
+ * NeuronModel_setOperandSymmPerChannelQuantParams must be used to set the
+ * parameters for an Operand of this type. The channel dimension of this tensor
+ * must not be unknown (dimensions[channelDim] != 0). The formula is:
+ * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] where C is an
+ * index in the Channel dimension.
+ *
+ * NEURON_TENSOR_QUANT16_ASYMM
+ * Attached to this tensor are two numbers that can be used to convert the 16
+ * bit integer to the real value and vice versa. These two numbers are:
+ * - scale: a 32 bit floating point value greater than zero.
+ * - zeroPoint: a 32 bit integer, in range [0, 65535].
+ * The formula is: real_value = (integer_value - zeroPoint) * scale.
+ *
+ * NEURON_TENSOR_QUANT8_SYMM
+ * Attached to this tensor is a number representing real value scale that is
+ * used to convert the 8 bit number to a real value in the following way:
+ * realValue = integerValue * scale. scale is a 32 bit floating point with value
+ * greater than zero.
+ *
+ * NEURON_TENSOR_QUANT8_ASYMM_SIGNED
+ * Attached to this tensor are two numbers that can be used to convert the 8
+ * bit integer to the real value and vice versa. These two numbers are:
+ * - scale: a 32 bit floating point value greater than zero.
+ * - zeroPoint: a 32 bit integer, in range [-128, 127].
+ *
The formula is: real_value = (integer_value - zeroPoint) * scale. + */ +enum { + /** A 32 bit floating point scalar value. */ + NEURON_FLOAT32 = 0, + /** A signed 32 bit integer scalar value. */ + NEURON_INT32 = 1, + /** An unsigned 32 bit integer scalar value. */ + NEURON_UINT32 = 2, + /** A tensor of 32 bit floating point values. */ + NEURON_TENSOR_FLOAT32 = 3, + /** A tensor of 32 bit integer values. */ + NEURON_TENSOR_INT32 = 4, + /** A tensor of 8 bit integers that represent real numbers. */ + NEURON_TENSOR_QUANT8_ASYMM = 5, + /** An 8 bit boolean scalar value. */ + NEURON_BOOL = 6, + /** A tensor of 16 bit signed integers that represent real numbers. */ + NEURON_TENSOR_QUANT16_SYMM = 7, + /** A tensor of IEEE 754 16 bit floating point values. */ + NEURON_TENSOR_FLOAT16 = 8, + /** A tensor of 8 bit boolean values. */ + NEURON_TENSOR_BOOL8 = 9, + /** An IEEE 754 16 bit floating point scalar value. */ + NEURON_FLOAT16 = 10, + /** A tensor of 8 bit signed integers that represent real numbers. */ + NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11, + /** A tensor of 16 bit unsigned integers that represent real numbers. */ + NEURON_TENSOR_QUANT16_ASYMM = 12, + /** A tensor of 8 bit signed integers that represent real numbers. */ + NEURON_TENSOR_QUANT8_SYMM = 13, + /** A tensor of 8 bit signed integers that represent real numbers. */ + NEURON_TENSOR_QUANT8_ASYMM_SIGNED = 14, + /** A reference to a model. */ + NEURON_MODEL = 15, + /** Extended data type - tensor uint32 */ + NEURON_EXT_TENSOR_UINT32 = 9001, + /** Extended data type -A tensor of 8 bit unsigned integers that represent + real numbers. */ + NEURON_EXT_TENSOR_QUANT8_ASYMM_PER_CHANNEL = 9002, + /** Extended data type -A tensor of 4 bit unsigned integers that represent + real numbers. */ + NEURON_EXT_TENSOR_QUANT4_ASYMM = 9003, + /** Extended data type -A tensor of 4 bit signed integers that represent real + numbers. */ + NEURON_EXT_TENSOR_QUANT4_ASYMM_SIGNED = 9004, + /** Extended data type -A tensor of 4 bit signed integers that represent real + numbers. */ + NEURON_EXT_TENSOR_QUANT4_SYMM = 9005, + /** Extended data type -A tensor of 16 bit signed integers that represent real + numbers. */ + NEURON_EXT_TENSOR_QUANT16_ASYMM_SIGNED = 9006, + /** Extended data type -A raw tensor. */ + NEURON_EXT_TENSOR_RAW = 9007, + /** Extended data type -A tensor of 8 bit signed integers that represent real + numbers. */ + NEURON_EXT_TENSOR_QUANT8_ASYMM_SIGNED_PER_CHANNEL = 9008, +}; + +/** + * NeuronOperandType describes the type of an operand. + * This structure is used to describe both scalars and tensors. + */ +typedef struct NeuronOperandType { + /** The data type, e.g NEURON_INT8. */ + int32_t type; + /** The number of dimensions. It should be 0 for scalars. */ + uint32_t dimensionCount; + /** The dimensions of the tensor. It should be nullptr for scalars. */ + const uint32_t* dimensions; + /** + * These two fields are only used for quantized tensors. + * They should be zero for scalars and non-fixed point tensors. + * The dequantized value of each entry is (value - zeroPoint) * scale. + */ + float scale; + /** Only used with scale for quantized tensors */ + int32_t zeroPoint; +} NeuronOperandType; + +/** + * Parameters for NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL operand. + */ +typedef struct NeuronSymmPerChannelQuantParams { + /** The index of the channel dimension. */ + uint32_t channelDim; + /** The size of the scale array. Should be equal to dimension[channelDim] of + * the Operand. 
*/ + uint32_t scaleCount; + /** The array of scaling values for each channel. Each value must be greater + * than zero. */ + const float* scales; +} NeuronSymmPerChannelQuantParams; + +/** + * Parameters for NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL and + * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL operand. + */ +typedef struct NeuronPerChannelQuantParams { + /** The index of the channel dimension. */ + uint32_t channelDim; + /** The size of the scale array. Should be equal to dimension[channelDim] of + * the Operand. */ + uint32_t scaleCount; + /** The array of scaling values for each channel. Each value must be greater + * than zero. */ + const float* scales; + /** The size of the zeroPoints. Should be equal to dimension[channelDim] of + * the Operand. */ + uint32_t zeroPointCount; + /** The array of zero point values for each channel. */ + const int32_t* zeroPoints; +} NeuronPerChannelQuantParams; + +/** + * Operation Types + * + * Supported operations are listed with available versions. See + * Neuron_getVersion for querying version number. + * + * Attempting to compile models with operations marked as not available + * will get a compilation failure. + * + * Refer to the operation support status of each hardware platform. + * Attempting to compile models with operations supported by this library but + * not supported by the underlying hardware platform will get a compilation + * failure too. + * + * Compatible NNAPI levels are also listed. + */ +typedef enum { + NEURON_ADD = 0, ///< Available since 4.1.0. NNAPI level 30. + NEURON_AVERAGE_POOL_2D = 1, ///< Available since 4.1.0. NNAPI level 30. + NEURON_CONCATENATION = 2, ///< Available since 4.1.0. NNAPI level 30. + NEURON_CONV_2D = 3, ///< Available since 4.1.0. NNAPI level 30. + NEURON_DEPTHWISE_CONV_2D = 4, ///< Available since 4.1.0. NNAPI level 30. + NEURON_DEPTH_TO_SPACE = 5, ///< Available since 4.1.0. NNAPI level 30. + NEURON_DEQUANTIZE = 6, ///< Available since 4.1.0. NNAPI level 30. + NEURON_EMBEDDING_LOOKUP = 7, ///< Not available. + NEURON_FLOOR = 8, ///< Available since 4.1.0. NNAPI level 30. + NEURON_FULLY_CONNECTED = 9, ///< Available since 4.1.0. NNAPI level 30. + NEURON_HASHTABLE_LOOKUP = 10, ///< Not available. + NEURON_L2_NORMALIZATION = 11, ///< Available since 4.1.0. NNAPI level 30. + NEURON_L2_POOL_2D = 12, ///< Available since 4.1.0. NNAPI level 30. + NEURON_LOCAL_RESPONSE_NORMALIZATION = 13, ///< Not available. + NEURON_LOGISTIC = 14, ///< Available since 4.1.0. NNAPI level 30. + NEURON_LSH_PROJECTION = 15, ///< Not available. + NEURON_LSTM = 16, ///< Not available. + NEURON_MAX_POOL_2D = 17, ///< Available since 4.1.0. NNAPI level 30. + NEURON_MUL = 18, ///< Available since 4.1.0. NNAPI level 30. + NEURON_RELU = 19, ///< Available since 4.1.0. NNAPI level 30. + NEURON_RELU1 = 20, ///< Available since 4.1.0. NNAPI level 30. + NEURON_RELU6 = 21, ///< Available since 4.1.0. NNAPI level 30. + NEURON_RESHAPE = 22, ///< Available since 4.1.0. NNAPI level 30. + NEURON_RESIZE_BILINEAR = 23, ///< Available since 4.1.0. NNAPI level 30. + NEURON_RNN = 24, ///< Not available. + NEURON_SOFTMAX = 25, ///< Available since 4.1.0. NNAPI level 30. + NEURON_SPACE_TO_DEPTH = 26, ///< Available since 4.1.0. NNAPI level 30. + NEURON_SVDF = 27, ///< Not available. + NEURON_TANH = 28, ///< Available since 4.1.0. NNAPI level 30. + NEURON_BATCH_TO_SPACE_ND = 29, ///< Available since 4.1.0. NNAPI level 30. + NEURON_DIV = 30, ///< Available since 4.1.0. NNAPI level 30. + NEURON_MEAN = 31, ///< Available since 4.1.0. NNAPI level 30. 
+ NEURON_PAD = 32, ///< Available since 4.1.0. NNAPI level 30. + NEURON_SPACE_TO_BATCH_ND = 33, ///< Available since 4.1.0. NNAPI level 30. + NEURON_SQUEEZE = 34, ///< Available since 4.1.0. NNAPI level 30. + NEURON_STRIDED_SLICE = 35, ///< Available since 4.1.0. NNAPI level 30. + NEURON_SUB = 36, ///< Available since 4.1.0. NNAPI level 30. + NEURON_TRANSPOSE = 37, ///< Available since 4.1.0. NNAPI level 30. + NEURON_ABS = 38, ///< Available since 4.1.0. NNAPI level 30. + NEURON_ARGMAX = 39, ///< Available since 4.1.0. NNAPI level 30. + NEURON_ARGMIN = 40, ///< Available since 4.1.0. NNAPI level 30. + NEURON_AXIS_ALIGNED_BBOX_TRANSFORM = + 41, ///< Available since 4.1.0. NNAPI level 30. + NEURON_BIDIRECTIONAL_SEQUENCE_LSTM = 42, ///< Not available. + NEURON_BIDIRECTIONAL_SEQUENCE_RNN = 43, ///< Not available. + NEURON_BOX_WITH_NMS_LIMIT = 44, ///< Available since 4.1.0. NNAPI level 30. + NEURON_CAST = 45, ///< Available since 4.1.0. NNAPI level 30. + NEURON_CHANNEL_SHUFFLE = 46, ///< Available since 4.1.0. NNAPI level 30. + NEURON_DETECTION_POSTPROCESSING = 47, ///< Not available. + NEURON_EQUAL = 48, ///< Available since 4.1.0. NNAPI level 30. + NEURON_EXP = 49, ///< Available since 4.1.0. NNAPI level 30. + NEURON_EXPAND_DIMS = 50, ///< Available since 4.1.0. NNAPI level 30. + NEURON_GATHER = 51, ///< Available since 4.1.0. NNAPI level 30. + NEURON_GENERATE_PROPOSALS = 52, ///< Not available. + NEURON_GREATER = 53, ///< Available since 4.1.0. NNAPI level 30. + NEURON_GREATER_EQUAL = 54, ///< Available since 4.1.0. NNAPI level 30. + NEURON_GROUPED_CONV_2D = 55, ///< Available since 4.1.0. NNAPI level 30. + NEURON_HEATMAP_MAX_KEYPOINT = 56, ///< Available since 4.1.0. NNAPI level 30. + NEURON_INSTANCE_NORMALIZATION = + 57, ///< Available since 4.1.0. NNAPI level 30. + NEURON_LESS = 58, ///< Available since 4.1.0. NNAPI level 30. + NEURON_LESS_EQUAL = 59, ///< Available since 4.1.0. NNAPI level 30. + NEURON_LOG = 60, ///< Not available. + NEURON_LOGICAL_AND = 61, ///< Available since 4.1.0. NNAPI level 30. + NEURON_LOGICAL_NOT = 62, ///< Available since 4.1.0. NNAPI level 30. + NEURON_LOGICAL_OR = 63, ///< Available since 4.1.0. NNAPI level 30. + NEURON_LOG_SOFTMAX = 64, ///< Not available. + NEURON_MAXIMUM = 65, ///< Available since 4.1.0. NNAPI level 30. + NEURON_MINIMUM = 66, ///< Available since 4.1.0. NNAPI level 30. + NEURON_NEG = 67, ///< Available since 4.1.0. NNAPI level 30. + NEURON_NOT_EQUAL = 68, ///< Available since 4.1.0. NNAPI level 30. + NEURON_PAD_V2 = 69, ///< Available since 4.1.0. NNAPI level 30. + NEURON_POW = 70, ///< Available since 4.1.0. NNAPI level 30. + NEURON_PRELU = 71, ///< Available since 4.1.0. NNAPI level 30. + NEURON_QUANTIZE = 72, ///< Available since 4.1.0. NNAPI level 30. + NEURON_QUANTIZED_16BIT_LSTM = 73, ///< Available since 4.1.0. NNAPI level 30. + NEURON_RANDOM_MULTINOMIAL = 74, ///< Not available. + NEURON_REDUCE_ALL = 75, ///< Available since 4.1.0. NNAPI level 30. + NEURON_REDUCE_ANY = 76, ///< Available since 4.1.0. NNAPI level 30. + NEURON_REDUCE_MAX = 77, ///< Available since 4.1.0. NNAPI level 30. + NEURON_REDUCE_MIN = 78, ///< Available since 4.1.0. NNAPI level 30. + NEURON_REDUCE_PROD = 79, ///< Not available. + NEURON_REDUCE_SUM = 80, ///< Available since 4.1.0. NNAPI level 30. + NEURON_ROI_ALIGN = 81, ///< Available since 4.1.0. NNAPI level 30. + NEURON_ROI_POOLING = 82, ///< Not available. + NEURON_RSQRT = 83, ///< Available since 4.1.0. NNAPI level 30. + NEURON_SELECT = 84, ///< Available since 4.1.0. NNAPI level 30. 
+ NEURON_SIN = 85, ///< Not available. + NEURON_SLICE = 86, ///< Available since 4.1.0. NNAPI level 30. + NEURON_SPLIT = 87, ///< Available since 4.1.0. NNAPI level 30. + NEURON_SQRT = 88, ///< Available since 4.1.0. NNAPI level 30. + NEURON_TILE = 89, ///< Available since 4.1.0. NNAPI level 30. + NEURON_TOPK_V2 = 90, ///< Available since 4.1.0. NNAPI level 30. + NEURON_TRANSPOSE_CONV_2D = 91, ///< Available since 4.1.0. NNAPI level 30. + NEURON_UNIDIRECTIONAL_SEQUENCE_LSTM = 92, ///< Not available. + NEURON_UNIDIRECTIONAL_SEQUENCE_RNN = 93, ///< Not available. + NEURON_RESIZE_NEAREST_NEIGHBOR = + 94, ///< Available since 4.1.0. NNAPI level 30. + NEURON_QUANTIZED_LSTM = 95, ///< Not available. + NEURON_IF = 96, ///< Available since 4.1.0. NNAPI level 30. + NEURON_WHILE = 97, ///< Available since 4.1.0. NNAPI level 30. + NEURON_ELU = 98, ///< Not available. + NEURON_HARD_SWISH = 99, ///< Available since 4.1.0. NNAPI level 30. + NEURON_FILL = 100, ///< Available since 4.1.0. NNAPI level 30. + NEURON_RANK = 101, ///< Not available. + NEURON_BATCH_MATMUL = 102, ///< Available since 5.1.2. NNAPI FL6. + NEURON_PACK = 103, ///< Not available. + NEURON_MIRROR_PAD = 104, ///< Not available. + NEURON_MIRROR_REVERSE = 105, ///< Not available. + /** + * Decompress HyFBC to YUV420 frame, support both YUV420_8BITS and + * YUV420_10BITS formats. HyFBC (Hybrid Frame Buffer Compression) is a + * compressed format used by video decoder (VDEC). This format uses YUV420 to + * compress. + * + * For input part, need to set two inputs with different shape, representing Y + * and UV plane respectively. The same HyFBC data will be used for both + * inputs. Similarly, the output part also needs to be set to two, + * representing Y and UV plane respectively. + * + * The shape of the two inputs/ outputs (inputY, inputUV, outputY, outputUV) + * depends on the original images' shape ([batches, height, width, channels]). + * Both height and width shold follow 64 alignment rule. For example, if + * original height is 480, its 64 alignment should be 512. For Y plane, + * channel size should be 1; for UV plane, channel size should be 2. Besides, + * the height and width of UV plane should be half of Y's height and width. + * Example: + * + * original_img.shape = [1, 384, 640, 3] + * inputY.shape = [1, 384, 640, 1] + * inputUV.shape = [1, 192, 320, 2] + * outputY.shape = [1, 384, 640, 1] + * outputUV.shape = [1, 192, 320, 2] + * + * Supported tensor {@link OperandCode}: + * * {@link NEURON_EXT_TENSOR_RAW} (for inputY, inputUV) + * * {@link NEURON_TENSOR_QUANT8_ASYMM} (for outputY, outputUV) + * * {@link NEURON_TENSOR_QUANT16_ASYMM} (for outputY, outputUV) + * Note: + * If image mode is YUV420_8BITS, use NEURON_TENSOR_QUANT8_ASYMM; if mode is + * YUV420_10BITS, use NEURON_TENSOR_QUANT16_ASYMM. + * + * Tensor rank: both input and output require rank 4, with "NHWC" data layout. + * + * Inputs: + * * 0: inputY, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor. + * * 1: inputUV, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor. + * * 2: YHeaderAlignment, an {@link NEURON_INT32} scalar, specifying + * the header alignment in Hyfbc format. + * * 3: UVHeaderAlignment, an {@link NEURON_INT32} scalar, specifying + * the header alignment in Hyfbc format. + * * 4: xAlign, an {@link NEURON_INT32} scalar, specifying the frame + * width alignment of video decoder. + * * 5: yAlign, an {@link NEURON_INT32} scalar, specifying the frame + * height alignment of video decoder. 
+ * * 6: xOffset, an {@link NEURON_INT32} scalar, specifying the frame + * width offset of video decoder. + * * 7: yOffset, an {@link NEURON_INT32} scalar, specifying the frame + * height offset of video decoder. + * * 8: mode, an {@link NEURON_INT32} scalar. Set to 0 for + * YUV420_8BITS. Set to 1 for YUV420_10BITS. Note that 8b, 10b here means the + * compressed bit width in Hyfbc frame, where the decompressed YUV420 is 8b + * for Hyfbc_8b, and YUV420 is 16b for Hyfbc_10b. + * * 9: outPitchN, an {@link NEURON_INT32} scalar, specifying the + * YUV420 N-axis pitch. Must be set to 1, because only a single batch is + * supported for HyfbcDecompress. + * * 10: outPitchH, an {@link NEURON_INT32} scalar, specifying the + * YUV420 H-axis pitch. Set to the original compressed image height with video + * codec alignment. + * * 11: outPitchW, an {@link NEURON_INT32} scalar, specifying the + * YUV420 W-axis pitch. Set to the original compressed image width with video + * codec alignment. + * * 12: outPitchC, an {@link NEURON_INT32} scalar, specifying the + * YUV420 C-axis pitch. Set to 1 for interleaved YUV420. + * + * Outputs: + * * 0: output Y, a 4-D tensor. Tensor type can be either {@link + * NEURON_TENSOR_QUANT8_ASYMM} or {@link + * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. + * * 1: output UV, a 4-D tensor. Tensor type can be either {@link + * NEURON_TENSOR_QUANT8_ASYMM} or {@link + * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. + * + * Available since NeuroPilot 7.0.0. + */ + NEURON_HYFBCTOYUV420 = 106, + /** + * Compress YUV420 to AFBC frame, support both YUV420_8BITS and + * YUV420_10BITS formats. AFBC (Arm Frame Buffer Compression) is a lossless + * compressed image format, created by ARM to reduce the size of images. + * + * For input part, need to set two inputs with different shape, representing Y + * and UV plane respectively. For output part, need to set one output for + * AFBC. + * + * The shape of the two inputs (inputY, inputUV) and output (AFBC) + * depends on the original images' shape ([batches, height, width, channels]). + * Both height and width shold follow 64 alignment rule. For example, if + * original height is 480, its 64 alignment should be 512. For Y plane, + * channel size should be 1; for UV plane, channel size should be 2. Besides, + * the height and width of UV plane should be half of Y's height and width. + * For AFBC output, its height shoud be 3/2 of Y's height, and its width + * equals to Y's width. Example: + * + * original_img.shape = [1, 384, 640, 3] + * inputY.shape = [1, 384, 640, 1] + * inputUV.shape = [1, 192, 320, 2] + * output.shape = [1, 576, 640, 1] + * + * Supported tensor {@link OperandCode}: + * * {@link NEURON_EXT_TENSOR_RAW} (for output) + * * {@link NEURON_TENSOR_QUANT8_ASYMM} (for inputY, inputUV) + * * {@link NEURON_TENSOR_QUANT16_ASYMM} (for inputY, inputUV) + * Note: + * If image mode is YUV420_8BITS, use NEURON_TENSOR_QUANT8_ASYMM; if mode is + * YUV420_10BITS, use NEURON_TENSOR_QUANT16_ASYMM. + * + * Tensor rank: both input and output require rank 4, with "NHWC" data layout. + * + * Inputs: + * * 0: inputY, a 4-D tensor. Tensor type can be either {@link + * NEURON_TENSOR_QUANT8_ASYMM} or {@link + * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. + * * 1: inputUV, a 4-D tensor. Tensor type can be either {@link + * NEURON_TENSOR_QUANT8_ASYMM} or {@link + * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. 
+ * * 2: HeaderAlignment, an {@link NEURON_INT32} scalar, specifying + * the header alignment in AFBC format. + * * 3: xAlign, an {@link NEURON_INT32} scalar, specifying the frame + * width alignment of AFBC format. + * * 4: yAlign, an {@link NEURON_INT32} scalar, specifying the frame + * height alignment of AFBC format. + * * 5: xOffset, an {@link NEURON_INT32} scalar, specifying the frame + * width offset of AFBC format. + * * 6: yOffset, an {@link NEURON_INT32} scalar, specifying the frame + * height offset of AFBC format. + * * 7: mode, an {@link NEURON_INT32} scalar. Set to 0 for + * YUV420_8BITS. Set to 1 for YUV420_10BITS. Note that 8b, 10b here means the + * compressed bit width in AFBC frame, where the YUV420 must be 8b for + * AFBC_8b, and must be 16b for AFBC_10b. + * * 8: inPitchN, an {@link NEURON_INT32} scalar, specifying the + * YUV420 N-axis pitch. Must be set to 1, because only a single batch is + * supported for AfbcCompress. + * * 9: inPitchH, an {@link NEURON_INT32} scalar, specifying the + * YUV420 H-axis pitch. Set to the expected compressed image height. + * * 10: inPitchW, an {@link NEURON_INT32} scalar, specifying the + * YUV420 W-axis pitch. Set to the expected compressed image height. + * * 11: inPitchC, an {@link NEURON_INT32} scalar, specifying the + * YUV420 C-axis pitch. Set to 1 for interleaved YUV420. + * + * Outputs: + * * 0: output, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor. + * + * Available since NeuroPilot 7.0.0. + */ + NEURON_YUV420TOAFBC = 107, + NEURON_NUMBER_OF_OPERATIONS, +} NeuronOperationType; + +/** + * Fused activation function types. + */ +typedef enum { + // NO fused activation function. + NEURON_FUSED_NONE = 0, + // Fused ReLU activation function. + NEURON_FUSED_RELU = 1, + // Fused ReLU1 activation function. + NEURON_FUSED_RELU1 = 2, + // Fused ReLU6 activation function. + NEURON_FUSED_RELU6 = 3, +} NeuronAdapterFuseCode; + +/** + * Implicit padding algorithms. + */ +typedef enum { + /** + * SAME padding. + * Padding on both ends are the "same": + * padding_to_beginning = total_padding / 2 + * padding_to_end = (total_padding + 1)/2. + * i.e., for even number of padding, padding to both ends are exactly + * the same; for odd number of padding, padding to the ending is bigger + * than the padding to the beginning by 1. + * + * total_padding is a function of input, stride and filter size. + * It could be computed as follows: + * out_size = (input + stride - 1) / stride; + * needed_input = (out_size - 1) * stride + filter_size + * total_padding = max(0, needed_input - input_size) + * The computation is the same for the horizontal and vertical directions. + */ + NEURON_PADDING_SAME = 1, + + /** + * VALID padding. + * No padding. When the input size is not evenly divisible by + * the filter size, the input at the end that could not fill + * the whole filter tile will simply be ignored. + */ + NEURON_PADDING_VALID = 2, +} NeuronAdapterPaddingCode; + +/** + * Execution preferences. + */ +typedef enum { + /* Prefer executing in a way that minimizes battery drain. */ + NEURON_PREFER_LOW_POWER = 0, + /* Prefer executing as fast as possible. (more power consumption)*/ + NEURON_PREFER_FAST_SINGLE_ANSWER = 1, + /* Prefer maximizing the throughput of successive frames */ + NEURON_PREFER_SUSTAINED_SPEED = 2, + /* Prefer executing with turbo boost. (most power consumption) */ + NEURON_PREFER_TURBO_BOOST = 3, +} NeuronAdapterPreferenceCode; + +/** + * Relative execution priority. 
+ */ +typedef enum { + NEURON_PRIORITY_LOW = 90, + NEURON_PRIORITY_MEDIUM = 100, + NEURON_PRIORITY_HIGH = 110, + NEURON_PRIORITY_DEFAULT = NEURON_PRIORITY_MEDIUM, +} NeuronAdapterPriorityCode; + +/** + * Compiler optimization hint. + */ +typedef enum { + /** + * Normal optimization. + * Available since 4.3.1 + */ + NEURON_OPTIMIZATION_NORMAL = 0, + /** + * Reduce latency by utilizing as many APU cores as possible. + * Available since 4.3.1 + */ + NEURON_OPTIMIZATION_LOW_LATENCY = 1 << 0, + /** + * Reducing DRAM access as more as possible. + * Available since 4.4.0 + */ + NEURON_OPTIMIZATION_DEEP_FUSION = 1 << 1, + /** + * Reduce latency by using as many APU cores as possible in batch-dimension. + * (For models with batch > 1) + * Available since 4.4.0 + */ + NEURON_OPTIMIZATION_BATCH_PROCESSING = 1 << 2, + /** + * Default optimization setting. + * Available since 4.3.1 + */ + NEURON_OPTIMIZATION_DEFAULT = NEURON_OPTIMIZATION_NORMAL, +} OptimizationCode; + +/** + * CPU cache flush hint. + */ +typedef enum { + /** + * Sync input buffer and invalidate output buffer. + * Available since 5.0.1 + */ + NEURON_CACHE_FLUSH_ENABLE_ALL = 0, + /** + * Disable sync input buffer. + * Available since 5.0.1 + */ + NEURON_CACHE_FLUSH_DISABLE_SYNC_INPUT = 1 << 0, + /** + * Disable invalidate output buffer. + * Available since 5.0.1 + */ + NEURON_CACHE_FLUSH_DISABLE_INVALIDATE_OUTPUT = 1 << 1, + /** + * Default cache flush setting. + * Available since 5.0.1 + */ + NEURON_CACHE_FLUSH_DEFAULT = NEURON_CACHE_FLUSH_ENABLE_ALL, +} CacheFlushCode; + +/** + * Compilation Type. + */ +typedef enum { + /* Normal Compilation Available since 7.0.0 */ + COMPILATION_TYPE_NORMAL = 0, + /* @deprecate */ + COMPILATION_TYPE_DEBUG_PLUS = 1, + /* Batched Execution: Set input/output from memory every time. + * Available since 7.0.0 + */ + COMPILATION_TYPE_BATCHED = 2, + /* One compilation with multi-executions could be created. + * Available since 7.0.0 + */ + COMPILATION_TYPE_MULTI_EXECUTIONS = 3, + /* Batched Execution: Set input/output from memory 1st time and memcpy next + * time. Available since 7.0.1 + */ + COMPILATION_TYPE_EXECUTION_CONTROLLER = 4, +} CompilationType; + +/** + * Supported Feature + */ +typedef enum { + NEURON_FEATURE_NONE = 0, + NEURON_THROUGHPUT_MODE = 1, +} NeuronFeatureType; + +/** + * The structure to represent the neuron version. + */ +typedef struct { + uint8_t major; ///< major version + uint8_t minor; ///< minor version + uint8_t patch; ///< patch version +} NeuronRuntimeVersion; + +/** + * Get the version of Neuron runtime library. + * + * @param version the version of Neuron runtime library. + * @return NEURON_NO_ERROR + */ +int Neuron_getVersion(NeuronRuntimeVersion* version); + +/** + * Get the supported status of feature. + * + * Available since 7.0.0 + * + * @param type input feature @NeuronFeatureType to check supported or not + * @param supported return the supported status + * @return NEURON_NO_ERROR if successful. + */ +int Neuron_getFeatureSupportedStatus(NeuronFeatureType type, bool* supported); + +/** + * Get the size of L1 memory in APU. + * + * Available since 4.3.0 + * + * @param sizeKb L1 memory size in KB + * @return NEURON_NO_ERROR if successful. + */ +int Neuron_getL1MemorySizeKb(uint32_t* sizeKb); + +/** + * Creates a shared memory object from a file descriptor. + * + * For ion descriptor, application should create the ion memory and descriptor + * first and then use it in this function. + * + * Available since 4.1.0 Only supports ion fd. 
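As a quick orientation for the query helpers declared above, the following sketch checks the runtime version, the APU L1 memory size, and a feature flag before using newer APIs. It is illustrative only: the include name "NeuronAdapter.h" and the helper names are assumptions, and error handling is reduced to early returns.

/* Hypothetical usage sketch; "NeuronAdapter.h" is an assumed header name. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include "NeuronAdapter.h"

static bool runtime_is_at_least(uint8_t major, uint8_t minor) {
  NeuronRuntimeVersion v;
  if (Neuron_getVersion(&v) != NEURON_NO_ERROR) return false;
  return (v.major > major) || (v.major == major && v.minor >= minor);
}

int query_platform_info(void) {
  uint32_t l1_kb = 0;
  bool throughput = false;
  /* Neuron_getL1MemorySizeKb is available since 4.3.0. */
  if (!runtime_is_at_least(4, 3) ||
      Neuron_getL1MemorySizeKb(&l1_kb) != NEURON_NO_ERROR) {
    return -1;
  }
  /* Neuron_getFeatureSupportedStatus is available since 7.0.0. */
  if (runtime_is_at_least(7, 0)) {
    Neuron_getFeatureSupportedStatus(NEURON_THROUGHPUT_MODE, &throughput);
  }
  printf("APU L1: %u KB, throughput mode supported: %d\n",
         (unsigned)l1_kb, (int)throughput);
  return 0;
}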
+ * + * @param size The requested size in bytes. Must not be larger than the file + * size. + * @protect The desired memory protection for the mapping. It is either + * PROT_NONE or the bitwise OR of one or more of the following flags: PROT_READ, + * PROT_WRITE. + * @fd The requested file descriptor. The file descriptor has to be mmap-able. + * @offset The offset to the beginning of the file of the area to map. + * @memory The memory object to be created. Set to NULL if unsuccessful. + */ +int NeuronMemory_createFromFd( + size_t size, + int protect, + int fd, + size_t offset, + NeuronMemory** memory); + +#ifdef __ANDROID__ +/** + * Creates a shared memory object from an AHardwareBuffer handle. + * + * We only support AHardwareBuffer with format AHARDWAREBUFFER_FORMAT_BLOB and + * it can only be used for Model inputs and outputs. + * + * The AHardwareBuffer with AHARDWAREBUFFER_FORMAT_BLOB format can be used the + * same way as shared memory created from a file handle. See NeuronMemory for + * description on how to use this shared memory. + * + * The provided AHardwareBuffer must outlive the NeuronMemory object. + * + * Available since 5.0.0 + * + * @param ahwb The AHardwareBuffer handle. + * @param memory The memory object to be created. + * Set to NULL if unsuccessful. + * + * @return NEURON_NO_ERROR if the request completed normally. + * + */ +int NeuronMemory_createFromAHardwareBuffer( + const AHardwareBuffer* ahwb, + NeuronMemory** memory); + +#else // __ANDROID__ + +/** + * Not supported at non-android platform + * + * @return NEURON_BAD_STATE + */ +int NeuronMemory_createFromAHardwareBuffer(); + +#endif + +/** + * Delete a memory object. + * + * For ion memory, this function cleans up the internal resource associated with + * this memory. Applications should clean up the allocated ion memory after this + * function. + * + * Available since 4.1.0 + */ +void NeuronMemory_free(NeuronMemory* memory); + +/** + * Create an empty NeuronModel. The model should be constructed with calls to + * NeuronModel_addOperation and NeuronModel_addOperand. + * + * Available since 4.1.0 + * + * @param model The NeuronModel to be created. Set to NULL if unsuccessful. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_create(NeuronModel** model); + +/** + * Destroy a model. The model need not have been finished by a call to + * NeuronModel_finish. + * + * Available since 4.1.0 + * + * @param model The model to be destroyed. + */ +void NeuronModel_free(NeuronModel* model); + +/** + * Indicate that we have finished modifying a model. Required before calling + * NeuronCompilation_compile. + * + * Available since 4.1.0 + * + * @param model The model to be finished. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_finish(NeuronModel* model); + +/** + * Add an operand to a model. The order in which the operands are added is + * important. The first one added to a model will have the index value 0, the + * second 1, etc. These indexes are used as operand identifiers in + * NeuronModel_addOperation. + * + * Available since 4.1.0 + * + * @param model The model to be modified. + * @param type The NeuronOperandType that describes the shape of the operand. + * Neither the NeuronOperandType nor the dimensions it points to need to outlive + * the call to NeuronModel_addOperand. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_addOperand(NeuronModel* model, const NeuronOperandType* type); + +/** + * Sets an operand to a constant value. 
+ * Values of length smaller than or equal to + * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES are immediately copied into the + * model. For values of length greater than + * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES, a pointer to the buffer is + * stored within the model. The application must not change the content of this + * region until all executions using this model have completed. As the data may + * be copied during processing, modifying the data after this call yields + * undefined results. + * + * Attempting to modify a model once NeuronModel_finish has been called will + * return an error. + * + * A special notice on the buffer lifetime when the length is greater than + * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES. The provided buffer must + * outlive the compilation of this model. I.e., the user must keep the buffer + * unchanged until NeuronCompilation_finish of this model. This is an internal + * optimization compared to NNAPI. In NNAPI, the NN runtime will copy the buffer to + * a shared memory between the NN runtime and the NNAPI HIDL service during + * ANNModel_finish, and it will be copied again to the compiled result during + * ANNCompilation_finish. In Neuron Adapter, there is only one copy, made + * during NeuronCompilation_finish, so it is required to keep the buffer alive + * until NeuronCompilation_finish has returned. + * + * Available since 4.1.0 + * + * @param model The model to be modified. + * @param index The index of the model operand we're setting. + * @param buffer A pointer to the data to use. + * @param length The size in bytes of the data value. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_setOperandValue( + NeuronModel* model, + int32_t index, + const void* buffer, + size_t length); +/** + * Sets an operand to a value that is a reference to another NeuronModel. + * + * The referenced model must already have been finished by a call to + * NeuronModel_finish. + * + * The NeuronModel_relaxComputationFloat32toFloat16 setting of referenced models + * is overridden by that setting of the main model of a compilation. + * + * The referenced model must outlive the model referring to it. + * + * Attempting to modify a model once NeuronModel_finish has been called will + * return an error. + * + * Available since 4.1.0 + * + * @param model The model to be modified. + * @param index The index of the model operand we're setting. + * @param value The model to be referenced. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_setOperandValueFromModel( + NeuronModel* model, + int32_t index, + const NeuronModel* value); + +/** + * Sets an operand's per channel quantization parameters. + * Sets parameters required by a tensor of type + * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL. This function must be called for every + * tensor of type NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL before calling + * NeuronModel_finish. + * + * Available since 4.1.0 + * + * @param model The model to be modified. + * @param index The index of the model operand we're setting. + * @param channelQuant The per channel quantization parameters for the operand. + * No memory in this struct needs to outlive the call to this function. + * + * @return NEURON_NO_ERROR if successful.
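Because the buffer-lifetime rule above differs from NNAPI, a small sketch may help. It adds a scalar operand whose small value is copied immediately, and a tensor operand whose larger buffer must stay valid until NeuronCompilation_finish, which is why it lives in static storage here. The operand indices (0 and 1), the helper name, and the include name are illustrative assumptions.

/* Illustrative fragment: constant operands and the buffer-lifetime rule. */
#include <stddef.h>
#include <stdint.h>
#include "NeuronAdapter.h"  /* assumed header name */

/* Placeholder weights; must stay valid until NeuronCompilation_finish. */
static const float kWeights[64] = {0};

int add_constant_operands(NeuronModel* model) {
  /* A small INT32 scalar: its 4 bytes are copied into the model immediately. */
  NeuronOperandType scalar = {NEURON_INT32, 0, NULL, 0.0f, 0};
  if (NeuronModel_addOperand(model, &scalar) != NEURON_NO_ERROR) return -1;
  int32_t fuse = NEURON_FUSED_NONE;
  /* Index 0 assumes this is the first operand added to the model. */
  if (NeuronModel_setOperandValue(model, 0, &fuse, sizeof(fuse)) != NEURON_NO_ERROR)
    return -1;

  /* A larger float tensor: only a pointer is stored, so kWeights must
   * outlive the compilation of this model. */
  uint32_t dims[1] = {64};
  NeuronOperandType weights = {NEURON_TENSOR_FLOAT32, 1, dims, 0.0f, 0};
  if (NeuronModel_addOperand(model, &weights) != NEURON_NO_ERROR) return -1;
  return NeuronModel_setOperandValue(model, 1, kWeights, sizeof(kWeights));
}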
+ */ +int NeuronModel_setOperandSymmPerChannelQuantParams( + NeuronModel* model, + int32_t index, + const NeuronSymmPerChannelQuantParams* channelQuant); + +/** + * Sets an operand's per channel quantization parameters + * Sets parameters required by a tensor of type + * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL or + * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL. + * This function must be called for every tensor of type + * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL or + * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL before calling NeuronModel_finish. + * + * Available since 6.0.0 + * + * @param model The model to be modified. + * @param index The index of the model operand we're setting. + * @param channelQuant The per channel quantization parameters(include + * per-channel offset) for the operand. No memory in this struct needs to + * outlive the call to this function. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_setOperandPerChannelQuantParams( + NeuronModel* model, + int32_t index, + const NeuronPerChannelQuantParams* channelQuant); + +/** + * Add an operation to a model. + * The operands specified by inputs and outputs must have been previously added + * by calls to NeuronModel_addOperand. + * + * Available since 4.1.0 + * + * @param model The model to be modified. + * @param type The NeuronOperationType of the operation. + * @param inputCount The number of entries in the inputs array. + * @param inputs An array of indexes identifying each operand. + * @param outputCount The number of entries in the outputs array. + * @param outputs An array of indexes identifying each operand. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_addOperation( + NeuronModel* model, + NeuronOperationType type, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); + +/** + * Add an operation extension to a model. + * The operands specified by inputs and outputs must have been previously added + * by calls to NeuronModel_addOperand. User needs to specify the operation + * extension name and the desired device which will execute the operation + * extension. + * + * Available since 4.1.0 + * + * @param model The model to be modified. + * @param name The name of the operation extension. + * @param vendor The name of the vendor which will implement the operation + * extension. + * @param device The device which will execute the operation extension. + * @param inputCount The number of entries in the inputs array. + * @param inputs An array of indexes identifying each operand. + * @param outputCount The number of entries in the outputs array. + * @param outputs An array of indexes identifying each operand. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_addOperationExtension( + NeuronModel* model, + const char* name, + const char* vendor, + const NeuronDevice* device, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); + +/** + * Specfifies which operands will be the model's inputs and outputs. + * An operand cannot be used for both input and output. Doing so will return an + * error. + * + * The operands specified by inputs and outputs must have been + * previously added by calls to NeuronModel_addOperand. + * + * Attempting to modify a model once NeuronModel_finish has been + * called will return an error. + * + * Available since 4.1.0 + * + * @param model The model to be modified. + * @param inputCount The number of entries in the inputs array. 
+ * @param inputs An array of indexes identifying the input operands. + * @param outputCount The number of entries in the outputs array. + * @param outputs An array of indexes identifying the output operands. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_identifyInputsAndOutputs( + NeuronModel* model, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); + +/** + * Gets the supported operations in a model. + * This function must be called after calling NeuronModel_finish + * + * Available since 4.1.0 + * + * @param model The model to be queried. + * @param supported The boolean array to be filled. True means supported. The + * size of the boolean array must be at least as large as the number of + * operations in the model. The order of elements in the supported array matches + * the order in which the corresponding operations were added to the model. + * @param operationCount number of operations in the model + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_getSupportedOperations( + NeuronModel* model, + bool* supported, + uint32_t operationCount); + +/** + * Get the supported operations for a specified set of devices. + * If multiple devices are selected, the supported operation list is a union of + * supported operations of all selected devices. + * + * Available since 4.1.0 + * + * @param model The model to be queried. + * @param devices Selected devices + * @param numDevices Number of selected devices + * @param supportedOps The boolean array to be filled. True means supported. The + * size of the boolean array must be as least as large as the number of + * operations in the model. The order of elements in the supportedOps array + * matches the order in which the corresponding operations were added to the + * model. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_getSupportedOperationsForDevices( + const NeuronModel* model, + const NeuronDevice* const* devices, + uint32_t numDevices, + bool* supportedOps); + +/** + * Specifies whether NEURON_TENSOR_FLOAT32 is allowed to be calculated with + * range and/or precision as low as that of the IEEE 754 16-bit floating-point + * format. By default, NEURON_TENSOR_FLOAT32 must be calculated using at least + * the range and precision of the IEEE 754 32-bit floating-point format. + * + * Available since 4.1.0 + * + * @param model The model to be modified. + * @param allow 'true' indicates NEURON_TENSOR_FLOAT32 may be calculated with + * range and/or precision as low as that of the IEEE 754 16-bit floating point + * format. 'false' indicates NEURON_TENSOR_FLOAT32 must be calculated using at + * least the range and precision of the IEEE 754 32-bit floating point format. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_relaxComputationFloat32toFloat16( + NeuronModel* model, + bool allow); + +/** + * Hint compiler to suppress the input data conversion, the users have to + * convert the input data into platform-expected format before inference. + * + * Available since 4.2.0 + * + * @param model The model to be modified. + * @param suppress True to suppress the input data conversion. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_suppressInputConversion(NeuronModel* model, bool suppress); + +/** + * Hint compiler to suppress the output data conversion, the users have to + * convert the output data from platform-generated format before inference. 
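Putting the model-construction calls above together, here is a minimal sketch that builds a single-operation network. The operand layout of NEURON_ADD (two tensors plus a fused-activation scalar) is assumed to follow the NNAPI ADD convention referenced in the operation list; the include name is also an assumption, and most error checks are trimmed for brevity.

/* Builds out = in0 + in1 for 1x4 float tensors. */
#include <stddef.h>
#include <stdint.h>
#include "NeuronAdapter.h"  /* assumed header name */

int build_add_model(NeuronModel** out_model) {
  NeuronModel* model = NULL;
  if (NeuronModel_create(&model) != NEURON_NO_ERROR) return -1;

  uint32_t dims[2] = {1, 4};
  NeuronOperandType tensor = {NEURON_TENSOR_FLOAT32, 2, dims, 0.0f, 0};
  NeuronOperandType scalar = {NEURON_INT32, 0, NULL, 0.0f, 0};

  NeuronModel_addOperand(model, &tensor);  /* operand 0: in0 */
  NeuronModel_addOperand(model, &tensor);  /* operand 1: in1 */
  NeuronModel_addOperand(model, &scalar);  /* operand 2: fuse code */
  NeuronModel_addOperand(model, &tensor);  /* operand 3: out */

  int32_t fuse = NEURON_FUSED_NONE;
  NeuronModel_setOperandValue(model, 2, &fuse, sizeof(fuse));

  uint32_t op_inputs[3] = {0, 1, 2};
  uint32_t op_outputs[1] = {3};
  NeuronModel_addOperation(model, NEURON_ADD, 3, op_inputs, 1, op_outputs);

  uint32_t model_inputs[2] = {0, 1};
  uint32_t model_outputs[1] = {3};
  NeuronModel_identifyInputsAndOutputs(model, 2, model_inputs, 1, model_outputs);

  if (NeuronModel_finish(model) != NEURON_NO_ERROR) {
    NeuronModel_free(model);
    return -1;
  }
  *out_model = model;
  return 0;
}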
+ * + * Available since 4.2.0 + * + * @param model The model to be modified. + * @param suppress True to suppress the output data conversion. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_suppressOutputConversion(NeuronModel* model, bool suppress); + +/** + * Restore the compiled network using a user-provided buffer. + * + * The restored NeuronCompilation can be used to create an execution instance. + * The restored NeuronModel cannot be recompiled. + * + * Available since 4.3.0 + * + * @param model Restored model. + * @param compilation Restored compilation. + * @param buffer User provided buffer to restore the compiled network. + * @param size Size of the user provided buffer in bytes. + * @return NEURON_NO_ERROR if the compiled network is successfully restored from + * the user provided buffer. NEURON_BAD_DATA if it fails to load the compiled + * network; this can happen either because the version does not match or because + * the data is corrupted. + */ +int NeuronModel_restoreFromCompiledNetwork( + NeuronModel** model, + NeuronCompilation** compilation, + const void* buffer, + const size_t size); + +/** + * Restore the compiled network using a user-provided buffer. + * Supports multiple compilation types; choices are COMPILATION_TYPE_BATCHED, + * COMPILATION_TYPE_EXECUTION_CONTROLLER, and COMPILATION_TYPE_NORMAL. + * + * There are two ways to use Batched Compilation: + * 1) load from DLA. + * 2) create batched compilation directly. + * To load DLA, one should call NeuronCompilation_create and + * NeuronModel_restoreFromCompiledNetworkV2. To create directly, one should call + * NeuronCompilation_createForBatch. + * + * The restored NeuronCompilation can be used to create an execution instance. + * The restored NeuronModel cannot be recompiled. + * + * Available since 7.0.0 + * + * @param model Restored model. + * @param compilation Restored compilation. + * @param buffer User provided buffer to restore the compiled network. + * @param size Size of the user provided buffer in bytes. + * @param type Type of the compilation to be restored. + * @return NEURON_NO_ERROR if the compiled network is successfully restored from + * the user provided buffer. NEURON_BAD_DATA if it fails to load the compiled + * network; this can happen either because the version does not match or because + * the data is corrupted. + */ +int NeuronModel_restoreFromCompiledNetworkV2( + NeuronModel** model, + NeuronCompilation** compilation, + const void* buffer, + const size_t size, + const CompilationType& type); + +/** + * Set a string on the model that users can use to identify it. + * It is only used for debugging; the string can be dumped into the log so users + * can check the model behavior easily. + * + * Available since 7.0.0 + * + * @param model The model to be modified. + * @param name The string; the user can free the buffer 'name' after calling this API. + * @return NEURON_NO_ERROR if the string is set successfully. NEURON_UNEXPECTED_NULL + * if the input param is nullptr. + */ +int NeuronModel_setName(NeuronModel* model, const char* name); + +/** + * Create a NeuronCompilation to compile the given model. + * + * This function only creates the object. Compilation is only performed once + * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be + * called once all desired properties have been set on the compilation. + * NeuronModel_free should be called once the compilation is no longer needed. + * The provided model must outlive the compilation.
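To make the restore path concrete, here is a minimal sketch, under the assumption that the buffer was produced earlier by NeuronCompilation_storeCompiledNetwork (or read back from a cache file); the include name and helper name are assumptions.

#include <stddef.h>
#include "NeuronAdapter.h"  /* assumed header name */

/* buffer/size: a compiled network previously saved with
 * NeuronCompilation_storeCompiledNetwork. */
int restore_compiled_network(const void* buffer, size_t size,
                             NeuronModel** model,
                             NeuronCompilation** compilation) {
  *model = NULL;
  *compilation = NULL;
  int err = NeuronModel_restoreFromCompiledNetwork(model, compilation, buffer, size);
  if (err != NEURON_NO_ERROR) {
    /* NEURON_BAD_DATA: version mismatch or corrupted data. */
    return err;
  }
  /* The restored compilation can be used with NeuronExecution_create;
   * the restored model cannot be recompiled. */
  return NEURON_NO_ERROR;
}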
The model must already have + * been finished by a call to NeuronModel_finish. + * + * Available since 4.1.0 + * + * @param model The NeuronModel to be compiled. + * @param compilation The newly created object or NULL if unsuccessful. + * + * @return NEURON_NO_ERROR if successful + */ +int NeuronCompilation_create( + NeuronModel* model, + NeuronCompilation** compilation); + +/** + * Create a NeuronCompilation with different purpose to compile the given model. + * + * This function only creates the object. Compilation is only performed once + * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be + * called once all desired properties have been set on the compilation. + * NeuronModel_free should be called once the compilation is no longer needed. + * The provided model must outlive the compilation. The model must already have + * been finished by a call to NeuronModel_finish. + * + * Available since 7.0.1 + * + * @param model The NeuronModel to be compiled. + * @param type Type of the compilation needed to be created. + * @param options The options which used to create with compilation. + * @param compilation The newly created object or NULL if unsuccessful. + * + * @return NEURON_NO_ERROR if successful + */ +int NeuronCompilation_createV2( + NeuronModel* model, + CompilationType type, + const char* options, + NeuronCompilation** compilation); + +/** + * Destroy a compilation. + * + * Available since 4.1.0 + * + * @param compilation The compilation to be destroyed. + */ +void NeuronCompilation_free(NeuronCompilation* compilation); + +/** + * Compilation is finished once NeuronCompilation_finish is invoked. Required + * before calling NeuronExecution_create. This function must only be called once + * for a given compilation. + * + * Available since 4.1.0 + * + * @param compilation The compilation to be finished. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_finish(NeuronCompilation* compilation); + +/** + * Gets the supported operations in a model with specific optimized configures. + * This function must be called before calling NeuronCompilation_finish. + * + * Available since 7.0.0 + * + * @param compilation The compilation to be queried. + * @param operationCount number of operations in the model + * @param supported The boolean array to be filled. True means supported. The + * size of the boolean array must be at least as large as the number of + * operations in the model. The order of elements in the supported array matches + * the order in which the corresponding operations were added to the model. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_getSupportedOperations( + NeuronCompilation* compilation, + uint32_t operationCount, + bool* supported); + +/** + * Provides optional caching information for faster re-compilation. + * + * Available since 4.1.0 + * + * @param compilation The compilation to be cached. + * @param cacheDir The cache directory for storing and retrieving caching data. + * The user should choose a directory local to the application, and is + * responsible for managing the cache entries. + * @param token The token provided by the user to specify a model must be of + * length NEURON_BYTE_SIZE_OF_CACHE_TOKEN. The user should ensure that the token + * is unique to a model within the application. Neuron cannot detect token + * collisions; a collision will result in a failed execution or in a successful + * execution that produces incorrect output values. 
+ * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_setCaching( + NeuronCompilation* compilation, + const char* cacheDir, + const uint8_t* token); + +/** + * Hint the compiler with the size of L1 memory; this value should not be larger + * than the real platform's setting. The user can get the platform's L1 memory size + * in KB by calling Neuron_getL1MemorySizeKb. + * + * Available since 4.3.0 + * + * @param compilation The compilation to be modified. + * @param sizeKb L1 memory size in KB. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_setL1MemorySizeKb( + NeuronCompilation* compilation, + uint32_t sizeKb); + +/** + * Create a NeuronCompilation to compile the given model for a specified set of + * devices. The user must handle all compilation and execution failures from the + * specified set of devices. This is in contrast to a use of + * NeuronCompilation_create, where Neuron will attempt to recover from such + * failures. + * + * Available since 4.1.0 + * + * @param model The NeuronModel to be compiled. + * @param devices The set of devices. Must not contain duplicates. + * @param numDevices The number of devices in the set. + * @param compilation The newly created object or NULL if unsuccessful. + * + * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the model is + * invalid. + */ +int NeuronCompilation_createForDevices( + NeuronModel* model, + const NeuronDevice* const* devices, + uint32_t numDevices, + NeuronCompilation** compilation); + +/** + * Create a NeuronCompilation which can divide one graph into several subgraphs + * and use that information for debugging. + * + * Only to be used for debugging purposes; there are no guarantees on performance + * or thread safety. + * + * Available since 5.0.0 + * + * @param model The NeuronModel to be compiled. + * @param compilation The newly created object or NULL if unsuccessful. + * + * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the model is + * invalid. + */ +int NeuronCompilation_createForDebug( + NeuronModel* model, + NeuronCompilation** compilation); + +/** + * Sets the execution preference associated with this compilation. + * + * The default value of the preference is NEURON_PREFER_FAST_SINGLE_ANSWER. + * + * Available since 4.1.0 + * + * @param compilation The compilation to be modified. + * @param preference Either NEURON_PREFER_LOW_POWER, + * NEURON_PREFER_FAST_SINGLE_ANSWER, or NEURON_PREFER_SUSTAINED_SPEED. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_setPreference( + NeuronCompilation* compilation, + int32_t preference); + +/** + * Sets the execution priority associated with this compilation. + * + * Execution priorities are relative to other executions created by the same + * application (specifically same uid) for the same device. Specifically, + * priorities of executions from one application will not affect executions from + * another application. + * + * Higher priority executions may use more compute resources than lower priority + * executions, and may preempt or starve lower priority executions. + * + * Available since 4.1.0 + * + * @param compilation The compilation to be modified. + * @param priority The relative priority of the execution compared to other + * executions created by the application. Must be one of NEURON_PRIORITY_*. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_setPriority(NeuronCompilation* compilation, int priority); + +/** + * Get the padded dimensional information of the specified input operand of the + * compilation.
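A typical compile-and-configure sequence using the functions above might look like the following sketch. The cache token is a dummy value, NEURON_BYTE_SIZE_OF_CACHE_TOKEN is assumed to be a compile-time constant, and the helper name and include name are illustrative.

#include <stddef.h>
#include <stdint.h>
#include "NeuronAdapter.h"  /* assumed header name */

/* Compile a finished model with a few common hints. */
int compile_model(NeuronModel* model, const char* cache_dir,
                  NeuronCompilation** out_compilation) {
  NeuronCompilation* compilation = NULL;
  if (NeuronCompilation_create(model, &compilation) != NEURON_NO_ERROR) return -1;

  NeuronCompilation_setPreference(compilation, NEURON_PREFER_SUSTAINED_SPEED);
  NeuronCompilation_setPriority(compilation, NEURON_PRIORITY_DEFAULT);

  /* Optionally cap the L1 hint to what the platform actually has. */
  uint32_t l1_kb = 0;
  if (Neuron_getL1MemorySizeKb(&l1_kb) == NEURON_NO_ERROR && l1_kb > 0) {
    NeuronCompilation_setL1MemorySizeKb(compilation, l1_kb);
  }

  if (cache_dir != NULL) {
    /* A real application should use a token unique to the model. */
    static const uint8_t token[NEURON_BYTE_SIZE_OF_CACHE_TOKEN] = {0};
    NeuronCompilation_setCaching(compilation, cache_dir, token);
  }

  if (NeuronCompilation_finish(compilation) != NEURON_NO_ERROR) {
    NeuronCompilation_free(compilation);
    return -1;
  }
  *out_compilation = compilation;
  return 0;
}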
This function must be called after calling + * NeuronCompilation_finish. If NeuronModel_suppressInputConversion was not + * applied to the model to be compiled, the returned dimensions are the padded + * dimension after NeuronCompilation_finish to satisfy the optimization + * requirement from the underlying hardware accelerators. + * If NeuronModel_suppressInputConversion was applied to the model to be + * compiled, the returned dimensions are the same as the original dimensions + * given from user. + * + * Available since 4.2.0 + * + * @param compilation The compilation to be queried. + * @param index The index of the input operand we are querying. It is an index + * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the + * index associated with NeuronModel_addOperand. + * @param dimensions The dimension array to be filled. The size of the array + * must be exactly as large as the rank of the input operand to be queried in + * the model. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_getInputPaddedDimensions( + NeuronCompilation* compilation, + int32_t index, + uint32_t* dimensions); + +/** + * Get the padded dimensional information of the specified output operand of the + * compilation. This function must be called after calling + * NeuronCompilation_finish. If NeuronModel_suppressOutputConversion was not + * applied to the model to be compiled, the returned dimensions are the padded + * dimension after NeuronCompilation_finish to satisfy the optimization + * requirement from the underlying hardware accelerators. + * If NeuronModel_suppressOutputConversion was applied to the model to be + * compiled, the returned dimensions are the same as the original dimensions + * given from user. + * + * Available since 4.2.0 + * + * @param compilation The compilation to be queried. + * @param index The index of the output operand we are querying. It is an index + * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the + * index associated with NeuronModel_addOperand. + * @param dimensions The dimension array to be filled. The size of the array + * must be exactly as large as the rank of the output operand to be queried in + * the model. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_getOutputPaddedDimensions( + NeuronCompilation* compilation, + int32_t index, + uint32_t* dimensions); + +/** + * Get the expected buffer size (bytes) of the specified input operand of the + * compilation. If NeuronModel_suppressInputConversion was not applied to the + * model to be compiled, the returned size are the padded size after + * NeuronCompilation_finish to satisfy the optimization requirement from the + * underlying hardware accelerators. If NeuronModel_suppressInputConversion was + * applied to the model to be compiled, the returned size are the same as the + * original size given from user. + * + * Available since 4.2.0 + * + * @param compilation The compilation to be queried. + * @param index The index of the input operand we are querying. It is an index + * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the + * index associated with NeuronModel_addOperand. + * @param size the expected buffer size in bytes. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_getInputPaddedSize( + NeuronCompilation* compilation, + int32_t index, + size_t* size); + +/** + * Get the expected buffer size (bytes) of the specified output operand of the + * compilation. 
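The padded-size queries above are typically used to size staging buffers. A small sketch, assuming the caller already knows the rank of the input operand, with assumed include and helper names:

#include <stdint.h>
#include <stdlib.h>
#include "NeuronAdapter.h"  /* assumed header name */

/* Allocate an input staging buffer using the padded size reported by the
 * finished compilation. rank must match the rank of input operand `index`. */
void* alloc_padded_input(NeuronCompilation* compilation, int32_t index,
                         uint32_t rank, size_t* out_size) {
  size_t padded_bytes = 0;
  if (NeuronCompilation_getInputPaddedSize(compilation, index, &padded_bytes) !=
      NEURON_NO_ERROR) {
    return NULL;
  }
  /* The padded dimensions can be useful when laying out the data manually. */
  uint32_t* dims = (uint32_t*)calloc(rank, sizeof(uint32_t));
  if (dims != NULL) {
    NeuronCompilation_getInputPaddedDimensions(compilation, index, dims);
    free(dims);  /* Only queried for illustration here. */
  }
  *out_size = padded_bytes;
  return malloc(padded_bytes);
}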
If NeuronModel_suppressOutputConversion was not applied to the + * model to be compiled, the returned size are the padded size after + * NeuronCompilation_finish to satisfy the optimization requirement from the + * underlying hardware accelerators. If NeuronModel_suppressOutputConversion was + * applied to the model to be compiled, the returned size are the same as the + * original size given from user. + * + * Available since 4.2.0 + * + * @param compilation The compilation to be queried. + * @param index The index of the output operand we are querying. It is an index + * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the + * index associated with NeuronModel_addOperand. + * @param size the expected buffer size in bytes. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_getOutputPaddedSize( + NeuronCompilation* compilation, + int32_t index, + size_t* size); + +/** + * Get the compiled network size of the compilation. + * + * This must be called after NeuronCompilation_finished and before + * NeuronExecution_create. It is not allowed to call this with a compilation + * restored from cache. + * + * Available since 4.3.0 + * + * @param compilation The compilation to be queried. + * @param size The compiled network size in bytes. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_getCompiledNetworkSize( + NeuronCompilation* compilation, + size_t* size); + +/** + * Store the compiled network. + * + * Users have to allocate the buffer with the specified size before calling this + * function. + * + * This must be called after NeuronCompilation_finished and before + * NeuronExecution_create. It is not allowed to call this with a compilation + * restored from cache. + * + * Available since 4.3.0 + * + * @param compilation The compilation to be queried. + * @param buffer User allocated buffer to store the compiled network. + * @param size Size of the user allocated buffer in bytes. + * @return NEURON_NO_ERROR if compiled network is successfully copied to the + * user allocated buffer. + */ +int NeuronCompilation_storeCompiledNetwork( + NeuronCompilation* compilation, + void* buffer, + const size_t size); +/** + * Hint the compiler to apply the optimization strategy according to the user + * specified parameters. + * + * Available since 4.3.0 + * + * @param compilation The compilation to be modified. + * @param optimizationCode User specified optimization strategy. Must be one of + * NEURON_OPTIMIZATION_* or the inclusive OR value of multiple + * NEURON_OPTIMIZATION_*. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_setOptimizationHint( + NeuronCompilation* compilation, + uint32_t optimizationCode); + +/** + * Hint the compiler to apply the optimization strategy according to the user + * specified arguments in a null-terminated string. + * + * Available since 4.6.0 + * + * @param compilation The compilation to be modified. + * @param optimizationString A null-terminated string to represent the user + * specified optimization strategy. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_setOptimizationString( + NeuronCompilation* compilation, + const char* optimizationString); + +/** + * Only allow users' optimization string(from + * NeuronCompilation_setOptimizationString), the system won't set any compiler + * options for them. + * + * Available since 6.0.5 + * + * @param compilation The compilation to be modified. + * @param allow Allow only use user's setting or not. + * strategy. 
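A matching sketch for the store path: serialize a finished compilation into a caller-owned buffer that can later be handed to NeuronModel_restoreFromCompiledNetwork. The helper name and include name are assumptions.

#include <stdlib.h>
#include "NeuronAdapter.h"  /* assumed header name */

/* Must be called after NeuronCompilation_finish and before
 * NeuronExecution_create, and not on a compilation restored from cache. */
int store_compiled_network(NeuronCompilation* compilation,
                           void** out_buffer, size_t* out_size) {
  size_t size = 0;
  if (NeuronCompilation_getCompiledNetworkSize(compilation, &size) != NEURON_NO_ERROR) {
    return -1;
  }
  void* buffer = malloc(size);
  if (buffer == NULL) return -1;
  if (NeuronCompilation_storeCompiledNetwork(compilation, buffer, size) != NEURON_NO_ERROR) {
    free(buffer);
    return -1;
  }
  *out_buffer = buffer;
  *out_size = size;
  return 0;
}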
+ * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_setOnlyAllowOptimizationString( + NeuronCompilation* compilation, + bool allow); + +/** + * Get the compiler hints which are used to apply the optimization strategy + * according to the user specified arguments in a null-terminated string. + * + * Available since 6.0.5 + * + * @param compilation The compilation to be modified. + * @param optimizationString A null-terminated string to represent the user + * specified optimization strategy. + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_getOptimizationString( + NeuronCompilation* compilation, + const char** optimizationString); + +/** + * Hint compiler to trim the model IO alignment. + * + * Available since 4.4.8 + * + * @param compilation The compilation to be modified. + * @param enable 'true' for trimming model IO alignment. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_setTrimIOAlignment( + NeuronCompilation* compilation, + bool enable); + +/** + * Hint compiler to use software dilated convolution + * + * Available since 4.4.8 + * + * @param compilation The compilation to be modified. + * @param enable 'true' indicates a hint to compiler to use software dilated + * convolution + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_setSWDilatedConv( + NeuronCompilation* compilation, + bool enable); + +/** + * Create a new execution instance by calling the NeuronExecution_create + * function. The provided compilation must outlive the execution. + * + * Available since 4.1.0 + * + * @param compilation The NeuronCompilation to be evaluated. + * @param execution The newly created object or NULL if unsuccessful. + * + * @return NEURON_NO_ERROR if successful + */ +int NeuronExecution_create( + NeuronCompilation* compilation, + NeuronExecution** execution); + +/** + * Destroy an execution. + * + * Available since 4.1.0 + * + * @param execution The execution to be destroyed. + */ +void NeuronExecution_free(NeuronExecution* execution); + +/** + * Associate a user buffer with an input of the model of the NeuronExecution. + * The provided buffer must outlive the execution. + * + * Available since 4.1.0 + * + * @param execution The execution to be modified. + * @param index The index of the input argument we are setting. It is an index + * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the + * index associated with NeuronModel_addOperand. + * @param type The NeuronOperandType of the operand. Currently NeuronAdapter + * only takes NULL. + * @param buffer The buffer containing the data. + * @param length The length in bytes of the buffer. + * + * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not + * recognized or the buffer is too small for the input. + */ +int NeuronExecution_setInput( + NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + const void* buffer, + size_t length); + +/** + * Associate a user buffer with an output of the model of the NeuronExecution. + * The provided buffer must outlive the execution. + * + * Available since 4.1.0 + * + * @param execution The execution to be modified. + * @param index The index of the output argument we are setting. It is an index + * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the + * index associated with NeuronModel_addOperand. + * @param type The NeuronOperandType of the operand. Currently NeuronAdapter + * only takes NULL. 
+ * @param buffer The buffer where the data is to be written. + * @param length The length in bytes of the buffer. + * + * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not + * recognized or the buffer is too small for the output. + */ +int NeuronExecution_setOutput( + NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + void* buffer, + size_t length); + +/** + * Associate part of a memory object with an input of the model of the + * NeuronExecution. + * + * The provided memory must outlive the execution and should not be changed + * during computation. + * + * Available since 4.1.0 + * + * @param execution The execution to be modified. + * @param index The index of the input argument we are setting. It is an index + * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the + * index associated with NeuronModel_addOperand. + * @param type The NeuronOperandType of the operand. Currently NeuronAdapter + * only takes NULL. + * @param memory The memory containing the data. + * @param offset This specifies the location of the data within the memory. The + * offset is in bytes from the start of memory. + * @param length The size in bytes of the data value. + * + * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not + * recognized or the buffer is too small for the input. + */ +int NeuronExecution_setInputFromMemory( + NeuronExecution* execution, + uint32_t index, + const NeuronOperandType* type, + const NeuronMemory* memory, + size_t offset, + size_t length); + +/** + * Associate part of a memory object with an output of the model of the + * NeuronExecution. + * + * The provided memory must outlive the execution and should not be changed + * during computation. + * + * Available since 4.1.0 + * + * @param execution The execution to be modified. + * @param index The index of the output argument we are setting. It is an index + * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the + * index associated with NeuronModel_addOperand. + * @param type The NeuronOperandType of the operand. Currently NeuronAdapter + * only takes NULL. + * @param memory The memory containing the data. + * @param offset This specifies the location of the data within the memory. The + * offset is in bytes from the start of memory. + * @param length The size in bytes of the data value. + * + * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not + * recognized or the buffer is too small for the output. + */ +int NeuronExecution_setOutputFromMemory( + NeuronExecution* execution, + uint32_t index, + const NeuronOperandType* type, + const NeuronMemory* memory, + size_t offset, + size_t length); + +/** + * Schedule synchronous evaluation of the execution. + * Returns once the execution has completed and the outputs are ready to be + * consumed. + * + * Available since 4.1.0 + * + * @param execution The execution to be scheduled and executed. + * + * @return NEURON_NO_ERROR if the execution completed normally. NEURON_BAD_STATE + * if the inference fails. Two return codes were added since 5.0.0: + * NEURON_MISSED_DEADLINE_TRANSIENT if the inference times out, and + * NEURON_OUTPUT_INSUFFICIENT_SIZE if the given output size is not sufficient for + * the real output. + * + */ +int NeuronExecution_compute(NeuronExecution* execution); + +/** + * Schedule asynchronous evaluation of the execution with dependencies. + * + * The execution will wait for all the depending events to be signaled before + * starting the evaluation.
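A minimal synchronous inference flow using the setInput/setOutput/compute calls above; index 0 and the float buffers are illustrative, and the include and helper names are assumed.

#include <stddef.h>
#include "NeuronAdapter.h"  /* assumed header name */

/* Run one synchronous inference with user buffers. Real indices come from the
 * order used in NeuronModel_identifyInputsAndOutputs. */
int run_once(NeuronCompilation* compilation,
             const float* input, size_t input_bytes,
             float* output, size_t output_bytes) {
  NeuronExecution* execution = NULL;
  if (NeuronExecution_create(compilation, &execution) != NEURON_NO_ERROR) return -1;

  int err = NeuronExecution_setInput(execution, 0, NULL, input, input_bytes);
  if (err == NEURON_NO_ERROR) {
    err = NeuronExecution_setOutput(execution, 0, NULL, output, output_bytes);
  }
  if (err == NEURON_NO_ERROR) {
    /* Returns once the outputs are ready to be consumed. */
    err = NeuronExecution_compute(execution);
  }
  NeuronExecution_free(execution);
  return err == NEURON_NO_ERROR ? 0 : -1;
}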
Once the execution has completed and the outputs + * are ready to be consumed, the returned event will be signaled. Depending on + * which devices are handling the execution, the event could be backed by a sync + * fence. Use NeuronEvent_wait to wait for that event. + * + * NeuronEvent_wait must be called to recurperate the resources used by the + * execution. + * + * If parts of the execution are scheduled on devices that do not support fenced + * execution, the function call may wait for such parts to finish before + * returning. + * + * The function will return an error if any of the events in dependencies is + * already in a bad state. After the execution is scheduled, if any of the + * events in dependencies does not complete normally, the execution will fail, + * and NeuronEvent_wait on the returned event will return an error. + * + * The function will return an error if any of the execution outputs has a + * tensor operand type that is not fully specified. + * + * @param execution The execution to be scheduled and executed. + * @param dependencies A set of depending events. The actual evaluation will not + * start until all the events are signaled. + * @param num_dependencies The number of events in the dependencies set. + * @param duration currently not used + * @param event The event that will be signaled on completion. event is set to + * NULL if there's an error. + * + * @return NEURON_NO_ERROR if the evaluation is successfully scheduled. + * + * Available since 5.0.0 + */ +int NeuronExecution_startComputeWithDependencies( + NeuronExecution* execution, + const NeuronEvent* const* dependencies, + uint32_t num_dependencies, + uint64_t duration, + NeuronEvent** event); + +/** + * Set the maximum duration of WHILE loops in the specified execution. + * + * @param execution The execution to be modified. + * @param duration The maximum amount of time in nanoseconds. + * @return NEURON_NO_ERROR if successful. + * + * Available since 5.0.0 + */ +int NeuronExecution_setLoopTimeout( + NeuronExecution* execution, + uint64_t duration); + +/** + * Get the default timeout value for WHILE loops. + * + * @return The default timeout value in nanoseconds. + * + * Available since 5.0.0 + */ +uint64_t Neuron_getDefaultLoopTimeout(); + +/** + * Get the maximum timeout value for WHILE loops. + * + * @return The maximum timeout value in nanoseconds. + * + * Available since 5.0.0 + */ +uint64_t Neuron_getMaximumLoopTimeout(); + +/** + * Sets the execution boost hint associated with this execution. Required before + * calling NeuronExecution_compute. + * + * Execution boost is the hint for the device frequency, ranged between 0 + * (lowest) to 100 (highest). For the compilation with preference set as + * NEURON_PREFER_SUSTAINED_SPEED, scheduler guarantees that the executing boost + * value would equal to the boost value hint. + * + * On the other hand, for the compilation with preference set as + * NEURON_PREFER_LOW_POWER, scheduler would try to save power by configuring the + * executing boost value with some value that is not higher than the boost value + * hint. + * + * Available since 4.1.0 + * + * @param execution The execution to be modified. + * @param boostValue The hint for the device frequency, ranged between 0 + * (lowest) to 100 (highest). + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronExecution_setBoostHint( + NeuronExecution* execution, + uint8_t boostValue); + +/** + * Sets the execution CPU cache flush hint associated with this execution. 
+ * Required before calling NeuronExecution_setInputFromMemory and + * NeuronExecution_setOutputFromMemory. + * + * Default value of preference is NEURON_CACHE_FLUSH_ENABLE_ALL + * + * Available since 5.0.1 + * + * @param execution The execution to be modified. + * @param hint It is either NEURON_CACHE_FLUSH_ENABLE_ALL or the bitwise OR + * of one or more of the following flags: NEURON_CACHE_FLUSH_DISABLE_SYNC_INPUT, + * NEURON_CACHE_FLUSH_DISABLE_INVALIDATE_OUTPUT. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronExecution_setCacheFlushHint( + NeuronExecution* execution, + uint8_t flushHint); + +/** + * Get the dimensional information of the specified output operand of the model + * of the latest computation evaluated on {@link NeuronExecution}. + * + * This function may only be invoked when the execution is in the completed + * state. + * + * Available since 5.0.0 + * + * @param execution The execution to be queried. + * @param index The index of the output argument we are querying. It is + * an index into the lists passed to {@link + * NeuronModel_identifyInputsAndOutputs}. + * @param rank The rank of the output operand. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronExecution_getOutputOperandRank( + NeuronExecution* execution, + int32_t index, + uint32_t* rank); + +/** + * Get the dimensional information of the specified output operand of the model + * of the latest computation evaluated on {@link NeuronExecution}. The target + * output operand cannot be a scalar. + * + * This function may only be invoked when the execution is in the completed + * state. + * + * Available since 5.0.0 + * + * @param execution The execution to be queried. + * @param index The index of the output argument we are querying. It is + * an index into the lists passed to {@link + * NeuronModel_identifyInputsAndOutputs}. + * @param dimensions The dimension array to be filled. The size of the array + * must be exactly as large as the rank of the output operand to be queried in + * the model. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronExecution_getOutputOperandDimensions( + NeuronExecution* execution, + int32_t index, + uint32_t* dimensions); + +/** + * Create a NeuronCompilation which can create executions with shared static + * memory. + * + * This function only creates the object. Compilation is only performed once + * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be + * called once all desired properties have been set on the compilation. + * NeuronModel_free should be called once the compilation is no longer needed. + * The provided model must outlive the compilation. The model must already have + * been finished by a call to NeuronModel_finish. + * + * Available since 7.0.0 + * + * @param model The NeuronModel to be compiled. + * @param compilation The newly created object or NULL if unsuccessful. + * + * @return NEURON_NO_ERROR if successful + */ +int NeuronCompilation_createForBatch( + NeuronModel* model, + NeuronCompilation** compilation); + +/** + * Set the size of runner pool, and create same number of runners. + * + * The execution must created by the following steps: + * NeuronCompilation_createForBatch, NeuronCompilation_finish, + * NeuronExecution_create. + * + * The execution created from this compilation has to use + * NeuronExecution_setRunnerPoolSize to create thread pool and then set a series + * of inputs & outputs into the execution. The execution will inference with the + * series of inputs. 
+ * + * Available since 7.0.0 + * + * @param execution The NeuronExecution to be utilized. + * @param numRunners The number of runner need to be created. + * + * @return NEURON_NO_ERROR if successful + * @return NEURON_BAD_STATE if the compilation is not created via + * NeuronCompilation_createForBatch. + */ +int NeuronExecution_setRunnerPoolSize( + NeuronExecution* execution, + uint8_t numRunners); + +/** + * Notify the execution that all inputs / outputs have been set. + * Should be called after NeuronExecution_setInputFromMemory and + * NeuronExecution_setOutputFromMemory. + * + * The execution must created by the following steps: + * NeuronCompilation_createForBatch, NeuronCompilation_finish, + * NeuronExecution_create. + * + * Available since 7.0.0 + * + * @param execution The NeuronExecution to be utilized. + * + * @return NEURON_NO_ERROR if successful + * @return NEURON_BAD_STATE if the compilation is not created via + * NeuronCompilation_createForBatch. + */ +int NeuronExecution_setBatchDone(NeuronExecution* execution); + +/** + * Notify the execution that all inputs / outputs have been set. + * Should be called after NeuronExecution_setInputFromMemory and + * NeuronExecution_setOutputFromMemory. + * + * The execution must created by the following steps: + * 1. NeuronCompilation_createV2 with COMPILATION_TYPE_EXECUTION_CONTROLLER + * 2. NeuronCompilation_finish + * 3. NeuronExecution_create. + * or + * 1. NeuronModel_restoreFromCompiledNetworkV2 with + * COMPILATION_TYPE_EXECUTION_CONTROLLER + * 2. NeuronExecution_create. + * + * Available since 7.0.1 + * + * @param execution The NeuronExecution to be utilized. + * @param idx The index of runner to set the previous inputs and outputs. + * + * @return NEURON_NO_ERROR if successful + * @return NEURON_BAD_STATE if the compilation is not created via + * COMPILATION_TYPE_EXECUTION_CONTROLLER. + */ +int NeuronExecution_setIODone(NeuronExecution* execution, int idx); + +/** + * Create a NeuronCompilation which can create executions with shared static + * memory. + * + * This function only creates the object. Compilation is only performed once + * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be + * called once all desired properties have been set on the compilation. + * NeuronModel_free should be called once the compilation is no longer needed. + * The provided model must outlive the compilation. The model must already have + * been finished by a call to NeuronModel_finish. + * + * The executions created from this compilation can be executed at the same + * time. + * + * Available since 7.0.0 + * + * @param model The NeuronModel to be compiled. + * @param compilation The newly created object or NULL if unsuccessful. + * + * @return NEURON_NO_ERROR if successful + */ +int NeuronCompilation_createForMultiExecutions( + NeuronModel* model, + NeuronCompilation** compilation); + +/** + * Set report path for debug plus. + * + * Only be used in debug purpose, the execution should be created by + * NeuronCompilation_createForDebug compilation. + * + * Available since 5.0.0 + * + * @param model The model need to be debug. + * @param path The path of execution report. + * + * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the path is empty. + */ +int NeuronDebug_setReportPath(NeuronModel* model, const char* path); + +/** + * Get the number of available devices. + * + * Available since 4.1.0 + * @param numDevices The number of devices returned. + * + * @return NEURON_NO_ERROR if successful. 
+ */
+int Neuron_getDeviceCount(uint32_t* numDevices);
+
+/**
+ * Get the representation of the specified device.
+ *
+ * Available since 4.1.0
+ *
+ * @param devIndex The index of the specified device. Must be less than the
+ * number of available devices.
+ * @param device The representation of the specified device. The same
+ * representation will always be returned for the specified device.
+ *
+ * @return NEURON_NO_ERROR if successful.
+ */
+int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device);
+
+/**
+ * Get the name of the specified device.
+ *
+ * Available since 4.1.0
+ *
+ * @param device The representation of the specified device.
+ * @param name The returned name of the specified device. The name will remain
+ * valid for the duration of the application.
+ *
+ * @return NEURON_NO_ERROR if successful.
+ */
+int NeuronDevice_getName(const NeuronDevice* device, const char** name);
+
+/**
+ * Get the description of the specified device.
+ *
+ * Available since 5.0.0
+ *
+ * @param device The representation of the specified device.
+ * @param description The returned description of the specified device. The
+ * description will remain valid for the duration of the application.
+ *
+ * @return NEURON_NO_ERROR if successful.
+ */
+int NeuronDevice_getDescription(
+    const NeuronDevice* device,
+    const char** description);
+
+/*
+ * Destroys the event.
+ *
+ * See NeuronExecution for information on multithreaded usage.
+ *
+ * Available since 5.0.0
+ *
+ * @param event The event object to be destroyed. Passing NULL is acceptable and
+ * results in no operation.
+ */
+void NeuronEvent_free(NeuronEvent* event);
+
+/*
+ * Force destroys the event without calling NeuronEvent_wait().
+ * If the user wants to wait before destroying the event, they should use
+ * NeuronEvent_free.
+ *
+ * See NeuronExecution for information on multithreaded usage.
+ *
+ * Available since 6.0.0
+ *
+ * @param event The event object to be destroyed. Passing NULL is acceptable and
+ * results in no operation.
+ */
+void NeuronEvent_freeForce(NeuronEvent* event);
+
+/**
+ * Waits until the execution completes.
+ *
+ * More than one thread can wait on an event. When the execution completes,
+ * all threads will be released.
+ *
+ * See NeuronExecution for information on multithreaded usage.
+ *
+ * Available since 5.0.0
+ *
+ * @param event The event that will be signaled on completion.
+ * @return NEURON_NO_ERROR if the execution completed normally.
+ * NEURON_UNMAPPABLE if the execution input or output memory cannot
+ * be properly mapped.
+ */
+int NeuronEvent_wait(NeuronEvent* event);
+
+/**
+ * Create a NeuronEvent from a sync_fence file descriptor.
+ *
+ * The newly created NeuronEvent does not take ownership of the provided
+ * sync_fence_fd, it will instead dup the provided sync_fence_fd and own the
+ * duplicate.
+ *
+ * @param sync_fence_fd The sync_fence file descriptor.
+ * @param event The newly created object or NULL if unsuccessful.
+ *
+ * @return NEURON_NO_ERROR if successful.
+ *
+ * Available since 5.0.0
+ */
+int NeuronEvent_createFromSyncFenceFd(int sync_fence_fd, NeuronEvent** event);
+
+/**
+ * Get sync_fence file descriptor from the event.
+ *
+ * If the NeuronEvent is not backed by a sync fence, the sync_fence_fd
+ * will be set to -1, and NEURON_BAD_DATA will be returned.
+ *
+ * See NeuronEvent_createFromSyncFenceFd and
+ * NeuronExecution_startComputeWithDependencies to see how to create an event
+ * backed by a sync fence.
+ * + * The user takes ownership of the returned fd, and must close the returned file + * descriptor when it is no longer needed. + * + * @param event An event that is backed by a sync fence. + * @param sync_fence_fd The sync_fence file descriptor. The file descriptor will + * be set to -1 if there is an error. + * + * @return NEURON_NO_ERROR if successful. + * + * Available since 5.0.0 + */ +int NeuronEvent_getSyncFenceFd(const NeuronEvent* event, int* sync_fence_fd); + +/** + * Queries whether an extension is supported by the driver implementation of the + * specified device. + * + * @param extension The extension name. + * @param isExtensionSupported The boolean value indicating whether the + * extension is supported. + * + * @return NEURON_NO_ERROR if successful. + * + * Available since 5.0.0 + */ +// Note: Remove "device" +int NeuronDevice_getExtensionSupport( + const char* extensionName, + bool* isExtensionSupported); + +/** + * Creates an operand type from an extension name and an extension operand code. + * + * See {@link NeuronModel} for information on multithreaded usage. + * + * Available since 5.0.0 + * + * @param model The model to contain the operand. + * @param extensionName The extension name. + * @param operandCodeWithinExtension The extension operand code. + * @param type The operand type. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_getExtensionOperandType( + NeuronModel* model, + const char* extensionName, + uint16_t operandCodeWithinExtension, + int32_t* type); + +/** + * Creates an operation type from an extension name and an extension operation + * code. + * + * See {@link NeuronModel} for information on multithreaded usage. + * + * Available since 5.0.0 + * + * @param model The model to contain the operation. + * @param extensionName The extension name. + * @param operationCodeWithinExtension The extension operation code. + * @param type The operation type. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_getExtensionOperationType( + NeuronModel* model, + const char* extensionName, + uint16_t operationCodeWithinExtension, + int32_t* type); + +/** + * Sets extension operand parameters. + * + * Available since 5.0.0 + * + * @param model The model to be modified. + * @param index The index of the model operand we're setting. + * @param data A pointer to the extension operand data. + * The data does not have to outlive the call to this function. + * @param length The size in bytes of the data value. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronModel_setOperandExtensionData( + NeuronModel* model, + int32_t index, + const void* data, + size_t length); + +/** + * Gets the execution preference associated with this compilation. + * This function must be called after calling NeuronCompilation_finish. + * + * Available since 6.0.0 + * + * @param compilation The compilation to be queried. + * @param preference The execution preference will be one of NEURON_PREFER_*. + * Ignore preference value if this function doesn't return NEURON_NO_ERROR. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_getPreference( + NeuronCompilation* compilation, + int* preference); + +/** + * Gets the execution priority associated with this compilation. + * This function must be called after calling NeuronCompilation_finish. + * + * Available since 6.0.0 + * + * @param compilation The compilation to be queried. + * @param priority The priority will be one of NEURON_PRIORITY_*. 
Ignore + * priority value if this function doesn't return NEURON_NO_ERROR. + * + * @return NEURON_NO_ERROR if successful. + */ +int NeuronCompilation_getPriority( + NeuronCompilation* compilation, + int* priority); + +int NeuronCompilation_createWithOptions( + NeuronModel* model, + NeuronCompilation** compilation, + const char* options); +__END_DECLS diff --git a/backends/mediatek/runtime/include/api/NeuronAdapterShim.h b/backends/mediatek/runtime/include/api/NeuronAdapterShim.h new file mode 100644 index 0000000000..3b955eb497 --- /dev/null +++ b/backends/mediatek/runtime/include/api/NeuronAdapterShim.h @@ -0,0 +1,962 @@ +/* Copyright Statement: + * + * This software/firmware and related documentation ("MediaTek Software") are + * protected under relevant copyright laws. The information contained herein + * is confidential and proprietary to MediaTek Inc. and/or its licensors. + * Without the prior written permission of MediaTek inc. and/or its licensors, + * any reproduction, modification, use or disclosure of MediaTek Software, + * and information contained herein, in whole or in part, shall be strictly + * prohibited. + */ +/* MediaTek Inc. (C) 2020. All rights reserved. + * + * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES + * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") + * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON + * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. + * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE + * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR + * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH + * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY + * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY + * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK + * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO + * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN + * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND + * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER + * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT + * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER + * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. + * + * The following software/firmware and/or related documentation ("MediaTek + * Software") have been modified by MediaTek Inc. All revisions are subject to + * any receiver's applicable license agreements with MediaTek Inc. + */ + +#pragma once + +#include +#include +#include "NeuronAdapter.h" + +#define LOAD_ADAPTER_FUNCTION(name) \ + static name##_fn fn = reinterpret_cast(loadAdapterFunction(#name)); + +#define EXECUTE_ADAPTER_FUNCTION(...) \ + if (fn != nullptr) { \ + fn(__VA_ARGS__); \ + } + +#define EXECUTE_ADAPTER_FUNCTION_RETURN_INT(...) \ + return fn != nullptr ? fn(__VA_ARGS__) : -1; + +#define EXECUTE_ADAPTER_FUNCTION_RETURN_BOOL(...) \ + return fn != nullptr ? 
fn(__VA_ARGS__) : false; + +static void* sHandle = nullptr; +inline void* loadAdapterLibrary(const char* name) { + sHandle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); + if (sHandle == nullptr) { + __android_log_print( + ANDROID_LOG_ERROR, "AdapterShimApi", "Unable to open library %s", name); + } + return sHandle; +} + +inline void* getAdapterLibraryHandle() { + if (sHandle == nullptr) { + sHandle = loadAdapterLibrary("libneuronusdk_adapter.mtk.so"); + } + if (sHandle == nullptr) { + sHandle = loadAdapterLibrary("libneuron_adapter_mgvi.so"); + } + if (sHandle == nullptr) { + sHandle = loadAdapterLibrary("libneuron_adapter.so"); + } + return sHandle; +} + +inline void* loadAdapterFunction(const char* name) { + void* fn = nullptr; + if (getAdapterLibraryHandle() != nullptr) { + fn = dlsym(getAdapterLibraryHandle(), name); + } + + if (fn == nullptr) { + __android_log_print( + ANDROID_LOG_ERROR, + "AdapterShimApi", + "Unable to open function %s", + name); + } + + return fn; +} + +/*************************************************************************************************/ +typedef int (*Neuron_getVersion_fn)(NeuronRuntimeVersion* version); + +typedef int (*Neuron_getFeatureSupportedStatus_fn)( + NeuronFeatureType type, + bool* supported); + +typedef int (*Neuron_getNeuroPilotMagicNumber_fn)(int32_t* magic); + +typedef int (*Neuron_getL1MemorySizeKb_fn)(uint32_t* sizeKb); + +typedef int (*NeuronModel_create_fn)(NeuronModel** model); + +typedef void (*NeuronModel_free_fn)(NeuronModel* model); + +typedef int (*NeuronModel_finish_fn)(NeuronModel* model); + +typedef int (*NeuronModel_addOperand_fn)( + NeuronModel* model, + const NeuronOperandType* type); + +typedef int (*NeuronModel_setOperandValue_fn)( + NeuronModel* model, + int32_t index, + const void* buffer, + size_t length); + +typedef int (*NeuronModel_setOperandValueFromModel_fn)( + NeuronModel* model, + int32_t index, + const NeuronModel* value); + +typedef int (*NeuronModel_setOperandSymmPerChannelQuantParams_fn)( + NeuronModel* model, + int32_t index, + const NeuronSymmPerChannelQuantParams* channelQuant); + +typedef int (*NeuronModel_setOperandPerChannelQuantParams_fn)( + NeuronModel* model, + int32_t index, + const NeuronPerChannelQuantParams* channelQuant); + +typedef int (*NeuronModel_addOperation_fn)( + NeuronModel* model, + NeuronOperationType type, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); + +typedef int (*NeuronModel_addOperationExtension_fn)( + NeuronModel* model, + const char* name, + const char* vendor, + const NeuronDevice* device, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); + +typedef int (*NeuronModel_identifyInputsAndOutputs_fn)( + NeuronModel* model, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); + +typedef int (*NeuronModel_getSupportedOperations_fn)( + NeuronModel* model, + bool* supported, + uint32_t operationCount); + +typedef int (*NeuronModel_getSupportedOperationsForDevices_fn)( + const NeuronModel* model, + const NeuronDevice* const* devices, + uint32_t numDevices, + bool* supportedOps); + +typedef int (*NeuronModel_relaxComputationFloat32toFloat16_fn)( + NeuronModel* model, + bool allow); + +typedef int ( + *NeuronModel_suppressInputConversion_fn)(NeuronModel* model, bool suppress); + +typedef int (*NeuronModel_suppressOutputConversion_fn)( + NeuronModel* model, + bool suppress); + +typedef int (*NeuronModel_restoreFromCompiledNetwork_fn)( + 
NeuronModel** model, + NeuronCompilation** compilation, + const void* buffer, + const size_t size); + +typedef int (*NeuronCompilation_create_fn)( + NeuronModel* model, + NeuronCompilation** compilation); + +typedef int (*NeuronCompilation_createV2_fn)( + NeuronModel* model, + CompilationType type, + const char* options, + NeuronCompilation** compilation); + +typedef int (*NeuronCompilation_createForDevices_fn)( + NeuronModel* model, + const NeuronDevice* const* devices, + uint32_t numDevices, + NeuronCompilation** compilation); + +typedef int (*NeuronCompilation_createForDebug_fn)( + NeuronModel* model, + NeuronCompilation** compilation); + +typedef void (*NeuronCompilation_free_fn)(NeuronCompilation* compilation); + +typedef int (*NeuronCompilation_finish_fn)(NeuronCompilation* compilation); + +typedef int (*NeuronCompilation_getSupportedOperations_fn)( + NeuronCompilation* compilation, + uint32_t operationCount, + bool* supported); + +typedef int (*NeuronCompilation_setCaching_fn)( + NeuronCompilation* compilation, + const char* cacheDir, + const uint8_t* token); + +typedef int (*NeuronCompilation_setPreference_fn)( + NeuronCompilation* compilation, + int32_t preference); + +typedef int (*NeuronCompilation_setPriority_fn)( + NeuronCompilation* compilation, + int32_t priority); + +typedef int (*NeuronCompilation_getInputPaddedDimensions_fn)( + NeuronCompilation* compilation, + int32_t index, + uint32_t* dimensions); + +typedef int (*NeuronCompilation_getOutputPaddedDimensions_fn)( + NeuronCompilation* compilation, + int32_t index, + uint32_t* dimensions); + +typedef int (*NeuronCompilation_getInputPaddedSize_fn)( + NeuronCompilation* compilation, + int32_t index, + size_t* size); + +typedef int (*NeuronCompilation_getOutputPaddedSize_fn)( + NeuronCompilation* compilation, + int32_t index, + size_t* size); + +typedef int (*NeuronCompilation_getCompiledNetworkSize_fn)( + NeuronCompilation* compilation, + size_t* size); + +typedef int (*NeuronCompilation_storeCompiledNetwork_fn)( + NeuronCompilation* compilation, + void* buffer, + const size_t size); + +typedef int (*NeuronCompilation_setOptimizationHint_fn)( + NeuronCompilation* compilation, + uint32_t optimizationCode); + +typedef int (*NeuronCompilation_setOptimizationString_fn)( + NeuronCompilation* compilation, + const char* optimizationString); + +typedef int (*NeuronCompilation_setTrimIOAlignment_fn)( + NeuronCompilation* compilation, + bool enable); + +typedef int (*NeuronCompilation_setSWDilatedConv_fn)( + NeuronCompilation* compilation, + bool enable); + +typedef int (*NeuronExecution_create_fn)( + NeuronCompilation* compilation, + NeuronExecution** execution); + +typedef void (*NeuronExecution_free_fn)(NeuronExecution* execution); + +typedef int (*NeuronExecution_setInput_fn)( + NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + const void* buffer, + size_t length); + +typedef int (*NeuronExecution_setOutput_fn)( + NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + void* buffer, + size_t length); + +typedef int (*NeuronExecution_setInputFromMemory_fn)( + NeuronExecution* execution, + uint32_t index, + const NeuronOperandType* type, + const NeuronMemory* memory, + size_t offset, + size_t length); + +typedef int (*NeuronExecution_setOutputFromMemory_fn)( + NeuronExecution* execution, + uint32_t index, + const NeuronOperandType* type, + const NeuronMemory* memory, + size_t offset, + size_t length); + +typedef int (*NeuronMemory_createFromFd_fn)( + size_t size, + int protect, 
+ int fd, + size_t offset, + NeuronMemory** memory); + +typedef int (*NeuronMemory_createFromAHardwareBuffer_fn)( + const AHardwareBuffer* ahwb, + NeuronMemory** memory); + +typedef void (*NeuronMemory_free_fn)(NeuronMemory* memory); + +typedef int (*NeuronExecution_compute_fn)(NeuronExecution* execution); + +typedef int (*NeuronExecution_startComputeWithDependencies_fn)( + NeuronExecution* execution, + const NeuronEvent* const* dependencies, + uint32_t num_dependencies, + uint64_t duration, + NeuronEvent** event); + +typedef int ( + *NeuronEvent_getSyncFenceFd_fn)(const NeuronEvent* event, int* syncFenceFd); + +typedef int (*NeuronEvent_wait_fn)(NeuronEvent* event); + +typedef void (*NeuronEvent_free_fn)(NeuronEvent* event); + +typedef int (*NeuronExecution_setLoopTimeout_fn)( + NeuronExecution* execution, + uint64_t duration); + +typedef int (*NeuronExecution_setBoostHint_fn)( + NeuronExecution* execution, + uint8_t boostValue); + +typedef int (*NeuronCompilation_createForMultiExecutions_fn)( + NeuronModel* model, + NeuronCompilation** compilation); + +typedef int ( + *NeuronDebug_setReportPath_fn)(NeuronModel* model, const char* path); + +typedef int (*Neuron_getDeviceCount_fn)(uint32_t* numDevices); + +typedef int (*Neuron_getDevice_fn)(uint32_t devIndex, NeuronDevice** device); + +typedef int ( + *NeuronDevice_getName_fn)(const NeuronDevice* device, const char** name); + +typedef int (*NeuronDevice_getDescription_fn)( + const NeuronDevice* device, + const char** description); + +typedef int (*NeuronDevice_getExtensionSupport_fn)( + const char* extensionName, + bool* isExtensionSupported); + +typedef int (*NeuronModel_getExtensionOperandType_fn)( + NeuronModel* model, + const char* extensionName, + uint16_t operandCodeWithinExtension, + int32_t* type); + +typedef int (*NeuronModel_getExtensionOperationType_fn)( + NeuronModel* model, + const char* extensionName, + uint16_t operationCodeWithinExtension, + int32_t* type); + +typedef int (*NeuronModel_setOperandExtensionData_fn)( + NeuronModel* model, + int32_t index, + const void* data, + size_t length); + +typedef int (*NeuronCompilation_createForBatch_fn)( + NeuronModel* model, + NeuronCompilation** compilation); + +typedef int (*NeuronModel_restoreFromCompiledNetworkV2_fn)( + NeuronModel** model, + NeuronCompilation** compilation, + const void* buffer, + const size_t size, + const CompilationType& type); + +typedef int (*NeuronExecution_setRunnerPoolSize_fn)( + NeuronExecution* execution, + uint8_t numRunners); + +typedef int (*NeuronExecution_setBatchDone_fn)(NeuronExecution* execution); + +typedef int ( + *NeuronExecution_setIODone_fn)(NeuronExecution* execution, int idx); + +typedef int (*NeuronCompilation_createWithOptions_fn)( + NeuronModel* model, + NeuronCompilation** compilation, + const char* options); +/*************************************************************************************************/ + +inline int Neuron_getVersion(NeuronRuntimeVersion* version) { + LOAD_ADAPTER_FUNCTION(Neuron_getVersion); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(version); +} + +inline int Neuron_getFeatureSupportedStatus( + NeuronFeatureType type, + bool* supported) { + LOAD_ADAPTER_FUNCTION(Neuron_getFeatureSupportedStatus); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(type, supported); +} + +inline int Neuron_getNeuroPilotMagicNumber(int32_t* magic) { + LOAD_ADAPTER_FUNCTION(Neuron_getNeuroPilotMagicNumber); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(magic); +} + +inline int Neuron_getL1MemorySizeKb(uint32_t* sizeKb) { + 
LOAD_ADAPTER_FUNCTION(Neuron_getL1MemorySizeKb); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(sizeKb); +} + +inline int NeuronModel_create(NeuronModel** model) { + LOAD_ADAPTER_FUNCTION(NeuronModel_create); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model); +} + +inline void NeuronModel_free(NeuronModel* model) { + LOAD_ADAPTER_FUNCTION(NeuronModel_free); + EXECUTE_ADAPTER_FUNCTION(model); +} + +inline int NeuronModel_finish(NeuronModel* model) { + LOAD_ADAPTER_FUNCTION(NeuronModel_finish); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model); +} + +inline int NeuronModel_addOperand( + NeuronModel* model, + const NeuronOperandType* type) { + LOAD_ADAPTER_FUNCTION(NeuronModel_addOperand); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, type); +} + +inline int NeuronModel_setOperandValue( + NeuronModel* model, + int32_t index, + const void* buffer, + size_t length) { + LOAD_ADAPTER_FUNCTION(NeuronModel_setOperandValue); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, index, buffer, length); +} + +inline int NeuronModel_setOperandValueFromModel( + NeuronModel* model, + int32_t index, + const NeuronModel* value) { + LOAD_ADAPTER_FUNCTION(NeuronModel_setOperandValueFromModel); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, index, value); +} + +inline int NeuronModel_setOperandSymmPerChannelQuantParams( + NeuronModel* model, + int32_t index, + const NeuronSymmPerChannelQuantParams* channelQuant) { + LOAD_ADAPTER_FUNCTION(NeuronModel_setOperandSymmPerChannelQuantParams); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, index, channelQuant); +} + +inline int NeuronModel_setOperandPerChannelQuantParams( + NeuronModel* model, + int32_t index, + const NeuronPerChannelQuantParams* channelQuant) { + LOAD_ADAPTER_FUNCTION(NeuronModel_setOperandPerChannelQuantParams); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, index, channelQuant); +} + +inline int NeuronModel_addOperation( + NeuronModel* model, + NeuronOperationType type, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs) { + LOAD_ADAPTER_FUNCTION(NeuronModel_addOperation); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT( + model, type, inputCount, inputs, outputCount, outputs); +} + +inline int NeuronModel_addOperationExtension( + NeuronModel* model, + const char* name, + const char* vendor, + const NeuronDevice* device, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs) { + LOAD_ADAPTER_FUNCTION(NeuronModel_addOperationExtension); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT( + model, name, vendor, device, inputCount, inputs, outputCount, outputs); +} + +inline int NeuronModel_identifyInputsAndOutputs( + NeuronModel* model, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs) { + LOAD_ADAPTER_FUNCTION(NeuronModel_identifyInputsAndOutputs); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT( + model, inputCount, inputs, outputCount, outputs); +} + +inline int NeuronModel_getSupportedOperations( + NeuronModel* model, + bool* supported, + uint32_t operationCount) { + LOAD_ADAPTER_FUNCTION(NeuronModel_getSupportedOperations); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, supported, operationCount); +} + +inline int NeuronModel_getSupportedOperationsForDevices( + const NeuronModel* model, + const NeuronDevice* const* devices, + uint32_t numDevices, + bool* supportedOps) { + LOAD_ADAPTER_FUNCTION(NeuronModel_getSupportedOperationsForDevices); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, devices, numDevices, supportedOps); +} + +inline int 
NeuronCompilation_getSupportedOperations( + NeuronCompilation* compilation, + uint32_t operationCount, + bool* supported) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_getSupportedOperations); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, operationCount, supported); +} + +inline int NeuronModel_relaxComputationFloat32toFloat16( + NeuronModel* model, + bool allow) { + LOAD_ADAPTER_FUNCTION(NeuronModel_relaxComputationFloat32toFloat16); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, allow); +} + +inline int NeuronModel_suppressInputConversion( + NeuronModel* model, + bool suppress) { + LOAD_ADAPTER_FUNCTION(NeuronModel_suppressInputConversion); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, suppress); +} + +inline int NeuronModel_suppressOutputConversion( + NeuronModel* model, + bool suppress) { + LOAD_ADAPTER_FUNCTION(NeuronModel_suppressOutputConversion); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, suppress); +} + +inline int NeuronModel_restoreFromCompiledNetwork( + NeuronModel** model, + NeuronCompilation** compilation, + const void* buffer, + const size_t size) { + LOAD_ADAPTER_FUNCTION(NeuronModel_restoreFromCompiledNetwork); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, compilation, buffer, size); +} + +inline int NeuronCompilation_create( + NeuronModel* model, + NeuronCompilation** compilation) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_create); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, compilation); +} + +inline int NeuronCompilation_createV2( + NeuronModel* model, + CompilationType type, + const char* options, + NeuronCompilation** compilation) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_createV2); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, type, options, compilation); +} + +inline int NeuronCompilation_createForDevices( + NeuronModel* model, + const NeuronDevice* const* devices, + uint32_t numDevices, + NeuronCompilation** compilation) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_createForDevices); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, devices, numDevices, compilation); +} + +inline int NeuronCompilation_createForDebug( + NeuronModel* model, + NeuronCompilation** compilation) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_createForDebug); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, compilation); +} + +inline void NeuronCompilation_free(NeuronCompilation* compilation) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_free); + EXECUTE_ADAPTER_FUNCTION(compilation); +} + +inline int NeuronCompilation_finish(NeuronCompilation* compilation) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_finish); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation); +} + +inline int NeuronCompilation_setCaching( + NeuronCompilation* compilation, + const char* cacheDir, + const uint8_t* token) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_setCaching); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, cacheDir, token); +} + +inline int NeuronCompilation_setPreference( + NeuronCompilation* compilation, + int32_t preference) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_setPreference); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, preference); +} + +inline int NeuronCompilation_setPriority( + NeuronCompilation* compilation, + int32_t priority) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_setPriority); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, priority); +} + +inline int NeuronCompilation_getInputPaddedDimensions( + NeuronCompilation* compilation, + int32_t index, + uint32_t* dimensions) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_getInputPaddedDimensions); + 
EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, index, dimensions); +} + +inline int NeuronCompilation_getOutputPaddedDimensions( + NeuronCompilation* compilation, + int32_t index, + uint32_t* dimensions) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_getOutputPaddedDimensions); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, index, dimensions); +} + +inline int NeuronCompilation_getInputPaddedSize( + NeuronCompilation* compilation, + int32_t index, + size_t* size) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_getInputPaddedSize); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, index, size); +} + +inline int NeuronCompilation_getOutputPaddedSize( + NeuronCompilation* compilation, + int32_t index, + size_t* size) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_getOutputPaddedSize); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, index, size); +} + +inline int NeuronCompilation_getCompiledNetworkSize( + NeuronCompilation* compilation, + size_t* size) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_getCompiledNetworkSize); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, size); +} + +inline int NeuronCompilation_storeCompiledNetwork( + NeuronCompilation* compilation, + void* buffer, + const size_t size) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_storeCompiledNetwork); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, buffer, size); +} + +inline int NeuronCompilation_setOptimizationHint( + NeuronCompilation* compilation, + uint32_t optimizationCode) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_setOptimizationHint); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, optimizationCode); +} + +inline int NeuronCompilation_setOptimizationString( + NeuronCompilation* compilation, + const char* optimizationString) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_setOptimizationString); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, optimizationString); +} + +inline int NeuronCompilation_setTrimIOAlignment( + NeuronCompilation* compilation, + bool enable) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_setTrimIOAlignment); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, enable); +} + +inline int NeuronCompilation_setSWDilatedConv( + NeuronCompilation* compilation, + bool enable) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_setSWDilatedConv); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, enable); +} + +inline int NeuronExecution_create( + NeuronCompilation* compilation, + NeuronExecution** execution) { + LOAD_ADAPTER_FUNCTION(NeuronExecution_create); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(compilation, execution); +} + +inline void NeuronExecution_free(NeuronExecution* execution) { + LOAD_ADAPTER_FUNCTION(NeuronExecution_free); + EXECUTE_ADAPTER_FUNCTION(execution); +} + +inline int NeuronExecution_setInput( + NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + const void* buffer, + size_t length) { + LOAD_ADAPTER_FUNCTION(NeuronExecution_setInput); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(execution, index, type, buffer, length); +} + +inline int NeuronExecution_setInputFromMemory( + NeuronExecution* execution, + uint32_t index, + const NeuronOperandType* type, + const NeuronMemory* memory, + size_t offset, + size_t length) { + LOAD_ADAPTER_FUNCTION(NeuronExecution_setInputFromMemory); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT( + execution, index, type, memory, offset, length); +} + +inline int NeuronExecution_setOutput( + NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + void* buffer, + size_t length) { + 
LOAD_ADAPTER_FUNCTION(NeuronExecution_setOutput); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(execution, index, type, buffer, length); +} + +inline int NeuronExecution_setOutputFromMemory( + NeuronExecution* execution, + uint32_t index, + const NeuronOperandType* type, + const NeuronMemory* memory, + size_t offset, + size_t length) { + LOAD_ADAPTER_FUNCTION(NeuronExecution_setOutputFromMemory); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT( + execution, index, type, memory, offset, length); +} + +inline int NeuronMemory_createFromFd( + size_t size, + int protect, + int fd, + size_t offset, + NeuronMemory** memory) { + LOAD_ADAPTER_FUNCTION(NeuronMemory_createFromFd); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(size, protect, fd, offset, memory); +} + +inline int NeuronMemory_createFromAHardwareBuffer( + const AHardwareBuffer* ahwb, + NeuronMemory** memory) { + LOAD_ADAPTER_FUNCTION(NeuronMemory_createFromAHardwareBuffer); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(ahwb, memory); +} + +inline void NeuronMemory_free(NeuronMemory* memory) { + LOAD_ADAPTER_FUNCTION(NeuronMemory_free); + EXECUTE_ADAPTER_FUNCTION(memory); +} + +inline int NeuronExecution_compute(NeuronExecution* execution) { + LOAD_ADAPTER_FUNCTION(NeuronExecution_compute); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(execution); +} + +inline int NeuronExecution_startComputeWithDependencies( + NeuronExecution* execution, + const NeuronEvent* const* dependencies, + uint32_t num_dependencies, + uint64_t duration, + NeuronEvent** event) { + LOAD_ADAPTER_FUNCTION(NeuronExecution_startComputeWithDependencies); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT( + execution, dependencies, num_dependencies, duration, event); +} + +inline int NeuronEvent_getSyncFenceFd( + const NeuronEvent* event, + int* syncFenceFd) { + LOAD_ADAPTER_FUNCTION(NeuronEvent_getSyncFenceFd); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(event, syncFenceFd); +} + +inline int NeuronEvent_wait(NeuronEvent* event) { + LOAD_ADAPTER_FUNCTION(NeuronEvent_wait); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(event); +} + +inline void NeuronEvent_free(NeuronEvent* event) { + LOAD_ADAPTER_FUNCTION(NeuronEvent_free); + EXECUTE_ADAPTER_FUNCTION(event); +} + +inline int NeuronExecution_setLoopTimeout( + NeuronExecution* execution, + uint64_t duration) { + LOAD_ADAPTER_FUNCTION(NeuronExecution_setLoopTimeout); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(execution, duration); +} + +inline int NeuronExecution_setBoostHint( + NeuronExecution* execution, + uint8_t boostValue) { + LOAD_ADAPTER_FUNCTION(NeuronExecution_setBoostHint); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(execution, boostValue); +} + +inline int NeuronCompilation_createForMultiExecutions( + NeuronModel* model, + NeuronCompilation** compilation) { + LOAD_ADAPTER_FUNCTION(NeuronCompilation_createForMultiExecutions); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, compilation); +} + +inline int NeuronDebug_setReportPath(NeuronModel* model, const char* path) { + LOAD_ADAPTER_FUNCTION(NeuronDebug_setReportPath); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, path); +} + +inline int Neuron_getDeviceCount(uint32_t* numDevices) { + LOAD_ADAPTER_FUNCTION(Neuron_getDeviceCount); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(numDevices); +} + +inline int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device) { + LOAD_ADAPTER_FUNCTION(Neuron_getDevice); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(devIndex, device); +} + +inline int NeuronDevice_getName(const NeuronDevice* device, const char** name) { + LOAD_ADAPTER_FUNCTION(NeuronDevice_getName); + EXECUTE_ADAPTER_FUNCTION_RETURN_INT(device, name); +} + 
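For orientation, the sketch below shows how the wrappers in this shim are typically strung together for a single synchronous inference; it is editorial commentary and not part of the patched header. It assumes a `NeuronCompilation` that has already been created and finished elsewhere, a model with one input and one output at index 0, and caller-allocated byte buffers of the right size; the helper name `RunOnce` is hypothetical, and `NEURON_NO_ERROR` is the success code referenced throughout the NeuronAdapter documentation above.

```cpp
#include <cstdint>
#include <vector>

#include "NeuronAdapterShim.h"

// Minimal, illustrative flow: create an execution from a finished
// compilation, bind I/O buffers, set the (required) boost hint, run, clean up.
int RunOnce(NeuronCompilation* compilation,
            const std::vector<uint8_t>& input,
            std::vector<uint8_t>& output) {
  NeuronExecution* execution = nullptr;
  if (NeuronExecution_create(compilation, &execution) != NEURON_NO_ERROR) {
    return -1;  // compilation invalid or adapter library could not be loaded
  }
  // Index 0 refers to the first entry passed to
  // NeuronModel_identifyInputsAndOutputs; the operand type is NULL, as the
  // documentation above requires for the current NeuronAdapter.
  if (NeuronExecution_setInput(
          execution, 0, nullptr, input.data(), input.size()) != NEURON_NO_ERROR ||
      NeuronExecution_setOutput(
          execution, 0, nullptr, output.data(), output.size()) != NEURON_NO_ERROR) {
    NeuronExecution_free(execution);
    return -1;
  }
  // The boost hint (0 = lowest, 100 = highest device frequency) must be set
  // before NeuronExecution_compute.
  NeuronExecution_setBoostHint(execution, 100);
  const int status = NeuronExecution_compute(execution);
  NeuronExecution_free(execution);
  return status == NEURON_NO_ERROR ? 0 : -1;
}
```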
+inline int NeuronDevice_getDescription(
+    const NeuronDevice* device,
+    const char** description) {
+  LOAD_ADAPTER_FUNCTION(NeuronDevice_getDescription);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(device, description);
+}
+
+inline int NeuronDevice_getExtensionSupport(
+    const char* extensionName,
+    bool* isExtensionSupported) {
+  LOAD_ADAPTER_FUNCTION(NeuronDevice_getExtensionSupport);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(extensionName, isExtensionSupported);
+}
+
+inline int NeuronModel_getExtensionOperandType(
+    NeuronModel* model,
+    const char* extensionName,
+    uint16_t operandCodeWithinExtension,
+    int32_t* type) {
+  LOAD_ADAPTER_FUNCTION(NeuronModel_getExtensionOperandType);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(
+      model, extensionName, operandCodeWithinExtension, type);
+}
+
+inline int NeuronModel_getExtensionOperationType(
+    NeuronModel* model,
+    const char* extensionName,
+    uint16_t operationCodeWithinExtension,
+    int32_t* type) {
+  LOAD_ADAPTER_FUNCTION(NeuronModel_getExtensionOperationType);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(
+      model, extensionName, operationCodeWithinExtension, type);
+}
+
+inline int NeuronModel_setOperandExtensionData(
+    NeuronModel* model,
+    int32_t index,
+    const void* data,
+    size_t length) {
+  LOAD_ADAPTER_FUNCTION(NeuronModel_setOperandExtensionData);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, index, data, length);
+}
+
+inline int NeuronCompilation_createForBatch(
+    NeuronModel* model,
+    NeuronCompilation** compilation) {
+  LOAD_ADAPTER_FUNCTION(NeuronCompilation_createForBatch);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, compilation);
+}
+
+inline int NeuronModel_restoreFromCompiledNetworkV2(
+    NeuronModel** model,
+    NeuronCompilation** compilation,
+    const void* buffer,
+    const size_t size,
+    const CompilationType& type) {
+  LOAD_ADAPTER_FUNCTION(NeuronModel_restoreFromCompiledNetworkV2);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, compilation, buffer, size, type);
+}
+
+inline int NeuronExecution_setRunnerPoolSize(
+    NeuronExecution* execution,
+    uint8_t numRunners) {
+  LOAD_ADAPTER_FUNCTION(NeuronExecution_setRunnerPoolSize);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(execution, numRunners);
+}
+
+inline int NeuronExecution_setBatchDone(NeuronExecution* execution) {
+  LOAD_ADAPTER_FUNCTION(NeuronExecution_setBatchDone);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(execution);
+}
+
+inline int NeuronExecution_setIODone(NeuronExecution* execution, int idx) {
+  LOAD_ADAPTER_FUNCTION(NeuronExecution_setIODone);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(execution, idx);
+}
+
+inline int NeuronCompilation_createWithOptions(
+    NeuronModel* model,
+    NeuronCompilation** compilation,
+    const char* options) {
+  LOAD_ADAPTER_FUNCTION(NeuronCompilation_createWithOptions);
+  EXECUTE_ADAPTER_FUNCTION_RETURN_INT(model, compilation, options);
+}
diff --git a/backends/mediatek/scripts/README.md b/backends/mediatek/scripts/README.md
new file mode 100644
index 0000000000..76d0c5ad5f
--- /dev/null
+++ b/backends/mediatek/scripts/README.md
@@ -0,0 +1,50 @@
+# Build Instructions
+
+This document provides a step-by-step guide to set up the build environment for the MediaTek ExecuTorch libraries.
+
+## Prerequisites
+
+Before you begin, ensure you have the following prerequisites installed and configured:
+
+### 1. Buck2 Build Tool
+
+- **Download Buck2**: Obtain Buck2 from the official [releases page](https://github.com/facebook/buck2/releases/tag/2024-02-01).
+- **Add to PATH**: Extract the downloaded file and add the directory to your system's `$PATH` environment variable.
+```bash
+export PATH=<path_to_buck2>:$PATH
+```
+
+### 2. Android NDK
+
+- **Download Android NDK**: Acquire the Android NDK from the [Android developer site](https://developer.android.com/ndk/downloads).
+- **Set NDK Path**: Ensure that the `$ANDROID_NDK` environment variable is set to the path where the NDK is located.
+```bash
+export ANDROID_NDK=<path_to_android_ndk>
+```
+
+### 3. MediaTek ExecuTorch Libraries
+
+Download the following libraries from MediaTek's NeuroPilot portal (link to be added):
+
+- `libneuronusdk_adapter.mtk.so`: This universal SDK contains the implementation required for executing target-dependent code on the MediaTek chip.
+- `libneuron_buffer_allocator.so`: This utility library is designed for allocating DMA buffers necessary for model inference.
+```bash
+export NEURON_BUFFER_ALLOCATOR_LIB=<path_to_libneuron_buffer_allocator.so>
+```
+
+## Setup
+
+Follow the steps below to set up your build environment:
+
+1. **ExecuTorch Official Tutorial**: Refer to the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) guide for detailed instructions on setting up the ExecuTorch environment.
+
+2. **Build Script**: Once the prerequisites are in place, run the `mtk_build.sh` script to start the build process.
+
+   ```bash
+   ./mtk_build.sh
+   ```
+3. **Push MediaTek universal SDK to the device**: push `libneuronusdk_adapter.mtk.so` to the device and add its location to the `$LD_LIBRARY_PATH` environment variable before executing ExecuTorch with the MediaTek backend.
+
+   ```bash
+   export LD_LIBRARY_PATH=<device_path_to_usdk>:$LD_LIBRARY_PATH
+   ```
diff --git a/backends/mediatek/scripts/mtk_build.sh b/backends/mediatek/scripts/mtk_build.sh
new file mode 100755
index 0000000000..5e6724a9b5
--- /dev/null
+++ b/backends/mediatek/scripts/mtk_build.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Exit immediately if a command exits with a non-zero status.
+set -e
+
+# Define the directory where CMakeLists.txt is located
+SOURCE_DIR=$(realpath "$(dirname "$0")/../../..")
+
+# Check if buck2 exists
+BUCK_PATH=${BUCK2:-buck2}
+if [ -z "$BUCK2" ]; then
+  echo "Info: BUCK2 environment variable is not set." >&2
+fi
+
+# Check if the ANDROID_NDK environment variable is set
+if [ -z "$ANDROID_NDK" ]; then
+  echo "Error: ANDROID_NDK environment variable is not set." >&2
+  exit 1
+fi
+
+# Check if the NEURON_BUFFER_ALLOCATOR_LIB environment variable is set
+if [ -z "$NEURON_BUFFER_ALLOCATOR_LIB" ]; then
+  echo "Error: NEURON_BUFFER_ALLOCATOR_LIB environment variable is not set." >&2
+  exit 1
+fi
+
+# Create and enter the build directory
+cd "$SOURCE_DIR"
+rm -rf cmake-android-out && mkdir cmake-android-out && cd cmake-android-out
+
+# Configure the project with CMake
+# Note: Add any additional configuration options you need here
+cmake -DBUCK2="$BUCK_PATH" \
+  -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-30 \
+  -DEXECUTORCH_BUILD_NEURON=ON \
+  -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \
+  ..
+
+# Build the project
+cd ..
+cmake --build cmake-android-out -j4
+
+# Switch back to the original directory
+cd - > /dev/null
+
+# Print a success message
+echo "Build successfully completed."
diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
index cefc330d3d..a8265df8c7 100644
--- a/backends/qualcomm/CMakeLists.txt
+++ b/backends/qualcomm/CMakeLists.txt
@@ -58,6 +58,7 @@ add_compile_options("-Wall" "-Werror" "-Wno-sign-compare")
 # which can be ignored by GNU. So we make it a warning, not an error in GNU.
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") add_compile_options("-Wno-error=attributes") + add_link_options("-flto=auto") endif() if(CMAKE_BUILD_TYPE STREQUAL "Release") @@ -65,10 +66,7 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release") add_link_options("-s") # --gc-sections is added by torch. - add_compile_options( - "-O3" "-ffunction-sections" "-fdata-sections" "-frtti" - "-Wno-unused-command-line-argument" - ) + add_compile_options("-O3" "-ffunction-sections" "-fdata-sections" "-frtti") endif() include_directories( @@ -183,7 +181,10 @@ target_link_libraries( ) target_link_libraries( qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager - executorch_no_prim_ops qcir_utils + executorch_no_prim_ops qcir_utils extension_tensor +) +set_target_properties( + qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" ) target_link_libraries(utils PRIVATE qnn_executorch_logging) target_link_libraries( @@ -245,6 +246,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") qnn_executorch_header executorch qcir_utils + extension_tensor ) target_link_libraries( PyQnnWrapperAdaptor PRIVATE pybind11::module pybind11::lto wrappers @@ -259,6 +261,19 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") pybind11_strip(PyQnnWrapperAdaptor) endif() + if(CMAKE_BUILD_TYPE STREQUAL "Release") + # need to allow exceptions in pybind + set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti + -fexceptions + ) + target_compile_options( + PyQnnManagerAdaptor PUBLIC ${_pybind_compile_options} + ) + target_compile_options( + PyQnnWrapperAdaptor PUBLIC ${_pybind_compile_options} + ) + endif() + add_subdirectory( ${QNN_EXECUTORCH_ROOT_DIR}/aot/python ${CMAKE_CURRENT_BINARY_DIR}/qnn_executorch/python diff --git a/backends/qualcomm/README.md b/backends/qualcomm/README.md index 618a1f3e32..3c0fdd8f98 100644 --- a/backends/qualcomm/README.md +++ b/backends/qualcomm/README.md @@ -1,12 +1,14 @@ # Qualcomm AI Engine Direct Backend Disclaimer: At present, we do not offer any backward compatibility guarantees -for any APIs. We are currently in a pre-alpha development phase, and as such, +for any APIs. We are currently in a development phase, and as such, we reserve the right to modify interfaces and implementations. This backend is implemented on the top of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk). -Please follow [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to setup environment, build, and run executorch models by this backend (Qualcomm AI Engine Direct is also referred to as QNN in the source and documentation). +Please follow [tutorial](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md) to setup environment, build, and run executorch models by this backend (Qualcomm AI Engine Direct is also referred to as QNN in the source and documentation). + +A website version of the tutorial is [here](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html). ## Delegate Options @@ -29,7 +31,7 @@ Add SoC model into QcomChipset enum in [schema](./serialization/schema.fbs) and Insert new SoC information into _soc_info_table in [qnn_compile_spec_schema](./serialization/qnn_compile_spec_schema.py). #### Step 3: Recompile the .pte file -Follow [setup](setup.md) to setup environment and build runtime with new schema header. 
+Follow [setup](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md) to setup environment and build runtime with new schema header. ### Supported Inference Type - Quantized @@ -46,6 +48,7 @@ backends/qualcomm ├── partition # QNN Partitioner (AoT Part). ├── passes # Various passes helping lower models to QNN backend (AoT Part). ├── python # Places to put pybind artifacts for accessing QNN APIs, structures, etc (AoT Part). +├── quantizer # QNN Quantizer ├── runtime # Here is QNN runtime responsbile for compiling a model on x64. | | # Meanwhile, this is also the runtime responsbile for executing compiled | | # models on a device. @@ -58,8 +61,11 @@ backends/qualcomm ├── tests # Unit tests and model tests go here. └── utils # Miscellaneous utilities. -examples -└── qualcomm # Examples to run QNN backends. +examples/qualcomm +├── executor_runner # A general runner that is capable of running most of the basic models. +├── oss_scripts # Scripts for OSS(Open Source Software) models and customized runner for some specific models. +├── qaihub_scripts # Scripts for Qaihub models and corresponding customized runner for these models. +└── scripts # Scripts for models provided by executorch. ``` ## Examples diff --git a/backends/qualcomm/TARGETS b/backends/qualcomm/TARGETS index 5c4f482b5e..0a42614a38 100644 --- a/backends/qualcomm/TARGETS +++ b/backends/qualcomm/TARGETS @@ -1 +1,5 @@ -# This file needs to exist to avoid build system breakage, see https://fburl.com/workplace/jtdlgdmd +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/qualcomm/aot/ir/TARGETS b/backends/qualcomm/aot/ir/TARGETS new file mode 100644 index 0000000000..0a42614a38 --- /dev/null +++ b/backends/qualcomm/aot/ir/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/qualcomm/aot/ir/qcir_utils.cpp b/backends/qualcomm/aot/ir/qcir_utils.cpp index e025b8667a..75446bb733 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.cpp +++ b/backends/qualcomm/aot/ir/qcir_utils.cpp @@ -100,7 +100,7 @@ Qnn_DataType_t ToDataType(qcir::DataType type) { } flatbuffers::Offset ToQuantizeParam( - const Qnn_QuantizeParams_t& param, + const Qnn_Tensor_t& tensor, flatbuffers::FlatBufferBuilder* builder) { static const std::unordered_map def_map{ {QNN_DEFINITION_IMPL_GENERATED, qcir::QuantizeDef::IMPL_GENERATED}, @@ -124,6 +124,7 @@ flatbuffers::Offset ToQuantizeParam( int32_t axis = 0; uint32_t bitwidth = 0; + auto param = QNN_VER_PTR(tensor)->quantizeParams; auto quant_type = type_map.at(param.quantizationEncoding); std::vector data; std::vector scales; @@ -160,7 +161,9 @@ flatbuffers::Offset ToQuantizeParam( } } break; default: - QNN_EXECUTORCH_LOG_ERROR("QNN_QUANTIZATION_ENCODING_UNDEFINED detected"); + QNN_EXECUTORCH_LOG_WARN( + "QNN_QUANTIZATION_ENCODING_UNDEFINED detected: %s", + QNN_VER_PTR(tensor)->name); break; } return CreateQuantizeParamDirect( @@ -174,7 +177,7 @@ flatbuffers::Offset ToQuantizeParam( &data); } -Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) { +Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) { static const std::unordered_map def_map{ {qcir::QuantizeDef::IMPL_GENERATED, QNN_DEFINITION_IMPL_GENERATED}, {qcir::QuantizeDef::DEFINED, QNN_DEFINITION_DEFINED}, @@ -196,6 +199,7 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) { }; Qnn_QuantizeParams_t p = QNN_QUANTIZE_PARAMS_INIT; + auto param = tensor->qparam(); 
p.encodingDefinition = def_map.at(param->def()); p.quantizationEncoding = type_map.at(param->type()); switch (p.quantizationEncoding) { @@ -225,7 +229,9 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) { const_cast(param->offsets()->data()); } break; default: - QNN_EXECUTORCH_LOG_ERROR("qcir::QuantizeType::UNDEFINED detected"); + QNN_EXECUTORCH_LOG_WARN( + "qcir::QuantizeType::UNDEFINED detected: %s", + tensor->name()->c_str()); break; } return p; @@ -248,7 +254,7 @@ flatbuffers::Offset ToTensor( &shape, ToTensorType(QNN_VER_PTR(tensor)->type), ToDataType(QNN_VER_PTR(tensor)->dataType), - ToQuantizeParam(QNN_VER_PTR(tensor)->quantizeParams, builder), + ToQuantizeParam(tensor, builder), &buffer); } @@ -261,7 +267,7 @@ Qnn_Tensor_t ToTensor(const tensor_type& tensor) { QNN_VER_PTR(t)->name = tensor->name()->c_str(); QNN_VER_PTR(t)->type = ToTensorType(tensor->type()); QNN_VER_PTR(t)->dataType = ToDataType(tensor->dtype()); - QNN_VER_PTR(t)->quantizeParams = ToQuantizeParam(tensor->qparam()); + QNN_VER_PTR(t)->quantizeParams = ToQuantizeParam(tensor); QNN_VER_PTR(t)->rank = tensor->shape()->size(); QNN_VER_PTR(t)->dimensions = const_cast(tensor->shape()->data()); QNN_VER_PTR(t)->clientBuf.dataSize = tensor->data()->size(); diff --git a/backends/qualcomm/aot/ir/qcir_utils.h b/backends/qualcomm/aot/ir/qcir_utils.h index 30a5481f9f..2fb2b68d3b 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.h +++ b/backends/qualcomm/aot/ir/qcir_utils.h @@ -8,17 +8,17 @@ #pragma once +#include #include "QnnTypes.h" -#include "qcir_generated.h" namespace torch { namespace executor { namespace qnn { -typedef flatbuffers::Vector<::flatbuffers::Offset>::value_type +typedef flatbuffers::Vector<::flatbuffers::Offset>::return_type tensor_type; typedef flatbuffers::Vector< - ::flatbuffers::Offset>::value_type qparam_type; + ::flatbuffers::Offset>::return_type qparam_type; qcir::TensorType ToTensorType(Qnn_TensorType_t type); Qnn_TensorType_t ToTensorType(qcir::TensorType type); @@ -26,9 +26,9 @@ qcir::DataType ToDataType(Qnn_DataType_t type); Qnn_DataType_t ToDataType(qcir::DataType type); flatbuffers::Offset ToQuantizeParam( - const Qnn_QuantizeParams_t& param, + const Qnn_Tensor_t& tensor, flatbuffers::FlatBufferBuilder* builder); -Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& type); +Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor); flatbuffers::Offset ToTensor( const Qnn_Tensor_t& tensor, diff --git a/backends/qualcomm/aot/ir/targets.bzl b/backends/qualcomm/aot/ir/targets.bzl new file mode 100644 index 0000000000..a7cc5c03e2 --- /dev/null +++ b/backends/qualcomm/aot/ir/targets.bzl @@ -0,0 +1,66 @@ +load( + "@fbsource//tools/build_defs:default_platform_defs.bzl", + "ANDROID", +) +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/backends/qualcomm:targets.bzl", "generate_schema_header") + +QCIR_NAME = "qcir" +INPUT_QCIR = QCIR_NAME + ".fbs" +OUTPUT_QCIR_HEADER = QCIR_NAME + "_generated.h" +QCIR_GEN_RULE_NAME = "qcir_generated" + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + generate_schema_header( + QCIR_GEN_RULE_NAME, + [INPUT_QCIR], + [OUTPUT_QCIR_HEADER], + OUTPUT_QCIR_HEADER, + ) + + # Header-only library target with the generate executorch program schema header. 
+ runtime.cxx_library( + name = "qcir_schema", + srcs = [], + exported_headers = { + OUTPUT_QCIR_HEADER: ":{}[{}]".format(QCIR_GEN_RULE_NAME, OUTPUT_QCIR_HEADER), + }, + visibility = [ + # Lock this down as tightly as possible to ensure that flatbuffers + # are an implementation detail. Ideally this list would only include + # //executorch/runtime/executor/... + "//executorch/backends/qualcomm/...", + "//executorch/backends/qualcomm/aot/ir/...", + ], + exported_external_deps = ["flatbuffers-api"], + define_static_target = True, + platforms = [ANDROID], + ) + + + runtime.cxx_library( + name = "qcir_utils", + srcs = [ + "qcir_utils.cpp", + ], + exported_headers = [ + "qcir_utils.h", + ], + define_static_target = True, + platforms = [ANDROID], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "fbsource//third-party/qualcomm/qnn:api", + "//executorch/runtime/backend:interface", + "//executorch/runtime/core:core", + "//executorch/backends/qualcomm/aot/wrappers:wrappers", + ], + exported_deps = [ + ":qcir_schema", + ], + ) diff --git a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h index f13b5962b7..1f7f5ccb08 100644 --- a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h @@ -86,7 +86,7 @@ class PyQnnOpWrapper { break; default: QNN_EXECUTORCH_LOG_ERROR( - "%s has invalid data type: %d", name, data_type); + "%s has invalid data type: %d", name.c_str(), data_type); break; } } @@ -171,7 +171,7 @@ class PyQnnTensorWrapper { return {enc_data, data.axis}; } default: - QNN_EXECUTORCH_LOG_ERROR( + QNN_EXECUTORCH_LOG_WARN( "%s QNN_QUANTIZATION_ENCODING_UNDEFINED detected", GetName().c_str()); break; diff --git a/backends/qualcomm/aot/wrappers/TARGETS b/backends/qualcomm/aot/wrappers/TARGETS new file mode 100644 index 0000000000..0a42614a38 --- /dev/null +++ b/backends/qualcomm/aot/wrappers/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/qualcomm/aot/wrappers/targets.bzl b/backends/qualcomm/aot/wrappers/targets.bzl new file mode 100644 index 0000000000..08d6920a02 --- /dev/null +++ b/backends/qualcomm/aot/wrappers/targets.bzl @@ -0,0 +1,32 @@ +load( + "@fbsource//tools/build_defs:default_platform_defs.bzl", + "ANDROID", +) +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + runtime.cxx_library( + name = "wrappers", + srcs = glob([ + "*.cpp", + ]), + exported_headers = glob([ + "*.h", + ]), + define_static_target = True, + platforms = [ANDROID], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "fbsource//third-party/qualcomm/qnn:api", + "//executorch/runtime/backend:interface", + "//executorch/runtime/core:core", + ], + exported_deps = [ + "//executorch/backends/qualcomm/runtime:logging", + ], + ) diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index c4fbdeae14..79c02e2207 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -23,6 +23,8 @@ op_hardsigmoid, op_hardswish, op_hardtanh, + op_index, + op_index_put, op_layer_norm, op_linear, op_log_softmax, @@ -36,6 +38,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, @@ -75,6 +78,8 @@ op_hardswish, op_hardtanh, op_hardsigmoid, + op_index, + op_index_put, op_layer_norm, op_linear, op_log_softmax, @@ -88,6 +93,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index 641e2445f2..514bc6efd7 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -12,13 +12,20 @@ import numpy as np import torch from executorch.backends.qualcomm.utils.constants import ( + QCOM_AXIS, QCOM_AXIS_ORDER, QCOM_BITWIDTH, + QCOM_DTYPE, QCOM_ENCODING, + QCOM_OFFSET, QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_QUANT_MIN, QCOM_REQUANTIZE, + QCOM_SCALE, QCOM_SCALE_OFFSET, QCOM_SCALES, + QCOM_ZERO_POINT, QCOM_ZERO_POINTS, ) @@ -125,16 +132,16 @@ def make_qnn_per_channel_config(self, node: torch.fx.Node, quant_attrs: Dict): "convolution" in user_0.target.__name__ and list(node.users)[0].args[1] == node ): - quant_config["axis"] = 3 + quant_config[QCOM_AXIS] = 3 else: - quant_config["axis"] = quant_attrs["axis"] + quant_config[QCOM_AXIS] = quant_attrs[QCOM_AXIS] quant_config[QCOM_SCALE_OFFSET] = scale_offset # special case for 4 bits if ( - quant_config["dtype"] == torch.int8 - and quant_config["quant_max"] - quant_config["quant_min"] <= 15 + quant_config[QCOM_DTYPE] == torch.int8 + and quant_config[QCOM_QUANT_MAX] - quant_config[QCOM_QUANT_MIN] <= 15 ): quant_config[QCOM_BITWIDTH] = 4 return ( @@ -149,11 +156,11 @@ def make_qnn_per_channel_config(self, node: torch.fx.Node, quant_attrs: Dict): def make_qnn_per_tensor_config(self, quant_attrs: Dict): quant_config = copy.deepcopy(quant_attrs) # check Qnn_ScaleOffset_t in QNN/include/QnnTypes.h - quant_config["offset"] = -quant_attrs["zero_point"] + quant_config[QCOM_OFFSET] = -quant_attrs[QCOM_ZERO_POINT] # special case for 4 bits if ( - quant_config["dtype"] == torch.int8 - and quant_config["quant_max"] - quant_config["quant_min"] <= 15 + quant_config[QCOM_DTYPE] == torch.int8 + and quant_config[QCOM_QUANT_MAX] - quant_config[QCOM_QUANT_MIN] <= 15 ): quant_config[QCOM_BITWIDTH] = 4 return ( @@ -187,13 +194,13 @@ def get_quant_tensor_value( self, tensor: torch.Tensor, quant_attrs: Dict, quant_configs: Dict ) -> torch.Tensor: if quant_attrs[QCOM_ENCODING] in PER_TENSOR_ENCODING: - scale = quant_attrs["scale"] - zero_point = quant_attrs["zero_point"] + scale = quant_attrs[QCOM_SCALE] + zero_point = quant_attrs[QCOM_ZERO_POINT] else: # per channel case scale = quant_attrs[QCOM_SCALES] zero_point = quant_attrs[QCOM_ZERO_POINTS] - dtype = quant_configs["dtype"] + dtype = 
quant_configs[QCOM_DTYPE] tensor = tensor.div(scale).add(zero_point).round().to(dtype) # Make the backends access data correctly @@ -233,8 +240,8 @@ def get_data_type( quant_config: Dict, ) -> PyQnnWrapper.Qnn_TensorType_t: if quant_config: - quant_config["dtype"] = deduce_dtype(tensor, quant_config) - return QNN_QUANT_TYPE_MAP[quant_config["dtype"]] + quant_config[QCOM_DTYPE] = deduce_dtype(tensor, quant_config) + return QNN_QUANT_TYPE_MAP[quant_config[QCOM_DTYPE]] return QNN_TENSOR_TYPE_MAP[tensor.dtype] diff --git a/backends/qualcomm/builders/op_batch_norm.py b/backends/qualcomm/builders/op_batch_norm.py index 13b24c0d72..9ca299e743 100644 --- a/backends/qualcomm/builders/op_batch_norm.py +++ b/backends/qualcomm/builders/op_batch_norm.py @@ -8,6 +8,11 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch +from executorch.backends.qualcomm.utils.constants import ( + QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_SCALE, +) from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpBatchnorm, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -21,6 +26,15 @@ class BatchNorm(NodeVisitor): def __init__(self, *args) -> None: super().__init__(*args) + def update_encoding(self, node: torch.fx.Node, tensor: torch.Tensor, eps): + if isinstance(tensor, torch._subclasses.FakeTensor): + return + + if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + # scale value equals to zero will cause failure in HTP + diff = max(abs(tensor.max()), abs(tensor.min())) + eps + quant_attrs[QCOM_SCALE] = diff / quant_attrs[QCOM_QUANT_MAX] + def define_node( self, node: torch.fx.Node, @@ -29,7 +43,7 @@ def define_node( input_node = node.args[0] input_tensor = self.get_tensor(input_node, node) - mean_node, var_node, eps = node.args[3], node.args[4], 1e-5 + mean_node, var_node, eps = node.args[3], node.args[4], 1e-9 mean_tensor = get_parameter(mean_node, self.edge_program) var_tensor = get_parameter(var_node, self.edge_program) @@ -48,6 +62,7 @@ def define_node( amount = (filter_tensor * mean_tensor) / torch.sqrt(var_tensor + eps) bias_tensor = bias_tensor - amount + self.update_encoding(bias_node, bias_tensor, eps) bias_tensor_wrapper = self.define_tensor( bias_node, bias_tensor, @@ -57,6 +72,7 @@ def define_node( ) filter_tensor = filter_tensor / torch.sqrt(var_tensor + eps) + self.update_encoding(filter_node, filter_tensor, eps) filter_tensor_wrapper = self.define_tensor( filter_node, filter_tensor, diff --git a/backends/qualcomm/builders/op_index.py b/backends/qualcomm/builders/op_index.py new file mode 100644 index 0000000000..6f8dc558fe --- /dev/null +++ b/backends/qualcomm/builders/op_index.py @@ -0,0 +1,83 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
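Aside on the op_batch_norm.py change above: the builder folds the running mean and variance into the filter and bias, then refreshes the affected quantization scale so it can never collapse to zero on HTP. A minimal, self-contained sketch of that arithmetic in plain PyTorch (the function names here are illustrative, not part of the backend API):

```python
import torch

def fold_batch_norm(weight, bias, mean, var, eps=1e-9):
    # Fold BN statistics into an equivalent per-channel scale and shift,
    # mirroring the filter/bias rewrite done by the builder above.
    inv_std = torch.rsqrt(var + eps)
    return weight * inv_std, bias - weight * mean * inv_std

def refresh_scale(tensor, quant_max, eps=1e-9):
    # Re-derive a per-tensor scale from the folded values; adding eps
    # keeps the scale strictly positive even for an all-zero tensor.
    diff = max(abs(tensor.max().item()), abs(tensor.min().item())) + eps
    return diff / quant_max

w, b = torch.randn(8), torch.zeros(8)
mean, var = torch.randn(8), torch.rand(8)
folded_w, folded_b = fold_batch_norm(w, b, mean, var)
print(refresh_scale(folded_b, quant_max=127))  # always > 0
```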
+from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import numpy as np +import torch + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpGather, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class Index(NodeVisitor): + # schema = aten::index.Tensor(Tensor self, Tensor?[] indices) -> Tensor + target = ["aten.index.Tensor"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + + if len(node.args[1]) > 1: + # TODO: consider implementing this in a recursive way. + raise NotImplementedError("Tuples of index tensors are not supported.") + + indices_node = node.args[1][0] + indices_tensor = self.get_tensor(indices_node, node).to(torch.int32) + assert indices_tensor.size(0) != 0, "Empty indices lists are not supported" + + indices_tensor_wrapper = self.define_tensor( + indices_node, + indices_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + + gather_input_tensors = [input_tensor_wrapper, indices_tensor_wrapper] + + output_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=False, + ) + gather_output_tensors = [output_tensor_wrapper] + + gather_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpGather.op_name, + ) + gather_op.AddInputTensors(gather_input_tensors) + gather_op.AddOutputTensors(gather_output_tensors) + + # If tuples of index tensors become supported, refine the axis based on their length + gather_op.AddScalarParam( + OpGather.param_axis, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32, + {"data": np.int32(0)}, + ) + + return gather_op diff --git a/backends/qualcomm/builders/op_index_put.py b/backends/qualcomm/builders/op_index_put.py new file mode 100644 index 0000000000..af5311dfb2 --- /dev/null +++ b/backends/qualcomm/builders/op_index_put.py @@ -0,0 +1,83 @@ +from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import torch + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpScatterNd, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class IndexPutVisitor(NodeVisitor): + target = ["aten.index_put.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + indices_node = node.args[1] + indices_list = [ + self.get_tensor(idx, idx) for idx in indices_node if idx is not None + ] + + # Unpack the tuple + indices_unpacked = [torch.flatten(idx) for idx in indices_list] + + # Convert to 2-D tensor + indices_qnn =
torch.cat(indices_unpacked).unsqueeze(0) + index_nodes = [n for n in indices_node if isinstance(n, torch.fx.Node)] + # TODO: consider writing a pass to combine the indices into one input tensor + assert len(index_nodes) == 1, "Multiple indices tensors are not supported" + + indices_tensor_wrapper = self.define_tensor( + index_nodes[0], + indices_qnn, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + value_node = node.args[2] + + value_tensor = self.get_tensor(value_node, node) + + value_tensor_wrapper = self.define_tensor( + value_node, + value_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + output_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=False, + ) + + index_put_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpScatterNd.op_name, + ) + index_put_op.AddInputTensors( + [input_tensor_wrapper, indices_tensor_wrapper, value_tensor_wrapper] + ) + index_put_op.AddOutputTensors([output_tensor_wrapper]) + + return index_put_op diff --git a/backends/qualcomm/builders/op_prelu.py b/backends/qualcomm/builders/op_prelu.py index fc0c6b9232..5da017b8b7 100644 --- a/backends/qualcomm/builders/op_prelu.py +++ b/backends/qualcomm/builders/op_prelu.py @@ -11,6 +11,10 @@ from executorch.backends.qualcomm.utils.constants import ( QCOM_AXIS_ORDER, QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_QUANT_MIN, + QCOM_SCALE, + QCOM_ZERO_POINT, ) from executorch.exir.dialects._ops import ops as exir_ops @@ -77,10 +81,10 @@ def define_node( ) if pow_quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): quant_attrs = pow_quant_attrs.copy() - quant_range = quant_attrs["quant_max"] - quant_attrs["quant_min"] + quant_range = quant_attrs[QCOM_QUANT_MAX] - quant_attrs[QCOM_QUANT_MIN] # coeff is guaranteed to be positive - quant_attrs["zero_point"] = 0 - quant_attrs["scale"] = coeff / quant_range + quant_attrs[QCOM_ZERO_POINT] = 0 + quant_attrs[QCOM_SCALE] = coeff / quant_range scalar_node.meta[QCOM_QUANT_ATTRS] = quant_attrs scalar_tensor_wrapper = self.define_tensor( diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py new file mode 100644 index 0000000000..e99b1f47ba --- /dev/null +++ b/backends/qualcomm/builders/op_rms_norm.py @@ -0,0 +1,127 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
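For context on the op_index.py and op_index_put.py builders added above: both accept exactly one indices tensor, and op_index_put flattens it and adds a leading dimension before handing it to the op wrapper. The snippet below (plain PyTorch, no QNN involved) shows the reference semantics of aten.index_put for that single-indices case and the same packing step:

```python
import torch

# The single-indices-tensor case the builders above handle.
x = torch.zeros(4, 3)
idx = torch.tensor([0, 2])
values = torch.ones(2, 3)

# Reference semantics of aten.index_put at the torch level.
out = torch.index_put(x, (idx,), values)
assert torch.equal(out[idx], values)

# Same packing the builder applies before creating the indices tensor
# wrapper: flatten each entry, concatenate, add a leading dimension.
indices_unpacked = [torch.flatten(idx)]
indices_qnn = torch.cat(indices_unpacked).unsqueeze(0).to(torch.int32)
print(indices_qnn.shape)  # torch.Size([1, 2])
```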
+ +from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper +import numpy as np + +import torch +from executorch.backends.qualcomm.builders.utils import get_parameter +from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpRmsNorm, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class RmsNormVisitor(NodeVisitor): + target = ["aten.rms_norm.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + # args of node: ['input', 'normalized_shape', 'weight', 'eps'] + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + + # should be an immutable list + normalized_shapes = node.args[1] + if ( + len(normalized_shapes) != 1 + or normalized_shapes[0] != input_tensor.shape[-1] + ): + print("Only normalization over the last input dimension is supported") + return + axes = [node.args[0].meta["val"].dim() - 1] + axes_shape = [len(axes)] + + weight_node = node.args[2] + weight_tensor = get_parameter(weight_node, self.edge_program) + weight_tensor_wrapper = self.define_tensor( + weight_node, + weight_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + + # Fake bias node; the nn module seems to be inconsistent with the documentation + bias_tensor = torch.zeros(weight_tensor.shape) + bias_node = torch.fx.Node( + node.graph, + node.name + "_runtime_bias", + "call_function", + exir_ops.edge.aten.tensor.default, + (), # args + {}, # kwargs + ) + if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + + epsilon = node.args[3] + if isinstance(epsilon, torch.fx.Node): + epsilon = get_parameter(epsilon, self.edge_program) + epsilon = ( + epsilon + if isinstance(epsilon, float) + else torch.finfo(epsilon.dtype).eps + ) + + output_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=False, + ) + + rms_norm_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpRmsNorm.op_name, + ) + + rms_norm_op.AddInputTensors( + [input_tensor_wrapper, weight_tensor_wrapper, bias_tensor_wrapper] + ) + rms_norm_op.AddOutputTensors([output_tensor_wrapper]) + rms_norm_op.AddScalarParam( + OpRmsNorm.param_epsilon, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32, + {QCOM_DATA: np.float32(epsilon)}, + ) + rms_norm_op.AddTensorParam( + OpRmsNorm.param_axes, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(axes_shape), + axes_shape, + np.array(axes, dtype=np.uint32), + True, + ) + + return rms_norm_op diff --git a/backends/qualcomm/builders/op_softmax.py b/backends/qualcomm/builders/op_softmax.py index ae4c89bbb9..cda40aed45 100644 --- a/backends/qualcomm/builders/op_softmax.py +++
b/backends/qualcomm/builders/op_softmax.py @@ -17,7 +17,7 @@ @register_node_visitor class Softmax(NodeVisitor): - target = ["aten._softmax.default"] + target = ["aten._softmax.default", "aten._safe_softmax.default"] def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index dca47ebeec..8ac702f2ad 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -124,13 +124,6 @@ class OpExpandDims: param_axis: str = "axis" -@dataclass(init=False, frozen=True) -class OpReduceSum: - op_name: str = "ReduceSum" - param_axes: str = "axes" - param_keep_dims: str = "keep_dims" - - @dataclass(init=False, frozen=True) class OpFullyConnected: op_name: str = "FullyConnected" @@ -144,13 +137,14 @@ class OpGather: @dataclass(init=False, frozen=True) -class OpGelu: - op_name: str = "Gelu" +class OpGatherND: + op_name: str = "GatherNd" + param_batch_dims: str = "batch_dims" @dataclass(init=False, frozen=True) -class OpSqrt: - op_name: str = "ElementWiseSquareRoot" +class OpGelu: + op_name: str = "Gelu" @dataclass(init=False, frozen=True) @@ -246,6 +240,13 @@ class OpReduceMean: param_keep_dims: str = "keep_dims" +@dataclass(init=False, frozen=True) +class OpReduceSum: + op_name: str = "ReduceSum" + param_axes: str = "axes" + param_keep_dims: str = "keep_dims" + + @dataclass(init=False, frozen=True) class OpRelu: op_name: str = "Relu" @@ -277,6 +278,19 @@ class OpResizeNearestNeighbor: param_half_pixel_centers: str = "half_pixel_centers" +@dataclass(init=False, frozen=True) +class OpRmsNorm: + op_name: str = "RmsNorm" + param_epsilon: str = "epsilon" + param_axes: str = "axes" + + +@dataclass(init=False, frozen=True) +class OpScatterNd: + op_name: str = "ScatterNd" + param_reduction: str = "reduction" + + @dataclass(init=False, frozen=True) class OpSigmoid: op_name: str = "Sigmoid" @@ -307,6 +321,11 @@ class OpSplit: param_split_index: str = "split_index" +@dataclass(init=False, frozen=True) +class OpSqrt: + op_name: str = "ElementWiseSquareRoot" + + @dataclass(init=False, frozen=True) class OpSqueeze: op_name: str = "Squeeze" diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index 61935cf353..d68441c2f7 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -13,8 +13,15 @@ exir_ops.edge.aten.clone.default, exir_ops.edge.aten.full.default, exir_ops.edge.aten.slice_scatter.default, - exir_ops.edge.aten.index.Tensor, - exir_ops.edge.aten.index_put.default, + exir_ops.edge.aten.copy.default, +] + +to_be_implemented_operator = [ + exir_ops.edge.aten.any.dim, + exir_ops.edge.aten.eq.Scalar, + exir_ops.edge.aten.full_like.default, + exir_ops.edge.aten.logical_not.default, + exir_ops.edge.aten.where.self, ] allow_list_operator = [ diff --git a/backends/qualcomm/partition/qnn_partitioner.py b/backends/qualcomm/partition/qnn_partitioner.py index c3afc23dae..659bda517f 100644 --- a/backends/qualcomm/partition/qnn_partitioner.py +++ b/backends/qualcomm/partition/qnn_partitioner.py @@ -27,7 +27,11 @@ from torch.fx.passes.infra.partitioner import Partition from torch.fx.passes.operator_support import OperatorSupportBase -from .common_defs import allow_list_operator, not_supported_operator +from .common_defs import ( + allow_list_operator, + not_supported_operator, + to_be_implemented_operator, +) class QnnOperatorSupport(OperatorSupportBase): @@ -40,16 +44,7 @@ def 
__init__( ): self.node_visitors = node_visitor.get_node_visitors(edge_program) - self.skip_node_op_builder_set = set() - if skip_node_op_set is not None: - self.skip_node_op_builder_set = set( - [ - self.node_visitors[val] - for val in skip_node_op_set - if val in self.node_visitors - ] - ) - + self.skip_node_op_set = skip_node_op_set self.skip_node_id_set = skip_node_id_set self.nodes_to_wrappers = defaultdict(dict) self.qnn_manager = PyQnnManager.QnnManager( @@ -62,17 +57,18 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: if node.op != "call_function" or node.target in not_supported_operator: return False + if node.target in to_be_implemented_operator: + print( + f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped, this op can be supported, please report an issue in https://github.com/pytorch/executorch/issues" + ) + return False + if node.target in allow_list_operator: return True - if self.skip_node_id_set is not None and node.name in self.skip_node_id_set: - print(f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped") - return False - if ( - self.skip_node_op_builder_set is not None - and self.node_visitors[node.target.__name__] - in self.skip_node_op_builder_set + node.name in self.skip_node_id_set + or node.target.__name__ in self.skip_node_op_set ): print(f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped") return False @@ -114,8 +110,8 @@ def __init__( QnnBackend.__name__, self.compiler_specs_snapshot ) self.partition_tags: Dict[str, DelegationSpec] = {} - self.skip_node_id_set = skip_node_id_set - self.skip_node_op_set = skip_node_op_set + self.skip_node_id_set = set() if skip_node_id_set is None else skip_node_id_set + self.skip_node_op_set = set() if skip_node_op_set is None else skip_node_op_set def generate_partitions( self, edge_program: torch.export.ExportedProgram @@ -131,18 +127,35 @@ def generate_partitions( op_support=self.op_support_checker, ) - def tag_nodes(self, partitions: List[Partition]) -> None: + def tag_nodes( + self, partitions: List[Partition], edge_program: torch.export.ExportedProgram + ) -> None: for partition in partitions: for node in partition.nodes: delegation_tag = f"qnn_{partition.id}" node.meta["delegation_tag"] = delegation_tag self.partition_tags[delegation_tag] = self.delegation_spec + # need to take care of consumed constants + consumed_constants = ( + *edge_program.graph_signature.inputs_to_buffers, + *edge_program.graph_signature.inputs_to_parameters, + ) + for node in edge_program.graph_module.graph.nodes: + # find placeholders as lifted_constants + if node.op != "placeholder" or len(node.users) != 0: + continue + + if node.name in consumed_constants: + # does no harm to merge them into last partition, + # since they will all be removed in following stage + node.meta["delegation_tag"] = delegation_tag + # override def partition(self, edge_program: torch.export.ExportedProgram) -> PartitionResult: partitions = self.generate_partitions(edge_program) if len(partitions) != 0: - self.tag_nodes(partitions) + self.tag_nodes(partitions, edge_program) tag_constant_data(edge_program) for node in edge_program.graph_module.graph.nodes: if hasattr(node, "meta"): diff --git a/backends/qualcomm/passes/annotate_and_quant_scalar.py b/backends/qualcomm/passes/annotate_and_quant_scalar.py index 5f111ee9c8..1db50694ec 100644 --- a/backends/qualcomm/passes/annotate_and_quant_scalar.py +++ b/backends/qualcomm/passes/annotate_and_quant_scalar.py @@ -14,7 +14,7 @@ from executorch.exir.passes import 
dead_code_elimination_pass from torch.fx.passes.utils.source_matcher_utils import get_source_partitions -from .utils import get_quant_attrs +from .utils import dq_ops, get_quant_attrs class AnnotateAndQuantScalar(ExportPass): @@ -78,6 +78,7 @@ def _annotate_scalar_node( float, torch.float32, torch.int32, + torch.int64, ]: return @@ -88,30 +89,43 @@ def _traverse_binary_node(self, graph_module: torch.fx.GraphModule): graph_module.graph, self.binary_op_sources ) src_partitions = list(itertools.chain(*src_partitions.values())) + processed = set() for src_partition in src_partitions: - output = src_partition.output_nodes[0] - if ( - output.meta.get(QCOM_QUANT_ATTRS) - and len(src_partition.input_nodes) == 1 - ): - dq_node = src_partition.input_nodes[0] - q_node = dq_node.args[0] - q_node_attrs = get_quant_attrs(graph_module, q_node) - - scalar_nodes = [n for n in output.args if n != dq_node] - if len(scalar_nodes) == 0: + # need post process here to identify partitioned nodes: + src_fn_dict = {} + for n in src_partition.nodes: + # e.g. + # meta["source_fn_stack"]: [('mul', )] + # we'll use as grouping key + node_list = src_fn_dict.setdefault(n.meta["source_fn_stack"][-1][1], []) + node_list.append(n) + + for nodes in src_fn_dict.values(): + output = [n for n in nodes if n in src_partition.output_nodes][0] + # if all args have been annotated, it shouldn't be a scalar operation + if all(arg.target in dq_ops for arg in output.args): continue - scalar_node = scalar_nodes[0] - source_scalar_node = self._get_source_scalar_node(scalar_node) - # we'll abandon cast op here, since the constant scalar will - # be pre-loaded into QNN context binary - output.replace_input_with(scalar_node, source_scalar_node) + if output not in processed and QCOM_QUANT_ATTRS in output.meta: + dq_node = [n for n in output.args if n.target in dq_ops][0] + q_node = dq_node.args[0] + q_node_attrs = get_quant_attrs(graph_module, q_node) + + scalar_nodes = [n for n in output.args if n != dq_node] + if len(scalar_nodes) == 0: + continue + + scalar_node = scalar_nodes[0] + source_scalar_node = self._get_source_scalar_node(scalar_node) + # we'll abandon cast op here, since the constant scalar will + # be pre-loaded into QNN context binary + output.replace_input_with(scalar_node, source_scalar_node) - scalar_quant_attrs = self._update_scalar_node_attrs( - source_scalar_node, q_node_attrs - ) - self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs) + scalar_quant_attrs = self._update_scalar_node_attrs( + source_scalar_node, q_node_attrs + ) + self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs) + processed.add(output) def call(self, graph_module: torch.fx.GraphModule): self._traverse_binary_node(graph_module) diff --git a/backends/qualcomm/passes/i64_to_i32.py b/backends/qualcomm/passes/i64_to_i32.py index 7814a3ff0d..1d2171cc37 100644 --- a/backends/qualcomm/passes/i64_to_i32.py +++ b/backends/qualcomm/passes/i64_to_i32.py @@ -5,7 +5,9 @@ # LICENSE file in the root directory of this source tree. 
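The reworked AnnotateAndQuantScalar traversal above first groups each source partition's nodes by the last entry of their source_fn_stack metadata, and only then looks for the scalar operand of each group's output. The grouping itself is a plain dict-of-lists; a stripped-down illustration with hypothetical stand-in nodes:

```python
from collections import namedtuple

# Hypothetical stand-ins for fx nodes carrying source_fn_stack metadata.
Node = namedtuple("Node", ["name", "meta"])

nodes = [
    Node("mul_1", {"source_fn_stack": [("mul", "operator.mul")]}),
    Node("add_1", {"source_fn_stack": [("add", "operator.add")]}),
    Node("mul_2", {"source_fn_stack": [("mul", "operator.mul")]}),
]

# Group nodes that originate from the same source function.
src_fn_dict = {}
for n in nodes:
    src_fn_dict.setdefault(n.meta["source_fn_stack"][-1][1], []).append(n)

for key, group in src_fn_dict.items():
    print(key, [n.name for n in group])
# operator.mul ['mul_1', 'mul_2']
# operator.add ['add_1']
```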
import torch from executorch.backends.qualcomm.builders.utils import get_parameter, is_constant +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult +from torch._subclasses.fake_tensor import FakeTensor class I64toI32(ExportPass): @@ -16,6 +18,8 @@ class I64toI32(ExportPass): def __init__(self, edge_program: torch.export.ExportedProgram): super(I64toI32, self).__init__() self.edge_program = edge_program + # pyre-ignore[4] + self.copy_op = exir_ops.edge.aten._to_copy.default def _update_meta(self, node: torch.fx.node) -> None: meta_val = node.meta["val"] @@ -32,6 +36,10 @@ def _update_meta(self, node: torch.fx.node) -> None: if meta_val.dtype == torch.int64: node.meta["val"] = meta_val.to(torch.float) + # pyre-ignore[2] + def _is_tensor_of_dtype(self, node_val, dtype: torch.dtype) -> bool: + return isinstance(node_val, FakeTensor) and node_val.dtype == dtype + def _cast_to_int32(self, graph_module: torch.fx.GraphModule): for n in graph_module.graph.nodes: if is_constant(n, self.edge_program): @@ -39,6 +47,22 @@ def _cast_to_int32(self, graph_module: torch.fx.GraphModule): if param.dtype == torch.int64: # QNN does not support int64 self._update_meta(n) + elif n.op == "placeholder": + node_val = n.meta["val"] + if self._is_tensor_of_dtype(node_val, torch.int64): + with graph_module.graph.inserting_after(n): + args = (n,) + to_dst_node = graph_module.graph.create_node( + "call_function", + self.copy_op, + args, + {"dtype": torch.int32}, + ) + to_dst_node.meta["val"] = node_val.to(torch.int32) + + # Replace usage of the src dtype result with the dst dtype result. + n.replace_all_uses_with(to_dst_node) + to_dst_node.args = (n,) def call(self, graph_module: torch.fx.GraphModule): self._cast_to_int32(graph_module) diff --git a/backends/qualcomm/passes/recompose_pixel_shuffle.py b/backends/qualcomm/passes/recompose_pixel_shuffle.py deleted file mode 100644 index 9eec6bfa26..0000000000 --- a/backends/qualcomm/passes/recompose_pixel_shuffle.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions - - -class RecomposePixelShuffle(ExportPass): - """ - Merge decomposed operators back to one super node. 
- """ - - def __init__(self): - super().__init__() - - def call(self, graph_module: torch.fx.GraphModule): - graph = graph_module.graph - # decomposed core aten ops - partitions = get_source_partitions(graph, [torch.nn.PixelShuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - upscale_factor = h_out_shape / h_in_shape - - pixel_shuffle_node = graph.create_node( - "call_function", - exir_ops.edge.aten.pixel_shuffle.default, - (input_node, int(upscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_shuffle_node) - # copy metadata - pixel_shuffle_node.meta = output_node.meta - - graph.eliminate_dead_code() - graph_module.recompile() - return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_pixel_unshuffle.py b/backends/qualcomm/passes/recompose_pixel_unshuffle.py index a47f3d119a..00d4663908 100644 --- a/backends/qualcomm/passes/recompose_pixel_unshuffle.py +++ b/backends/qualcomm/passes/recompose_pixel_unshuffle.py @@ -6,7 +6,6 @@ import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions class RecomposePixelUnshuffle(ExportPass): @@ -85,30 +84,6 @@ def call(self, graph_module: torch.fx.GraphModule): # copy metadata pixel_unshuffle_node.meta = node.meta - # decomposed core aten ops - if not self.quantization_capture: - partitions = get_source_partitions(graph, [torch.nn.PixelUnshuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - downscale_factor = h_in_shape / h_out_shape - - op = self.op - pixel_unshuffle_node = graph.create_node( - "call_function", - op, - (input_node, int(downscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_unshuffle_node) - # copy metadata - pixel_unshuffle_node.meta = output_node.meta - graph.eliminate_dead_code() graph_module.recompile() return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_rms_norm.py b/backends/qualcomm/passes/recompose_rms_norm.py new file mode 100644 index 0000000000..b26de8bd79 --- /dev/null +++ b/backends/qualcomm/passes/recompose_rms_norm.py @@ -0,0 +1,76 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + +from .utils import dq_ops + + +class RecomposeRmsNorm(ExportPass): + """ + Merge decomposed operators back to one super node. 
+ """ + + def __init__(self): + super().__init__() + + def _get_eps_node(self, nodes): + # eps: one of the inputs of the add node + add_node = [n for n in nodes if hasattr(n, "name") and "add" in n.name][0] + for a in add_node.args: + if isinstance(a, float) or a.op != "call_function": + return a + + def _get_gamma_node(self, output_node): + # gamma: one of the inputs of the output node + for a in output_node.args: + if a.op != "call_function" or a.target in dq_ops: + return a + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + partitions = get_source_partitions(graph, [torch.nn.RMSNorm]) + for _, src_partitions in partitions.items(): + for src_partition in src_partitions: + input_len = len(src_partition.input_nodes) + if input_len == 1: + input_node = src_partition.input_nodes[0] + elif input_len == 2: + inp_0, inp_1 = src_partition.input_nodes + input_node = inp_0 if len(inp_0.users) == 2 else inp_1 + else: + raise RuntimeError( + f"Found an edge case of rms_norm partition {src_partition}, which has {input_len} inputs" + ) + + output_node = src_partition.output_nodes[0] + eps_node = self._get_eps_node(src_partition.nodes) + gamma_node = self._get_gamma_node(output_node) + + with graph.inserting_before(output_node): + # args schema + # (Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor + rms_node = graph.create_node( + "call_function", + exir_ops.edge.aten.rms_norm.default, + ( + input_node, + list(gamma_node.meta["val"].shape), + gamma_node, + eps_node, + ), + ) + users = output_node.users.copy() + for user in users: + user.replace_input_with(output_node, rms_node) + # copy metadata + rms_node.meta = output_node.meta + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/replace_index_put_input.py b/backends/qualcomm/passes/replace_index_put_input.py new file mode 100644 index 0000000000..1eb210cf67 --- /dev/null +++ b/backends/qualcomm/passes/replace_index_put_input.py @@ -0,0 +1,54 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
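As a reminder of what the RecomposeRmsNorm pass above is pattern-matching, the decomposed subgraph computes the usual RMSNorm formula. The check below (plain PyTorch, assuming a build that provides torch.nn.functional.rms_norm, which the pass's use of torch.nn.RMSNorm already implies) compares a hand-written decomposition against the fused op the pass recreates:

```python
import torch
import torch.nn.functional as F

eps = 1e-6
x = torch.randn(2, 8)
weight = torch.randn(8)

# Roughly the decomposed pattern: pow -> mean -> add(eps) -> rsqrt -> mul -> mul.
rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
decomposed = x * rms * weight

# The single fused op the pass recomposes the subgraph into.
fused = F.rms_norm(x, [8], weight=weight, eps=eps)
assert torch.allclose(decomposed, fused, atol=1e-5)
```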
+import torch +from executorch.backends.qualcomm.utils.constants import QCOM_ENCODING, QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ReplaceIndexPutInput(ExportPass): + """ + Index put input workaround for quantized module + """ + + dq_q_map = { + # per tensor + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor: exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, + # per channel + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: exir_ops.edge.quantized_decomposed.quantize_per_channel.default, + } + + def __init__(self, edge_program: torch.export.ExportedProgram): + super(ReplaceIndexPutInput, self).__init__() + self.edge_program = edge_program + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + for node in graph.nodes: + if node.target == exir_ops.edge.aten.index_put.default: + if ( + copy_node := list(node.users)[0] + ) and copy_node.target == exir_ops.edge.aten.copy.default: + m_buffer_node = copy_node.args[0] + bad_frozen_node = node.args[0] + if QCOM_QUANT_ATTRS in bad_frozen_node.meta: + m_buffer_node.meta[QCOM_QUANT_ATTRS] = bad_frozen_node.meta[ + QCOM_QUANT_ATTRS + ] + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] = ( + self.dq_q_map[ + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] + ] + ) + with graph.inserting_after(bad_frozen_node): + node.replace_input_with(bad_frozen_node, m_buffer_node) + else: + continue + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/replace_inf_buffer.py b/backends/qualcomm/passes/replace_inf_buffer.py index bafa3fdb18..776bc9beeb 100644 --- a/backends/qualcomm/passes/replace_inf_buffer.py +++ b/backends/qualcomm/passes/replace_inf_buffer.py @@ -8,14 +8,18 @@ class ReplaceInfBuffer(ExportPass): + """ + Due to limitation in Qnn, we need to change inf or -inf to arbitrary value in quantization. + """ + def __init__(self): super(ReplaceInfBuffer, self).__init__() def call(self, graph_module: torch.fx.GraphModule): for buf_name, tensor in graph_module.named_buffers(): if tensor.is_floating_point(): - tensor[tensor == float("inf")] = torch.finfo(torch.float32).max - tensor[tensor == float("-inf")] = torch.finfo(torch.float32).min + tensor[tensor == float("inf")] = 255 + tensor[tensor == float("-inf")] = -255 setattr(graph_module, buf_name, tensor) graph_module.recompile() diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py new file mode 100644 index 0000000000..9cde50b9c7 --- /dev/null +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -0,0 +1,120 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
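On the ReplaceInfBuffer change above: the sentinel value chosen for inf matters because the buffer participates in range observation during quantization, and a float32-extreme sentinel stretches the observed min/max until every real value lands in the same quantization bin. A small numeric illustration with a hand-rolled min/max scale standing in for an observer:

```python
import torch

def per_tensor_scale(t, qmin=-128, qmax=127):
    # Plain min/max affine scale, standing in for an observer.
    return (t.max() - t.min()).item() / (qmax - qmin)

mask = torch.tensor([0.0, float("-inf"), 1.5, -2.0])

huge = mask.clone()
huge[huge == float("-inf")] = torch.finfo(torch.float32).min
print(per_tensor_scale(huge))    # ~1.3e36: real values collapse into one bin

capped = mask.clone()
capped[capped == float("-inf")] = -255.0
print(per_tensor_scale(capped))  # ~1.0: real values stay distinguishable
```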
+from typing import Sequence + +import torch +from executorch.backends.qualcomm.quantizer.quantizer import ( + get_16a8w_qnn_ptq_config, + get_default_8bit_qnn_ptq_config, + QuantizationConfig, +) +from executorch.backends.qualcomm.quantizer.utils import QUANT_ANNOTATION_KEY +from torch.ao.quantization.quantizer import ( + QuantizationAnnotation, + SharedQuantizationSpec, +) +from torch.fx import Node + + +def custom_annotate_llama_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901 + """ + This function is specific for llama matmul op 16a8w. + """ + + def annotate_matmul(node: Node, quantization_config: QuantizationConfig): + input_qspec_map = {} + input_act = node.args[0] + input_spec = quantization_config.input_activation + input_qspec_map[input_act] = input_spec + input_act1 = node.args[1] + input_spec1 = quantization_config.weight + input_qspec_map[input_act1] = input_spec1 + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quantization_config.output_activation, + _annotated=True, + ) + + def annotate_index_put(node: Node, quantization_config: QuantizationConfig) -> None: + input = node.args[0] + value = node.args[2] + input_qspec_map = {} + input_qspec_map[input] = quantization_config.input_activation + input_qspec_map[value] = SharedQuantizationSpec((input, node)) + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=SharedQuantizationSpec((input, node)), + _annotated=True, + ) + + def annotate_single_in_single_out( + node: Node, quantization_config: QuantizationConfig + ) -> None: + input_qspec_map = {} + input_act = node.args[0] + input_qspec_map[input_act] = quantization_config.input_activation + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quantization_config.output_activation, + _annotated=True, + ) + + def annotate_cat(node: Node, quantization_config: QuantizationConfig): + input_nodes = node.args[0] + assert isinstance(input_nodes, Sequence) + first_input_node = input_nodes[0] + input_qspec_map = {} + assert isinstance(first_input_node, Node) + assert isinstance(node, Node) + input_qspec_map[first_input_node] = quantization_config.input_activation + share_qparams_with_input_act0_qspec = SharedQuantizationSpec( + (first_input_node, node) + ) + for input_node in input_nodes[1:]: + if input_node not in input_qspec_map: + assert isinstance(input_node, Node) + input_qspec_map[input_node] = share_qparams_with_input_act0_qspec + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=share_qparams_with_input_act0_qspec, + _annotated=True, + ) + + def is_edge_condition(node: Node): + if not isinstance(node, Node) or node.op != "call_function": + return True + return False + + def annotate_matmul_input1(node: Node, quantization_config: QuantizationConfig): + if is_edge_condition(node): + return + if node.target in [ + torch.ops.aten.index_put.default, + torch.ops.aten.index_put_.default, + ]: + annotate_index_put(node, quantization_config) + annotate_matmul_input1(node.args[0], quantization_config) + elif node.target == torch.ops.aten.cat.default: + annotate_cat(node, quantization_config) + # Expect that the inputs of the cat op are select ops + for arg in node.args[0]: + annotate_matmul_input1(arg, quantization_config) + else: + annotate_single_in_single_out(node, quantization_config) + annotate_matmul_input1(node.args[0], quantization_config) + + # 
Annotate 16a8w for matmul op to get better performance + quantization_config_16a8w = get_16a8w_qnn_ptq_config() + # Annotate 8a8w for second input of matmul until past_kv_cache + quantization_config_8a8w = get_default_8bit_qnn_ptq_config(act_symmetric=True) + for node in gm.graph.nodes: + if node.op == "call_function" and node.target == torch.ops.aten.matmul.default: + if "nn_module_stack" in node.meta: + module_values_list = list(node.meta["nn_module_stack"].values()) + full_qualified_name = module_values_list[-1][0] + if "SDPA" in full_qualified_name: + annotate_matmul(node, quantization_config_16a8w) + annotate_matmul_input1(node.args[1], quantization_config_8a8w) diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index d51e016473..e27edf939c 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -116,7 +116,7 @@ def _update_per_channel_weight_quant_ops(self, ops: Set[OpOverload], enable: boo if enable: self.use_per_channel_weight_quant_ops.update(ops) else: - self.use_per_channel_weight_quant_ops.difference(ops) + self.use_per_channel_weight_quant_ops.difference_update(ops) def add_16bit_quant_ops(self, ops: Set[OpOverload]) -> None: for op in ops: diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index f2265daf32..d3ae1194ac 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import numbers +import operator from dataclasses import dataclass from functools import partial from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple @@ -77,7 +78,7 @@ def _derive_bias_qparams_fn( def get_default_8bit_qnn_ptq_config( - act_symmetric: bool = False, act_observer=MinMaxObserver + act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver ) -> QuantizationConfig: extra_args: Dict[str, Any] = {"eps": 2**-12} @@ -96,7 +97,7 @@ def get_default_8bit_qnn_ptq_config( quant_max=torch.iinfo(torch.int8).max, qscheme=torch.per_tensor_symmetric, ch_axis=0, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) bias_quantization_spec = QuantizationSpec( @@ -104,7 +105,7 @@ def get_default_8bit_qnn_ptq_config( quant_min=torch.iinfo(torch.int32).min, quant_max=torch.iinfo(torch.int32).max, qscheme=torch.per_tensor_symmetric, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) quantization_config = QuantizationConfig( @@ -619,7 +620,13 @@ def annotate_upsample_nearest2d( annotate_single_in_single_out(node, quantization_config) -@register_annotator([torch.ops.aten.softmax.int, torch.ops.aten._softmax.default]) +@register_annotator( + [ + torch.ops.aten.softmax.int, + torch.ops.aten._softmax.default, + torch.ops.aten._safe_softmax.default, + ] +) def annotate_softmax(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -684,6 +691,31 @@ def annotate_squeeze(node: Node, quantization_config: QuantizationConfig) -> Non annotate_single_in_single_out(node, quantization_config) +@register_annotator([torch.ops.aten.rms_norm.default]) +def annotate_rms_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act_node = node.args[0] + 
weight_node = node.args[2] + + if _is_annotated([node]): + return + + # TODO current only support 16a16w + _annotate_input_qspec_map( + node, + act_node, + quantization_config.input_activation, + ) + + _annotate_input_qspec_map( + node, + weight_node, + quantization_config.input_activation, + ) + nodes_to_mark_annotated = [node] + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + + @register_annotator([torch.ops.aten.rsqrt.default]) def annotate_rsqrt(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -784,6 +816,38 @@ def annotate_embedding(node: Node, quantization_config: QuantizationConfig) -> N ) +@register_annotator([torch.ops.aten.index.Tensor]) +def annotate_index(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_in_out_obs_sharing_op(node, quantization_config) + if not _is_annotated([node]): + input_qspec_map = {} + input = node.args[0] + input_qspec_map[input] = quantization_config.input_activation + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=SharedQuantizationSpec((input, node)), + _annotated=True, + ) + + +@register_annotator( + [torch.ops.aten.index_put.default, torch.ops.aten.index_put_.default] +) +def annotate_index_put(node: Node, quantization_config: QuantizationConfig) -> None: + input = node.args[0] + value = node.args[2] + + input_qspec_map = {} + input_qspec_map[input] = quantization_config.input_activation + input_qspec_map[value] = SharedQuantizationSpec((input, node)) + + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=SharedQuantizationSpec((input, node)), + _annotated=True, + ) + + @register_annotator([torch.ops.aten.expand.default]) def annotate_expand(node: Node, quantization_config: QuantizationConfig) -> None: annotate_in_out_obs_sharing_op(node, quantization_config) @@ -943,6 +1007,38 @@ def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None node.meta["source_fn_stack"] = [(node, torch.nn.Linear)] +@register_annotator([torch.ops.aten._native_batch_norm_legit_no_training.default]) +def annotate_batch_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act, weight, bias = node.args[0:3] + if _is_annotated([node]): + return + + _annotate_input_qspec_map( + node, + act, + quantization_config.input_activation, + ) + # QNN requires uint8 instead of int8 in 'weight' config + _annotate_input_qspec_map( + node, + weight, + quantization_config.input_activation, + ) + _annotate_input_qspec_map( + node, + bias, + quantization_config.bias, + ) + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node, *node.args[0:3]]) + + +@register_annotator([operator.getitem]) +def annotate_getitem(node: Node, quantization_config: QuantizationConfig) -> None: + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node]) + + @register_annotator([torch.ops.aten.layer_norm.default]) def annotate_layer_norm(node: Node, quantization_config: QuantizationConfig) -> None: act_node = node.args[0] diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index 45525726ca..dabd4cdde5 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -44,7 +44,7 @@ struct CustomMemTensorInfo { size_t 
tensor_bytes; uint32_t* shape; uint32_t rank; - torch::executor::ScalarType dtype; + exec_aten::ScalarType dtype; }; /// Allocate specific tensors (usually graph inputs and outputs) on shared diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index f08f688cf9..f5c9473411 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -23,12 +23,12 @@ Result QnnExecuTorchBackend::init( ArrayRef compile_specs) const { // covert SizedBuffer to qnn ExecuTorch option QnnExecuTorchContextBinary qnn_context_blob; - const qnn_delegate::QnnExecuTorchOptions* qnn_executorch_options; + const qnn_delegate::QnnExecuTorchOptions* qnn_executorch_options = nullptr; qnn_context_blob.buffer = const_cast(processed->data()); qnn_context_blob.nbytes = processed->size(); - // covert CompileSpec to qnn ExecuTorch option + // convert CompileSpec to qnn ExecuTorch option for (auto& compile_spec : compile_specs) { if (std::strcmp(compile_spec.key, QNN_COMPILE_SPEC) == 0) qnn_executorch_options = @@ -213,8 +213,10 @@ Error QnnExecuTorchBackend::execute( } ET_CHECK_OR_RETURN_ERROR( - qnn_manager->Execute(input_tensor_structs, output_tensor_structs) == - Error::Ok, + qnn_manager->Execute( + input_tensor_structs, + output_tensor_structs, + context.event_tracer()) == Error::Ok, Internal, "Fail to execute graph"); ET_CHECK_OR_RETURN_ERROR( diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.h b/backends/qualcomm/runtime/QnnExecuTorchBackend.h index 2963ec0f62..fbcc705889 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.h +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.h @@ -14,7 +14,8 @@ namespace torch { namespace executor { -class QnnExecuTorchBackend final : public PyTorchBackendInterface { +class QnnExecuTorchBackend final + : public ::executorch::runtime::BackendInterface { public: ~QnnExecuTorchBackend(){}; @@ -24,7 +25,7 @@ class QnnExecuTorchBackend final : public PyTorchBackendInterface { ArrayRef compile_specs) const override; Error execute( - __ET_UNUSED BackendExecutionContext& context, + ET_UNUSED BackendExecutionContext& context, DelegateHandle* handle, EValue** args) const override; diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 3dc135deb6..f4275f0ab3 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -56,9 +58,7 @@ QnnManager::QnnManager( "backend_type: %s", EnumNameQnnExecuTorchBackendType(backend_type)); QNN_EXECUTORCH_LOG_INFO("graph_name: %s", options_->graph_name()->c_str()); QNN_EXECUTORCH_LOG_INFO("library_path: %s", library_path.c_str()); - QNN_EXECUTORCH_LOG_INFO( - "tensor_dump_output_path: %s", - options_->tensor_dump_output_path()->c_str()); + QNN_EXECUTORCH_LOG_INFO("dump intermediate outputs: %s", IsTensorDump()); QNN_EXECUTORCH_LOG_INFO( "log_level: %s", EnumNameQnnExecuTorchLogLevel(options_->log_level())); QNN_EXECUTORCH_LOG_INFO( @@ -281,6 +281,8 @@ Error QnnManager::Init() { options_->backend_options()->backend_type()); backend_params_ptr_ = QnnBackendFactory().Create( qnn_loaded_backend_, logger_.get(), qnn_context_blob_, options_); + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_ != nullptr, Internal, "Failed to load Qnn backend.") ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_backend_ptr_->Configure() == Error::Ok, Internal, 
@@ -332,7 +334,8 @@ Error QnnManager::AllocateTensor() { const std::string& tensor_name = tensor_wrapper->GetName(); // this is required by identifying shared buffer mechanism // info might be missed if context binary came from qnn_converter - if (tensor_name.find("output_") == std::string::npos) { + if (options_->is_from_context_binary() && + tensor_name.find("output_") == std::string::npos) { tensor_wrapper->SetName("output_" + tensor_name); } if (IsTensorDump()) { @@ -362,7 +365,8 @@ Error QnnManager::AllocateTensor( Error QnnManager::Execute( const std::vector& input_tensor_structs, - std::vector& output_tensor_structs) { + std::vector& output_tensor_structs, + EventTracer* event_tracer) { Qnn_ErrorHandle_t error = QNN_SUCCESS; error = backend_params_ptr_->qnn_graph_ptr_->GraphExecute( @@ -373,30 +377,27 @@ Error QnnManager::Execute( "qnn_graph_execute failed. Error %d", QNN_GET_ERROR_CODE(error)); return Error::Internal; } - if (IsTensorDump()) { // TODO: Need to handle the graph which is partitioned. // Maybe we could use graph name. - std::string dir = options_->tensor_dump_output_path()->str() + "/Result/"; - CreateDirectory(dir); - QNN_EXECUTORCH_LOG_INFO("Dump tensor to the path: %s", dir.c_str()); for (std::size_t out_idx = 0; out_idx < output_tensor_structs.size(); ++out_idx) { const Qnn_Tensor_t& output_tensor = output_tensor_structs[out_idx]; - - std::string output_path = - dir + QNN_VER_PTR(output_tensor)->name + "_tensor.raw"; - - std::ofstream fout(output_path, std::ios::binary); - if (fout.fail()) { - QNN_EXECUTORCH_LOG_ERROR( - "Dump tensor name: %s Failed.", QNN_VER_PTR(output_tensor)->name); - return Error::Internal; - } - - fout.write( - static_cast(QNN_VER_PTR(output_tensor)->clientBuf.data), - QNN_VER_PTR(output_tensor)->clientBuf.dataSize); + std::vector sizes( + QNN_VER_PTR(output_tensor)->dimensions, + QNN_VER_PTR(output_tensor)->dimensions + + QNN_VER_PTR(output_tensor)->rank); + + auto dump_tensor = executorch::extension::from_blob( + QNN_VER_PTR(output_tensor)->clientBuf.data, + sizes, + qnn_dtype_to_scalar_type_[QNN_VER_PTR(output_tensor)->dataType]); + + torch::executor::event_tracer_log_output_delegate( + event_tracer, + QNN_VER_PTR(output_tensor)->name, + /*delegate_debug_id=*/static_cast(-1), + *dump_tensor); } } diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 5190f6768b..3d1cc3863a 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -37,7 +37,8 @@ class QnnManager { Error Execute( const std::vector& input_tensor_structs, - std::vector& output_tensor_structs); + std::vector& output_tensor_structs, + EventTracer* event_tracer); Error ProfileExecuteData(EventTracer* event_tracer); @@ -52,7 +53,7 @@ class QnnManager { } bool IsTensorDump() { - return options_->tensor_dump_output_path()->size() > 0; + return options_->dump_intermediate_outputs(); } bool IsNodeSupportedByBackend( diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp index 430c8f757a..2b2a729835 100644 --- a/backends/qualcomm/runtime/SharedBuffer.cpp +++ b/backends/qualcomm/runtime/SharedBuffer.cpp @@ -25,7 +25,7 @@ std::size_t std::hash::operator()( hash_val ^= info.shape[i]; } hash_val ^= std::hash()(info.rank); - hash_val ^= std::hash()(info.dtype); + hash_val ^= std::hash()(info.dtype); return hash_val; } @@ -87,7 +87,12 @@ SharedBuffer& SharedBuffer::GetSharedBufferManager() { std::lock_guard lk(init_mutex_); static SharedBuffer 
shared_buffer_manager; if (!shared_buffer_manager.GetInitialize()) { +#if defined(__aarch64__) Error status = shared_buffer_manager.Load(); +#else + // For x86_64 platform + Error status = Error::Ok; +#endif if (status == Error::Ok) { shared_buffer_manager.SetInitialize(true); } @@ -96,9 +101,11 @@ SharedBuffer& SharedBuffer::GetSharedBufferManager() { } SharedBuffer::~SharedBuffer() { +#if defined(__aarch64__) if (initialize_) { SharedBuffer::GetSharedBufferManager().UnLoad(); } +#endif }; void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) { diff --git a/backends/qualcomm/runtime/TARGETS b/backends/qualcomm/runtime/TARGETS new file mode 100644 index 0000000000..0a42614a38 --- /dev/null +++ b/backends/qualcomm/runtime/TARGETS @@ -0,0 +1,5 @@ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp b/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp index 3e286c07b0..c67f9b52f5 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp @@ -53,6 +53,85 @@ Error QnnBackend::Configure() { } return Error::Ok; } + +Error QnnBackend::VerifyQNNSDKVersion( + const QnnExecuTorchBackendType backend_id) { + const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); + + Qnn_ApiVersion_t qnn_version = {QNN_VERSION_INIT}; + Qnn_ErrorHandle_t error = + qnn_interface.qnn_backend_get_api_version(&qnn_version); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR("Failed to get Qnn API version."); + return Error::Internal; + } + + Qnn_ApiVersion_t expected_version = {QNN_VERSION_INIT}; + expected_version.coreApiVersion.major = QNN_API_VERSION_MAJOR; + expected_version.coreApiVersion.minor = QNN_API_VERSION_MINOR; + expected_version.coreApiVersion.patch = QNN_API_VERSION_PATCH; + expected_version.backendApiVersion = GetExpectedBackendVersion(); + const char* backend_type = EnumNameQnnExecuTorchBackendType(backend_id); + + Error status = VersionChecker( + qnn_version.coreApiVersion, expected_version.coreApiVersion, "Qnn API"); + if (status == Error::Ok) { + status = VersionChecker( + qnn_version.backendApiVersion, + expected_version.backendApiVersion, + backend_type); + } + + return status; +} + +Error QnnBackend::VersionChecker( + const Qnn_Version_t& qnn_version, + const Qnn_Version_t& expected, + const std::string& prefix) { + if (qnn_version.major != expected.major) { + QNN_EXECUTORCH_LOG_ERROR( + "%s version %u.%u.%u is not supported. " + "The minimum supported version is %u.%u.%u. Please make " + "sure you have the correct backend library version.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + return Error::Internal; + } + if (qnn_version.major == QNN_API_VERSION_MAJOR && + qnn_version.minor < expected.minor) { + QNN_EXECUTORCH_LOG_WARN( + "%s version %u.%u.%u is mismatched. " + "The minimum supported version is %u.%u.%u. Please make " + "sure you have the correct backend library version.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + } + if ((qnn_version.major == QNN_API_VERSION_MAJOR && + qnn_version.minor > expected.minor)) { + QNN_EXECUTORCH_LOG_WARN( + "%s version %u.%u.%u is used. 
" + "The version is tested against %u.%u.%u.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + } + return Error::Ok; +} } // namespace qnn } // namespace executor } // namespace torch diff --git a/backends/qualcomm/runtime/backends/QnnBackendCommon.h b/backends/qualcomm/runtime/backends/QnnBackendCommon.h index e6ea0adff8..de007898e5 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCommon.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCommon.h @@ -13,8 +13,10 @@ #include +#include "HTP/QnnHtpCommon.h" #include "QnnBackend.h" #include "QnnCommon.h" +#include "QnnTypes.h" namespace torch { namespace executor { namespace qnn { @@ -43,7 +45,10 @@ class QnnBackend { return handle_; } + Error VerifyQNNSDKVersion(const QnnExecuTorchBackendType backend_id); + protected: + virtual Qnn_Version_t GetExpectedBackendVersion() const = 0; virtual Error MakeConfig(std::vector& config) { return Error::Ok; }; @@ -52,6 +57,10 @@ class QnnBackend { Qnn_BackendHandle_t handle_; const QnnImplementation& implementation_; QnnLogger* logger_; + Error VersionChecker( + const Qnn_Version_t& qnn_version, + const Qnn_Version_t& expected, + const std::string& prefix); }; } // namespace qnn } // namespace executor diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index acb9552468..9fb292613a 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -16,6 +16,7 @@ std::unique_ptr QnnBackendFactory::Create( const QnnExecuTorchContextBinary& qnn_context_blob, const QnnExecuTorchOptions* options) { auto backend_params = std::make_unique(); + switch (options->backend_options()->backend_type()) { case QnnExecuTorchBackendType::kHtpBackend: { auto htp_options = options->backend_options()->htp_options(); @@ -51,6 +52,7 @@ std::unique_ptr QnnBackendFactory::Create( } backend_params->qnn_backend_ptr_ = std::make_unique(implementation, logger); + backend_params->qnn_device_ptr_ = std::make_unique( implementation, logger, options->soc_info(), htp_options); @@ -72,7 +74,6 @@ std::unique_ptr QnnBackendFactory::Create( backend_params->qnn_mem_manager_ptr_ = std::make_unique( implementation, backend_params->qnn_context_ptr_.get()); backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED; - return backend_params; } break; case QnnExecuTorchBackendType::kGpuBackend: case QnnExecuTorchBackendType::kDspBackend: @@ -81,7 +82,11 @@ std::unique_ptr QnnBackendFactory::Create( return nullptr; } - // should not reach here + if (backend_params->qnn_backend_ptr_->VerifyQNNSDKVersion( + options->backend_options()->backend_type()) == Error::Ok) { + return backend_params; + } + return nullptr; } } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/QnnProfiler.cpp b/backends/qualcomm/runtime/backends/QnnProfiler.cpp index fa5829d23b..ae336a800b 100644 --- a/backends/qualcomm/runtime/backends/QnnProfiler.cpp +++ b/backends/qualcomm/runtime/backends/QnnProfiler.cpp @@ -7,7 +7,6 @@ */ #include -#include namespace torch { namespace executor { diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h index d4b14178a4..d00bd50cdc 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h @@ -8,7 +8,9 @@ #pragma once 
#include +#include "HTP/QnnHtpCommon.h" #include "HTP/QnnHtpProfile.h" +#include "QnnTypes.h" namespace torch { namespace executor { namespace qnn { @@ -24,6 +26,14 @@ class HtpBackend : public QnnBackend { event_type == QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE); } + Qnn_Version_t GetExpectedBackendVersion() const override { + Qnn_Version_t backend_version; + backend_version.major = QNN_HTP_API_VERSION_MAJOR; + backend_version.minor = QNN_HTP_API_VERSION_MINOR; + backend_version.patch = QNN_HTP_API_VERSION_PATCH; + return backend_version; + } + protected: Error MakeConfig(std::vector& config) override { return Error::Ok; diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h b/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h index 00568bdc32..35fc2d373d 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h @@ -39,7 +39,7 @@ class HtpContextCustomConfig { return htp_context_config_.back().get(); } - const QnnContext* context_; + [[maybe_unused]] const QnnContext* context_; std::vector> htp_context_config_; [[maybe_unused]] const QnnExecuTorchHtpBackendOptions* htp_options_; diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl new file mode 100644 index 0000000000..61650fab26 --- /dev/null +++ b/backends/qualcomm/runtime/targets.bzl @@ -0,0 +1,68 @@ +load( + "@fbsource//tools/build_defs:default_platform_defs.bzl", + "ANDROID", +) +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + runtime.cxx_library( + name = "logging", + srcs = [ + "Logging.cpp", + ], + exported_headers = [ + "Logging.h", + ], + define_static_target = True, + platforms = [ANDROID], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "fbsource//third-party/qualcomm/qnn:api", + "//executorch/runtime/backend:interface", + "//executorch/runtime/core:core", + ], + exported_deps = [ + "//executorch/backends/qualcomm:schema", + ], + ) + + runtime.cxx_library( + name = "runtime", + srcs = glob( + [ + "*.cpp", + "backends/*.cpp", + "backends/htpbackend/*.cpp", + "backends/htpbackend/aarch64/*.cpp", + ], + exclude = ["Logging.cpp"], + ), + exported_headers = glob( + [ + "*.h", + "backends/*.h", + "backends/htpbackend/*.h", + ], + exclude = ["Logging.h"], + ), + define_static_target = True, + link_whole = True, # needed for executorch/examples/models/llama2:main to register QnnBackend + platforms = [ANDROID], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "fbsource//third-party/qualcomm/qnn:api", + ":logging", + "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm/aot/ir:qcir_utils", + "//executorch/backends/qualcomm/aot/wrappers:wrappers", + "//executorch/runtime/backend:interface", + "//executorch/runtime/core:core", + "//executorch/extension/tensor:tensor", + ], + ) diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index 3712a83fde..4cb2f50bbd 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
set -e +set -o xtrace if [[ -z ${QNN_SDK_ROOT} ]]; then echo "Please export QNN_SDK_ROOT=/path/to/qnn_sdk" @@ -16,7 +17,7 @@ usage() { echo "Usage: Build the aarch64 version of executor runner or the python interface of Qnn Manager" echo "First, you need to set the environment variable for QNN_SDK_ROOT" echo ", and if you want to build the aarch64 version of executor runner" - echo ", you need to set ANDROID_NDK_ROOT" + echo ", you need to export ANDROID_NDK_ROOT=/path/to/android_ndkXX" echo "e.g.: executorch$ ./backends/qualcomm/scripts/build.sh --skip_x86_64" exit 1 } @@ -25,11 +26,12 @@ usage() { [ "$1" = -h ] && usage BUILD_X86_64="true" -CMAKE_X86_64="build_x86_64" +CMAKE_X86_64="build-x86" BUILD_AARCH64="true" -CMAKE_AARCH64="build_android" +CMAKE_AARCH64="build-android" CLEAN="true" BUILD_TYPE="Debug" +BUILD_JOB_NUMBER="16" if [ -z PYTHON_EXECUTABLE ]; then PYTHON_EXECUTABLE="python3" @@ -39,7 +41,7 @@ if [ -z BUCK2 ]; then BUCK2="buck2" fi -long_options=skip_x86_64,skip_aarch64,no_clean,release +long_options=skip_x86_64,skip_aarch64,no_clean,release,job_number: parsed_args=$(getopt -a --options '' --longoptions $long_options --name "$0" -- "$@") eval set -- "$parsed_args" @@ -51,6 +53,7 @@ while true ; do --skip_aarch64) BUILD_AARCH64="false"; shift;; --no_clean) CLEAN="false"; shift;; --release) BUILD_TYPE="Release"; shift;; + --job_number) BUILD_JOB_NUMBER="$2"; shift 2;; --) shift; break;; esac done @@ -59,12 +62,16 @@ PRJ_ROOT="$( cd "$(dirname "$0")/../../.." ; pwd -P)" if [ "$BUILD_AARCH64" = true ]; then if [[ -z ${ANDROID_NDK_ROOT} ]]; then - echo "Please export ANDROID_NDK_ROOT=/path/to/android_ndk" + echo "Please export ANDROID_NDK_ROOT=/path/to/android_ndkXX" exit -1 fi + BUILD_ROOT=$PRJ_ROOT/$CMAKE_AARCH64 if [ "$CLEAN" = true ]; then rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT + else + # Force rebuild flatccrt for the correct platform + cd $BUILD_ROOT/devtools && make clean fi cd $BUILD_ROOT @@ -72,8 +79,9 @@ if [ "$BUILD_AARCH64" = true ]; then -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ @@ -82,7 +90,7 @@ if [ "$BUILD_AARCH64" = true ]; then -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -B$BUILD_ROOT - cmake --build $BUILD_ROOT -j16 --target install + cmake --build $BUILD_ROOT -j$BUILD_JOB_NUMBER --target install EXAMPLE_ROOT=examples/qualcomm CMAKE_PREFIX_PATH="${BUILD_ROOT}/lib/cmake/ExecuTorch;${BUILD_ROOT}/third-party/gflags;" @@ -97,29 +105,46 @@ if [ "$BUILD_AARCH64" = true ]; then -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -B$EXAMPLE_ROOT - cmake --build $EXAMPLE_ROOT -j16 + cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER fi if [ "$BUILD_X86_64" = true ]; then - # Build python interface BUILD_ROOT=$PRJ_ROOT/$CMAKE_X86_64 if [ "$CLEAN" = true ]; then rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT + else + # Force rebuild flatccrt for the correct platform + cd $BUILD_ROOT/devtools && make clean fi + cd $BUILD_ROOT cmake \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_QNN=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + 
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ - -DBUCK2=$BUCK2 \ -S $PRJ_ROOT \ -B $BUILD_ROOT \ - cmake \ - --build $BUILD_ROOT \ - -t "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j16 + cmake --build $BUILD_ROOT -j$BUILD_JOB_NUMBER --target install rm -f $PRJ_ROOT/backends/qualcomm/python/* cp -fv $BUILD_ROOT/backends/qualcomm/Py* "$PRJ_ROOT/backends/qualcomm/python" + + EXAMPLE_ROOT=examples/qualcomm + CMAKE_PREFIX_PATH="${BUILD_ROOT}/lib/cmake/ExecuTorch;${BUILD_ROOT}/third-party/gflags;" + + cmake $PRJ_ROOT/$EXAMPLE_ROOT \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \ + -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ + -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ + -B$EXAMPLE_ROOT + + cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER fi diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qnn_compile_spec_schema.py index 338f61997e..8471aad982 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_schema.py +++ b/backends/qualcomm/serialization/qnn_compile_spec_schema.py @@ -129,7 +129,7 @@ class QnnExecuTorchOptions: library_path: str = "" log_level: QnnExecuTorchLogLevel = QnnExecuTorchLogLevel.kLogOff online_prepare: bool = False - tensor_dump_output_path: str = "" + dump_intermediate_outputs: bool = False profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff shared_buffer: bool = False is_from_context_binary: bool = False diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/schema.fbs index 4288c83b13..4e7fdb56e8 100644 --- a/backends/qualcomm/serialization/schema.fbs +++ b/backends/qualcomm/serialization/schema.fbs @@ -164,11 +164,9 @@ table QnnExecuTorchOptions { /// Check if on-device graph construction. Default is false. online_prepare:bool; - /// Tensor dump output path. If a path is given, Delegate would write - /// outputs of each OP there. - /// In ALL cases, we don't recommend to set this option. - /// This option exist just for debugging some accuracy issues. - tensor_dump_output_path:string; + /// If tensor dump is enabled, all intermediate tensors output will be dumped. + /// This option exists for debugging accuracy issues. Default is off. + dump_intermediate_outputs:bool; /// Profiling level of the delegate and the backend. Default is off. profile_level:QnnExecuTorchProfileLevel; diff --git a/backends/qualcomm/setup.md b/backends/qualcomm/setup.md index b4b0f2ea72..37d8e04c21 100644 --- a/backends/qualcomm/setup.md +++ b/backends/qualcomm/setup.md @@ -1,189 +1,7 @@ # Setting up QNN Backend -This is a tutorial for building and running Qualcomm AI Engine Direct backend, +Please refer to [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md). + +That is a tutorial for building and running Qualcomm AI Engine Direct backend, including compiling a model on a x64 host and running the inference on a Android device. - - -## Prerequisite - -Please finish tutorial [Setting up executorch](../../docs/source/getting-started-setup.md). - - -## Conventions - -`$QNN_SDK_ROOT` refers to the root of Qualcomm AI Engine Direct SDK, -i.e., the directory containing `QNN_README.txt`. - -`$ANDROID_NDK_ROOT` refers to the root of Android NDK. - -`$EXECUTORCH_ROOT` refers to the root of executorch git repository. 
- - -## Environment Setup - -### Download Qualcomm AI Engine Direct SDK - -Navigate to [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) and follow the download button. - -You might need to apply for a Qualcomm account to download the SDK. - -After logging in, search Qualcomm AI Stack at the *Tool* panel. -You can find Qualcomm AI Engine Direct SDK under the AI Stack group. - -Please download the Linux version, and follow instructions on the page to -extract the file. - -The SDK should be installed to somewhere `/opt/qcom/aistack/qnn` by default. - -### Download Android NDK - -Please navigate to [Android NDK](https://developer.android.com/ndk) and download -a version of NDK. We recommend LTS version, currently r25c. - -### Setup environment variables - -We need to make sure Qualcomm AI Engine Direct libraries can be found by -the dynamic linker on x64. Hence we set `LD_LIBRARY_PATH`. In production, -we recommend users to put libraries in default search path or use `rpath` -to indicate the location of libraries. - -Further, we set up `$PYTHONPATH` because it's easier to develop and import executorch Python APIs. Users might also build and install executorch package as usual python package. - -```bash -export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/:$LD_LIBRARY_PATH -export PYTHONPATH=$EXECUTORCH_ROOT/.. -``` - -Note: Since we set `PYTHONPATH`, we may have issue with finding `program.fbs` -and `scalar_type.fbs` when we export a model, because they are installed into -`pip-out` directory with the same package name pattern. A workaround is that -we copy `$EXECUTORCH_ROOT/pip-out/lib.linux-x86_64-cpython-310/executorch/exir/_serialize/program.fbs` -and `$EXECUTORCH_ROOT/pip-out/lib.linux-x86_64-cpython-310/executorch/exir/_serialize/scalar_type.fbs` -to `$EXECUTORCH_ROOT/exir/_serialize/`. - - -## End to End Inference - -### Step 1: Build Python APIs for AOT compilation on x64 - -Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct binary. -Make sure `buck2` is under a directory in `PATH`. - -```bash -cd $EXECUTORCH_ROOT -mkdir build_x86_64 -cd build_x86_64 -cmake .. -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=${QNN_SDK_ROOT} -cmake --build . -t "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j8 - -# install Python APIs to correct import path -# The filename might vary depending on your Python and host version. -cp -f backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python -cp -f backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python -``` - - -### Step 2: Build `qnn_executor_runner` for Android - -`qnn_executor_runner` is an executable running the compiled model. - -You might want to ensure the correct `flatc`. `flatc` can be built along with the above step. For example, we can find `flatc` in `build_x86_64/third-party/flatbuffers/`. - -We can prepend `$EXECUTORCH_ROOT/build_x86_64/third-party/flatbuffers` to `PATH`. Then below cross-compiling can find the correct flatbuffer compiler. - -Commands to build `qnn_executor_runner` for Android: - -```bash -cd $EXECUTORCH_ROOT -mkdir build_android -cd build_android -# build executorch & qnn_executorch_backend -cmake .. 
\ - -DCMAKE_INSTALL_PREFIX=$PWD \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI='arm64-v8a' \ - -DANDROID_NATIVE_API_LEVEL=23 \ - -B$PWD - -cmake --build $PWD -j16 --target install - -cmake ../examples/qualcomm \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI='arm64-v8a' \ - -DANDROID_NATIVE_API_LEVEL=23 \ - -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ - -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ - -Bexamples/qualcomm - -cmake --build examples/qualcomm -j16 -``` -**Note:** If you want to build for release, add `-DCMAKE_BUILD_TYPE=Release` to the `cmake` command options. - -You can find `qnn_executor_runner` under `build_android/examples/qualcomm/`. - - -### Step 3: Compile a model - -``` -python -m examples.qualcomm.scripts.export_example --model_name mv2 -``` - -Then the generated `mv2.pte` can be run on the device by -`build_android/backends/qualcomm/qnn_executor_runner` with Qualcomm AI Engine -Direct backend. - -[**Note**] To get proper accuracy, please apply calibrations with representative -dataset, which could be learnt more from examples under `examples/qualcomm/`. - - -### Step 4: Model Inference - -The backend rely on Qualcomm AI Engine Direct SDK libraries. - -You might want to follow docs in Qualcomm AI Engine Direct SDK to setup the device environment. -Or see below for a quick setup for testing: - -```bash -# make sure you have write-permission on below path. -DEVICE_DIR=/data/local/tmp/executorch_test/ -adb shell "mkdir -p ${DEVICE_DIR}" -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} -``` - -We also need to indicate dynamic linkers on Android and Hexagon where to find these libraries -by setting `ADSP_LIBRARY_PATH` and `LD_LIBRARY_PATH`. - -So, we can run `qnn_executor_runner` like -```bash -adb push mv2.pte ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/build_android/examples/qualcomm/qnn_executor_runner ${DEVICE_DIR} -adb shell "cd ${DEVICE_DIR} \ - && export LD_LIBRARY_PATH=${DEVICE_DIR} \ - && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \ - && ./qnn_executor_runner --model_path ./mv2_qnn.pte" -``` - -You should see the following result. -Note that no output file will be generated in this example. -``` -I 00:00:00.133366 executorch:qnn_executor_runner.cpp:156] Method loaded. -I 00:00:00.133590 executorch:util.h:104] input already initialized, refilling. -I 00:00:00.135162 executorch:qnn_executor_runner.cpp:161] Inputs prepared. -I 00:00:00.136768 executorch:qnn_executor_runner.cpp:278] Model executed successfully. 
-[INFO][Qnn ExecuTorch] Destroy Qnn backend parameters -[INFO][Qnn ExecuTorch] Destroy Qnn context -[INFO][Qnn ExecuTorch] Destroy Qnn device -[INFO][Qnn ExecuTorch] Destroy Qnn backend -``` diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl new file mode 100644 index 0000000000..55fe390f6b --- /dev/null +++ b/backends/qualcomm/targets.bzl @@ -0,0 +1,95 @@ +load( + "@fbsource//tools/build_defs:default_platform_defs.bzl", + "ANDROID", +) +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + + +# Construct the input and output file names. All input and output files rely on scalar_type file. +SCHEMA_NAME = "schema" + +INPUT_SCHEMA = "serialization/" + SCHEMA_NAME + ".fbs" + +OUTPUT_SCHEMA_HEADER = SCHEMA_NAME + "_generated.h" + +SCHEMA_GEN_RULE_NAME = "schema_generated" + +SCHEMA_LIRRARY_NAME = SCHEMA_NAME + +def generate_schema_header(rule_name, srcs, headers, default_header): + """Generate header file given flatbuffer schema + """ + runtime.genrule( + name = rule_name, + srcs = srcs, + # We're only generating a single file, so it seems like we could use + # `out`, but `flatc` takes a directory as a parameter, not a single + # file. Use `outs` so that `${OUT}` is expanded as the containing + # directory instead of the file itself. + outs = {header: [header] for header in headers}, + default_outs = [default_header], + cmd = " ".join([ + "$(exe {})".format(runtime.external_dep_location("flatc")), + "--cpp", + "--cpp-std c++11", + "--gen-mutable", + "--scoped-enums", + "-o ${OUT}", + "${SRCS}", + # Let our infra know that the file was generated. + " ".join(["&& echo // @" + "generated >> ${OUT}/" + header for header in headers]), + ]), + visibility = [], # Private + ) + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + generate_schema_header( + SCHEMA_GEN_RULE_NAME, + [INPUT_SCHEMA], + [OUTPUT_SCHEMA_HEADER], + OUTPUT_SCHEMA_HEADER, + ) + # Header-only library target with the generate executorch program schema header. + runtime.cxx_library( + name = "schema", + srcs = [], + visibility = [ + # Lock this down as tightly as possible to ensure that flatbuffers + # are an implementation detail. Ideally this list would only include + # //executorch/runtime/executor/... 
+ "//executorch/codegen/tools/...", + "//executorch/runtime/executor/...", + "//executorch/backends/qualcomm/...", + "//executorch/backends/qualcomm/runtime/...", + ], + exported_headers = { + OUTPUT_SCHEMA_HEADER: ":{}[{}]".format(SCHEMA_GEN_RULE_NAME, OUTPUT_SCHEMA_HEADER), + }, + exported_external_deps = ["flatbuffers-api"], + define_static_target = True, + platforms = [ANDROID], + ) + + + runtime.cxx_library( + name = "qnn_executorch_backend", + srcs = [], + headers = [], + define_static_target = True, + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "fbsource//third-party/qualcomm/qnn:api", + "//executorch/runtime/backend:interface", + "//executorch/runtime/core:core", + "//executorch/backends/qualcomm/runtime:runtime", + ], + exported_deps = [ + ":schema", + ], + ) diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index fe72b1e893..e448a21928 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -55,6 +55,16 @@ def forward(self, x): return self.avgPool(x) +class BatchNorm(torch.nn.Module): + def __init__(self, n_features): + super().__init__() + self.native_batchnorm = torch.nn.BatchNorm2d(n_features) + self.eval() + + def forward(self, x): + return self.native_batchnorm(x) + + class Bmm(torch.nn.Module): def __init__(self): super().__init__() @@ -203,14 +213,14 @@ def example_inputs(self): class Conv1dSequential(torch.nn.Module): - def __init__(self): + def __init__(self, bias=True): super().__init__() self.first = torch.nn.Conv1d( in_channels=1, out_channels=3, kernel_size=(3), padding=1, - bias=True, + bias=bias, ) self.second = torch.nn.Conv1d( @@ -218,7 +228,7 @@ def __init__(self): out_channels=2, kernel_size=(3), padding=1, - bias=True, + bias=bias, ) def forward(self, x): @@ -315,21 +325,21 @@ def forward(self, x): class Conv2dSequential(torch.nn.Module): - def __init__(self): + def __init__(self, bias=True): super().__init__() self.first = torch.nn.Conv2d( in_channels=1, out_channels=3, kernel_size=(3, 3), padding=1, - bias=True, + bias=bias, ) self.second = torch.nn.Conv2d( in_channels=3, out_channels=2, kernel_size=(3, 3), padding=1, - bias=True, + bias=bias, ) def forward(self, x): @@ -337,14 +347,14 @@ def forward(self, x): class Conv2dSingle(torch.nn.Module): - def __init__(self): + def __init__(self, bias=True): super().__init__() self.conv = torch.nn.Conv2d( in_channels=1, out_channels=3, kernel_size=(3, 3), padding=1, - bias=True, + bias=bias, ) def forward(self, x): @@ -443,6 +453,29 @@ def forward(self, x): return self.hardtanh(x) +class Index(torch.nn.Module): + def __init__(self): + super().__init__() + self.idx0 = torch.tensor([[0, 1], [2, 3], [4, 5]]) + self.idx1 = torch.tensor([[1, 2], [3, 4], [5, 6]]) + + def forward(self, x): + return x[self.idx0] + x[self.idx1] + + +class IndexPut(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer( + "k_cache", + torch.zeros((1, 1024, 12, 64), dtype=torch.float32), + ) + + def forward(self, input_pos, k_val): + k_out = torch.ops.aten.index_put_(self.k_cache, [None, input_pos], k_val) + return k_out + + class LayerNorm(torch.nn.Module): def __init__(self): super().__init__() @@ -711,6 +744,16 @@ def forward(self, x): ) +class RmsNorm(torch.nn.Module): + def __init__(self): + super().__init__() + self.eps = 1e-5 + self.rms = torch.nn.RMSNorm([4], 1e-5) + + def forward(self, x): + return self.rms(x) + + class Rsqrt(torch.nn.Module): def __init__(self): super().__init__() diff --git 
a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index f9d05131bb..d022ac96c4 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import io import json import subprocess import sys @@ -15,6 +16,7 @@ from executorch.backends.qualcomm.tests.utils import ( generate_context_binary, QnnPartitioner, + QnnQuantizer, QuantDtype, TestQNN, to_backend, @@ -32,9 +34,10 @@ from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + skip_annotation, ) -from executorch.examples.qualcomm.scripts.utils import setup_common_args_and_variables +from executorch.examples.qualcomm.utils import setup_common_args_and_variables from executorch.backends.qualcomm.tests.models import * # noqa: F403 @@ -49,8 +52,8 @@ from executorch.examples.models.mobilenet_v3 import MV3Model from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel from executorch.examples.models.wav2letter import Wav2LetterModel +from executorch.exir import to_edge from executorch.exir.backend.backend_api import disable_validation -from executorch.exir.program._program import EdgeCompileConfig, ExirExportedProgram class TestQNNFloatingPointOperator(TestQNN): @@ -65,7 +68,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -80,8 +83,14 @@ def test_qnn_backend_avg_pool2d(self): sample_input = (torch.randn(1, 3, 2, 2),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # noqa: F405 + torch.manual_seed(8) sample_input = (torch.randn([4, 8, 32]), torch.randn([4, 32, 8])) self.lower_module_and_test_output(module, sample_input) @@ -108,14 +117,18 @@ def test_qnn_backend_clamp(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv1d(self): - module = Conv1dSequential() # noqa: F405 + modules = [Conv1dSequential(), Conv1dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3]),) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv2d(self): - module = Conv2dSequential() # noqa: F405 + modules = [Conv2dSequential(), Conv2dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_element_wise_add(self): test_comb = [ @@ -147,6 +160,7 @@ def test_qnn_backend_element_wise_ceil(self): def test_qnn_backend_element_wise_div(self): eps = 1e-03 + torch.manual_seed(8) test_comb = [ { QCOM_MODULE: [Div()], # noqa: F405 @@ -256,6 +270,19 @@ def test_qnn_backend_hardtanh(self): sample_input = (torch.randn([2, 5, 1, 3]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_index(self): + 
module = Index() # noqa: F405 + sample_input = (torch.randn([8, 172, 64]),) + self.lower_module_and_test_output(module, sample_input) + + def test_qnn_backend_index_put(self): + module = IndexPut() # noqa: F405 + sample_input = ( + torch.tensor([2], dtype=torch.int32), + torch.randn([1, 1, 12, 64]), + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_interpolate_bilinear_2d(self): module = ResizeBilinear2D() # noqa: F405 sample_input = (torch.randn(2, 3, 4, 5),) @@ -271,7 +298,6 @@ def test_qnn_backend_layer_norm(self): sample_input = (torch.randn(196, 768),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_leaky_relu(self): test_comb = [ { @@ -314,14 +340,12 @@ def test_qnn_backend_mean_dim(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) - @unittest.skip("it will hang in runtime") + @unittest.skip("failed to lower in QNN 2.26") def test_qnn_backend_mha(self): module = MultiheadAttention() # noqa: F405 sample_input = (torch.randn(1, 197, 96),) self.lower_module_and_test_output(module, sample_input) - # fp16 pad op might hit corner case in runtime - @unittest.expectedFailure def test_qnn_backend_pad(self): module = Pad() # noqa: F405 sample_input = (torch.randn([1, 8, 128]),) @@ -342,7 +366,6 @@ def test_qnn_backend_pow_tensor_scalar(self): sample_input = (torch.rand([2, 4, 3, 3]),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_prelu(self): test_comb = [ { @@ -373,6 +396,11 @@ def test_qnn_backend_reshape(self): sample_input = (torch.randn([3, 4]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -462,13 +490,14 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_chunk_add(self): module = ChunkAdd() # noqa: F405 + torch.manual_seed(8) sample_input = (torch.randn(1, 2, 4, 2),) self.lower_module_and_test_output(module, sample_input) @@ -519,6 +548,7 @@ def test_qnn_backend_simple_model(self): def test_qnn_backend_view_permute_matmul(self): module = ViewPermuteMatMul() # noqa: F405 + torch.manual_seed(8) sample_input = (torch.randn([1, 8, 512]), torch.randn([1, 2, 8, 256])) self.lower_module_and_test_output(module, sample_input) @@ -574,18 +604,20 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_16a4w_conv2d(self): - module = Conv2dSingle() # noqa: F405 + modules = [Conv2dSingle(), Conv2dSingle(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) - module = self.get_qdq_module( - module, sample_input, quant_dtype=QuantDtype.use_16a4w - ) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module( + module, sample_input, 
quant_dtype=QuantDtype.use_16a4w + ) + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_16a4w_linear(self): module = Linear() # noqa: F405 @@ -597,6 +629,7 @@ def test_qnn_backend_16a4w_linear(self): ) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in QNN 2.26") def test_qnn_backend_16a4w_per_channel_linear(self): module = Linear(use_bias=False) # noqa: F405 sample_input = (torch.randn([3, 4]),) @@ -631,16 +664,16 @@ def test_qnn_backend_avg_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_bmm(self): - module = Bmm() # noqa: F405 - sample_input = (torch.randn([4, 8, 32]), torch.randn([4, 32, 8])) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("not applicable") - def test_qnn_backend_cast(self): - module = Cast() # noqa: F405 - sample_input = (10 * torch.rand((9, 4, 5, 3)),) + def test_qnn_backend_bmm(self): + module = Bmm() # noqa: F405 + torch.manual_seed(8) + sample_input = (torch.randn([4, 8, 32]), torch.randn([4, 32, 8])) module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) @@ -665,16 +698,20 @@ def test_qnn_backend_clamp(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv1d(self): - module = Conv1dSequential() # noqa: F405 + modules = [Conv1dSequential(), Conv1dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3]),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_conv2d(self): - module = Conv2dSequential() # noqa: F405 + modules = [Conv2dSequential(), Conv2dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_element_wise_add(self): test_comb = [ @@ -708,6 +745,7 @@ def test_qnn_backend_element_wise_ceil(self): def test_qnn_backend_element_wise_div(self): eps = 1e-03 + torch.manual_seed(8) test_comb = [ { QCOM_MODULE: [Div()], # noqa: F405 @@ -827,6 +865,21 @@ def test_qnn_backend_hardtanh(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_index(self): + module = Index() # noqa: F405 + sample_input = (torch.randn([8, 172, 64]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + + def test_qnn_backend_index_put(self): + module = IndexPut() # noqa: F405 + sample_input = ( + torch.tensor([2], dtype=torch.int32), + torch.randn([1, 1, 12, 64]), + ) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_interpolate_bilinear_2d(self): module = ResizeBilinear2D() # noqa: F405 sample_input = (torch.randn(2, 3, 
4, 5),) @@ -955,6 +1008,14 @@ def test_qnn_backend_reshape(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + module = self.get_qdq_module( + module, sample_input, quant_dtype=QuantDtype.use_16a4w + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -1060,13 +1121,14 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) def test_qnn_backend_chunk_add(self): module = ChunkAdd() # noqa: F405 + torch.manual_seed(8) sample_input = (torch.randn(1, 1, 4, 2),) module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) @@ -1127,6 +1189,7 @@ def test_qnn_backend_simple_model(self): def test_qnn_backend_view_permute_matmul(self): module = ViewPermuteMatMul() # noqa: F405 + torch.manual_seed(8) sample_input = (torch.randn([1, 8, 512]), torch.randn([1, 2, 8, 256])) module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) @@ -1224,6 +1287,22 @@ def setUp(self): saver=False, ) + def test_qnn_backend_dump_intermediate_outputs(self): + backend_options = generate_htp_compiler_spec(use_fp16=True) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + dump_intermediate_outputs=True, + ) + module = Relu() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + expected_intermediate_events=3, + ) + def test_qnn_backend_skip_node_id(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -1282,20 +1361,13 @@ def test_qnn_backend_multi_contexts_composite(self): lowered_method=to_backend, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - ).to_edge( - EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. 
- ) ) - canonicalize_program(edge_prog.exported_program) + canonicalize_program(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) - @unittest.expectedFailure def test_qnn_backend_profile_op(self): TestQNN.enable_profile = True backend_options = generate_htp_compiler_spec(use_fp16=True) @@ -1310,7 +1382,7 @@ def test_qnn_backend_profile_op(self): module, sample_input, expected_partitions=1, - expected_profile_events=25, + expected_profile_events=24, ) def test_qnn_backend_shared_buffer(self): @@ -1342,6 +1414,7 @@ def test_qnn_backend_online_prepare(self): sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -1385,7 +1458,24 @@ def setUp(self): saver=False, ) - def test_qnn_backend_skip_node_id(self): + def test_qnn_backend_dump_intermediate_outputs(self): + backend_options = generate_htp_compiler_spec(use_fp16=False) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + dump_intermediate_outputs=True, + ) + module = Relu() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + expected_intermediate_events=5, + ) + + def test_qnn_backend_skip_node_id_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) module = self.get_qdq_module(module, sample_input) @@ -1396,7 +1486,43 @@ def test_qnn_backend_skip_node_id(self): skip_node_id_set={"aten_add_tensor", "aten_mean_dim"}, ) - def test_qnn_backend_skip_node_op(self): + def test_qnn_backend_skip_node_id_quantizer(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowererd graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_id_set={"conv2d"}, + ) + self.assertEqual(len(exported_progs), 1) + # lower all graph again, the skipped operators will be left in CPU + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + + def test_qnn_backend_skip_node_op_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) module = self.get_qdq_module(module, sample_input) @@ -1407,6 +1533,79 @@ def test_qnn_backend_skip_node_op(self): skip_node_op_set={"aten.add.Tensor"}, ) + def test_qnn_backend_skip_node_op_quantizer(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), 
torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowered graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_op_set={torch.ops.aten.add.Tensor}, + ) + self.assertEqual(len(exported_progs), 2) + # lower the whole graph again; the skipped operators will be left on CPU + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + + def test_qnn_backend_graph_level_mixed_precision(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowered graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_id_set={"add", "mean"}, + fallback_to_cpu=False, + ) + self.assertEqual(len(exported_progs), 5) + # lower the whole graph again; the skipped operators will be delegated with fp16 + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + def test_qnn_backend_multi_contexts(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -1447,20 +1646,13 @@ def test_qnn_backend_multi_contexts_composite(self): quantize_method=self.get_qdq_module, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - ).to_edge( - EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. 
- ) ) - canonicalize_program(edge_prog.exported_program) + canonicalize_program(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) - @unittest.expectedFailure def test_qnn_backend_profile_op(self): TestQNN.enable_profile = True backend_options = generate_htp_compiler_spec(use_fp16=False) @@ -1476,7 +1668,7 @@ def test_qnn_backend_profile_op(self): module, sample_input, expected_partitions=1, - expected_profile_events=26, + expected_profile_events=25, ) def test_qnn_backend_shared_buffer(self): @@ -1510,6 +1702,7 @@ def test_qnn_backend_online_prepare(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -1623,6 +1816,46 @@ def test_gMLP(self): self.assertGreaterEqual(msg["top_1"], 60) self.assertGreaterEqual(msg["top_5"], 90) + def test_regnet(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + + weights = ["regnet_y_400mf", "regnet_x_400mf"] + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/regnet.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + for weight in weights: + p = subprocess.Popen( + cmds + ["--weights", weight], stdout=subprocess.DEVNULL + ) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 85) + def test_ssd300_vgg16(self): if not self.required_envs([self.pretrained_weight, self.oss_repo]): self.skipTest("missing required envs") @@ -1769,6 +2002,245 @@ def test_squeezenet(self): self.assertGreaterEqual(msg["top_5"], 70) +class TestExampleQaihubScript(TestQNN): + + def required_envs(self, conditions=None) -> bool: + conditions = [] if conditions is None else conditions + return all( + [ + self.executorch_root, + self.artifact_dir, + *conditions, + ] + ) + + def test_utils_export(self): + with tempfile.TemporaryDirectory() as tmp_dir: + module = ContextBinaryExample() # noqa: F405 + generate_context_binary( + module=module, + inputs=module.example_inputs(), + quantized=True, + artifact_dir=tmp_dir, + ) + ctx_path = f"{tmp_dir}/model_ctx.bin" + fpath = f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/utils/export.py" + + # do compilation + compile_cmds = [ + "python", + fpath, + "compile", + "-a", + ctx_path, + "-m", + self.model, + "-l", + "False", + "-b", + self.build_folder, + "-o", + f"{tmp_dir}/output_pte", + ] + compile_process = subprocess.Popen( + compile_cmds, stdout=subprocess.DEVNULL, cwd=self.executorch_root + ) + output_pte_dir = f"{tmp_dir}/output_pte/model_ctx" + compile_process.communicate() + + # check artifacts are correctly generated + self.assertTrue( + all( + [ + Path(output_pte_dir).exists(), + Path(f"{output_pte_dir}/model_ctx.json").exists(), + Path(f"{output_pte_dir}/model_ctx.svg").exists(), + ] + ) + ) + + # prepare input files + input_list, inputs = [], 
module.example_inputs() + for name, tensor in inputs.items(): + tensor_path = f"{output_pte_dir}/{name}.pt" + torch.save(tensor, tensor_path) + input_list.append(tensor_path) + + # do execution + output_data_dir = f"{tmp_dir}/output_data" + execute_cmds = [ + "python", + fpath, + "execute", + "-p", + output_pte_dir, + "-i", + *input_list, + "-s", + self.device, + "-z", + "-b", + self.build_folder, + "-o", + output_data_dir, + ] + if self.host is not None: + execute_cmds.append(f"-H {self.host}") + execute_process = subprocess.Popen(execute_cmds, cwd=self.executorch_root) + execute_process.communicate() + + # read outputs + with open(f"{output_pte_dir}/model_ctx.json", "r") as f: + graph_info = json.load(f) + + device_output = [] + for output in graph_info["outputs"]: + with open(f"{output_data_dir}/{output['name']}.pt", "rb") as f: + buffer = io.BytesIO(f.read()) + device_output.append(torch.load(buffer, weights_only=False)) + + # validate outputs + golden_output = module.forward(inputs["x"], inputs["y"]) + self.atol, self.rtol = 1e-1, 1 + self._assert_outputs_equal(golden_output, device_output) + + def test_llama2_7b(self): + if not self.required_envs(): + self.skipTest("missing required envs") + + prompt = "Explain the rules of baseball" + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--tokenizer_bin", + f"{self.artifact_dir}/tokenizer.bin", + "--context_binaries", + f"{self.artifact_dir}", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + model_out = msg["result"] + self.assertTrue(model_out.startswith(prompt)) + + def test_llama3_8b(self): + if not self.required_envs(): + self.skipTest("missing required envs") + + prompt = "Explain the rules of baseball" + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--tokenizer_model", + f"{self.artifact_dir}/tokenizer.model", + "--context_binaries", + f"{self.artifact_dir}", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + model_out = msg["result"] + expected_result = ( + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + + prompt + + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + ) + self.assertTrue(model_out.startswith(expected_result)) + + def test_stable_diffusion(self): + if not self.required_envs(): + self.skipTest("missing required envs") + + prompt = "a photo of an astronaut riding a horse on mars" + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py", + "--artifact", + 
self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--text_encoder_bin", + f"{self.artifact_dir}/text_encoder.serialized.bin", + "--unet_bin", + f"{self.artifact_dir}/unet.serialized.bin", + "--vae_bin", + f"{self.artifact_dir}/vae.serialized.bin", + "--vocab_json", + f"{self.artifact_dir}/vocab.json", + "--num_time_steps", + "20", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + "--fix_latents", + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + # For the default settings and prompt, the expected results will be {PSNR: 23.258, SSIM: 0.852} + self.assertGreaterEqual(msg["PSNR"], 20) + self.assertGreaterEqual(msg["SSIM"], 0.8) + + class TestExampleScript(TestQNN): def required_envs(self, conditions=None) -> bool: conditions = [] if conditions is None else conditions @@ -1967,7 +2439,7 @@ def test_vit(self): if "Error" in msg: self.fail(msg["Error"]) else: - self.assertGreaterEqual(msg["top_1"], 70) + self.assertGreaterEqual(msg["top_1"], 65) self.assertGreaterEqual(msg["top_5"], 90) def test_edsr(self): @@ -2051,7 +2523,7 @@ def test_stories_single_llama(self): cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/llama2/llama.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama2/llama.py", "--artifact", self.artifact_dir, "--build_folder", @@ -2094,6 +2566,7 @@ def test_stories_single_llama(self): model_out = msg["result"][0] self.assertTrue(model_out.startswith(golden_start_with)) + @unittest.skip("dynamic shape inputs appear in recent torch.export.export") def test_mobilebert(self): if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2134,13 +2607,8 @@ def test_mobilebert(self): for k, v in cpu.items(): self.assertLessEqual(abs(v[0] - htp[k][0]), 2) - @unittest.skip("will be enabled after TODOs got resolved") + @unittest.skip("eagar mode fake quant works well, need further investigation") def test_ptq_mobilebert(self): - # TODO: 2 approaches to resolve accuracy issue - # 1. fallback embedding layers: - # - skip annotation in quantizer (need PR to provide helper funciton) - # - skip operators in partitioner (use existent "skip_node_op_set") - # 2. 
investigate different quantization configurations / mechanisms if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2157,6 +2625,8 @@ def test_ptq_mobilebert(self): self.model, "--pretrained_weight", self.pretrained_weight, + "--ptq", + "16a16w", "--ip", self.ip, "--port", @@ -2260,6 +2730,12 @@ def setup_environment(): help="Path to open source software model repository", type=str, ) + parser.add_argument( + "-x", + "--enable_x86_64", + help="Enable unittest to be executed on x86_64 platform", + action="store_true", + ) args, ns_args = parser.parse_known_args(namespace=unittest) TestQNN.host = args.host @@ -2276,6 +2752,8 @@ def setup_environment(): TestQNN.error_only = args.error_only TestQNN.oss_repo = args.oss_repo TestQNN.shared_buffer = args.shared_buffer + TestQNN.enable_x86_64 = args.enable_x86_64 + TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index f31f07562b..7209b0a267 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -27,7 +27,12 @@ QcomChipset, ) from executorch.backends.qualcomm.utils.utils import capture_program -from executorch.examples.qualcomm.scripts.utils import SimpleADB +from executorch.devtools import generate_etrecord, Inspector +from executorch.examples.qualcomm.utils import ( + generate_inputs, + make_output_dir, + SimpleADB, +) from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.compile_spec_schema import CompileSpec @@ -35,9 +40,7 @@ from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.pass_base import ExportPass from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.exir.program._program import ExecutorchProgram -from executorch.sdk import generate_etrecord -from executorch.sdk.inspector import Inspector +from executorch.exir.program import ExecutorchProgram, ExecutorchProgramManager from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -133,6 +136,7 @@ class TestQNN(unittest.TestCase): use_16a16w: str = "16a16w" use_16a4w: str = "16a4w" shared_buffer: bool = False + enable_x86_64: bool = False def _assert_outputs_equal(self, model_output, ref_output): self.assertTrue(len(ref_output) == len(model_output)) @@ -176,18 +180,21 @@ def _save_model_and_expected_output( return input_list, ref_outputs, pte_fname - def verify_output( + def verify_output( # noqa: C901 self, module: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], executorch_prog: ExecutorchProgram | LoweredBackendModule, etrecord_path: str = "etrecord.bin", expected_profile_events: int = -1, + expected_intermediate_events: int = -1, ): with tempfile.TemporaryDirectory() as tmp_dir: buffer = ( executorch_prog.buffer - if isinstance(executorch_prog, ExecutorchProgram) + if isinstance( + executorch_prog, (ExecutorchProgram, ExecutorchProgramManager) + ) else executorch_prog.buffer() ) ( @@ -201,16 +208,17 @@ def verify_output( tmp_dir, ) - device_output_dir = f"{tmp_dir}/outputs" - device_outputs = [] + output_dir = f"{tmp_dir}/outputs" + outputs = [] etdump_path = f"{tmp_dir}/etdump.etdp" + debug_output_path = f"{tmp_dir}/debug_output.bin" def post_process(): - for i, f in enumerate(sorted(os.listdir(device_output_dir))): - filename = os.path.join(device_output_dir, f) + for i, f in enumerate(sorted(os.listdir(output_dir))): + filename = 
os.path.join(output_dir, f) output = np.fromfile(filename, dtype=ref_outputs[i].numpy().dtype) output = torch.from_numpy(output).reshape(ref_outputs[i].shape) - device_outputs.append(output) + outputs.append(output) def validate_profile(): inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) @@ -218,23 +226,99 @@ def validate_profile(): len(inspector.to_dataframe().index) == expected_profile_events ) - adb = SimpleADB( - qnn_sdk=os.getenv("QNN_SDK_ROOT"), - build_path=self.build_folder, - pte_path=pte_fname, - workspace="/data/local/tmp/qnn_executorch_test", - device_id=self.device, - host_id=self.host, - soc_model=self.model, - error_only=self.error_only, - ) - adb.push(inputs=[sample_inputs], input_list=input_list) - adb.execute() - adb.pull(output_path=tmp_dir, callback=post_process) - self._assert_outputs_equal(device_outputs, ref_outputs) + def validate_intermediate_tensor(): + inspector = Inspector( + etdump_path=etdump_path, debug_buffer_path=debug_output_path + ) + for event_block in inspector.event_blocks: + if event_block.name == "Execute": + self.assertTrue( + len(event_block.events) == expected_intermediate_events + ) + + if self.enable_x86_64: + generate_inputs(tmp_dir, "input_list.txt", [sample_inputs], input_list) + make_output_dir(output_dir) + + target = "x86_64-linux-clang" + qnn_sdk = os.environ.get("QNN_SDK_ROOT", None) + assert qnn_sdk, "QNN_SDK_ROOT was not found in environment variable" + + build_folder = self.build_folder + if os.path.isabs(self.build_folder): + # obey user's opinion + pass + else: + # ok, assuming the user give a relative path to cwd + build_folder = os.path.join(os.getcwd(), self.build_folder) + + cmd = [ + # qnn_executor_runner + f"{build_folder}/examples/qualcomm/executor_runner/qnn_executor_runner", + "--model_path", + f"{pte_fname}", + "--input_list_path", + f"{tmp_dir}/input_list.txt", + "--output_folder_path", + f"{output_dir}", + ] + + env = dict(os.environ) + env["LD_LIBRARY_PATH"] = f"{qnn_sdk}/lib/{target}/:{build_folder}/lib" + proc = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env, + cwd=tmp_dir, + ) - if expected_profile_events != -1: - adb.pull_etdump(etdump_path, callback=validate_profile) + self.assertEqual( + proc.returncode, + 0, + f"The process running qnn_executorch_runner return {proc.returncode}, " + "STDOUT=\n" + f"{proc.stdout.decode('utf-8')}", + ) + + # Verify the outputs + post_process() + self._assert_outputs_equal(outputs, ref_outputs) + + # Verify the etdump + if expected_profile_events != -1: + validate_profile() + + if expected_intermediate_events != -1: + validate_intermediate_tensor() + else: + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=self.build_folder, + pte_path=pte_fname, + workspace="/data/local/tmp/qnn_executorch_test", + device_id=self.device, + host_id=self.host, + soc_model=self.model, + error_only=self.error_only, + dump_intermediate_outputs=( + True if expected_intermediate_events != -1 else False + ), + ) + adb.push(inputs=[sample_inputs], input_list=input_list) + adb.execute() + adb.pull(output_path=tmp_dir, callback=post_process) + self._assert_outputs_equal(outputs, ref_outputs) + + if expected_profile_events != -1: + adb.pull_etdump(etdump_path, callback=validate_profile) + + if expected_intermediate_events != -1: + adb.pull_debug_output( + etdump_path, + debug_output_path, + callback=validate_intermediate_tensor, + ) def lower_module_and_test_output( self, @@ -242,6 +326,7 @@ def lower_module_and_test_output( 
sample_inputs: Tuple[torch.Tensor], expected_partitions: int = 1, expected_profile_events: int = -1, + expected_intermediate_events: int = -1, assert_output_equal: bool = True, skip_node_id_set: set = None, skip_node_op_set: set = None, @@ -265,7 +350,6 @@ def lower_module_and_test_output( # Therefore, won't want to pre-allocate # by memory manager in runtime. memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=not self.shared_buffer, alloc_graph_output=not self.shared_buffer, ), @@ -286,11 +370,19 @@ def lower_module_and_test_output( etrecord_path = "etrecord.bin" if self.enable_profile: generate_etrecord(etrecord_path, edge_copy, exec_prog) - # Check numerics - if assert_output_equal or expected_profile_events != -1: + if ( + assert_output_equal + or expected_profile_events != -1 + or expected_intermediate_events != -1 + ): self.verify_output( - module, sample_inputs, exec_prog, etrecord_path, expected_profile_events + module, + sample_inputs, + exec_prog, + etrecord_path, + expected_profile_events, + expected_intermediate_events, ) def get_qdq_module( @@ -362,6 +454,8 @@ def _insert_clone( (node,), ) inserted_node.meta["val"] = node.meta["val"] + if "quant_attrs" in node.meta: + inserted_node.meta["quant_attrs"] = node.meta["quant_attrs"] for user in users: user.replace_input_with(node, inserted_node) diff --git a/backends/qualcomm/utils/constants.py b/backends/qualcomm/utils/constants.py index 58538eb91e..9875c9f5af 100644 --- a/backends/qualcomm/utils/constants.py +++ b/backends/qualcomm/utils/constants.py @@ -7,16 +7,23 @@ # Qualcomm specific key # constants in backends/qualcomm/passes & backends/qualcomm/builders +QCOM_AXIS = "axis" QCOM_AXIS_ORDER = "axis_order" QCOM_BITWIDTH = "bitwidth" QCOM_DATA = "data" +QCOM_DTYPE = "dtype" QCOM_ENCODING = "encoding" QCOM_INSERTED_PERMUTE = "qnn_permute" +QCOM_OFFSET = "offset" QCOM_QUANTIZED_IO = "q_tensor_io" QCOM_QUANT_ATTRS = "quant_attrs" +QCOM_QUANT_MIN = "quant_min" +QCOM_QUANT_MAX = "quant_max" QCOM_REQUANTIZE = "requantize" +QCOM_SCALE = "scale" QCOM_SCALES = "scales" QCOM_SCALE_OFFSET = "scale_offset" +QCOM_ZERO_POINT = "zero_point" QCOM_ZERO_POINTS = "zero_points" # constants in backends/qualcomm/tests diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 85b965a146..a0c0abf729 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
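[Editorial aside, not part of the patch] The ETDump assertions added to verify_output above can be expressed as two standalone helpers built only on the executorch.devtools Inspector API that this diff already imports; the file paths below are placeholders.

```python
# Sketch of the two ETDump checks used in verify_output(): profile-event count
# and intermediate-output ("Execute" block) event count. Paths are placeholders.
from executorch.devtools import Inspector


def count_profile_events(etdump_path: str, etrecord_path: str) -> int:
    # One row of the dataframe per profiled operator event.
    inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path)
    return len(inspector.to_dataframe().index)


def count_intermediate_events(etdump_path: str, debug_output_path: str) -> int:
    # Intermediate-output events are reported under the "Execute" event block.
    inspector = Inspector(
        etdump_path=etdump_path, debug_buffer_path=debug_output_path
    )
    return sum(
        len(block.events)
        for block in inspector.event_blocks
        if block.name == "Execute"
    )


# Usage mirroring the expectations in the tests above:
# assert count_profile_events("etdump.etdp", "etrecord.bin") == 25
```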
+import operator +import warnings from collections import OrderedDict from typing import Callable, Dict, List, Tuple @@ -38,7 +40,11 @@ from executorch.backends.qualcomm.passes.recompose_pixel_unshuffle import ( RecomposePixelUnshuffle, ) +from executorch.backends.qualcomm.passes.recompose_rms_norm import RecomposeRmsNorm from executorch.backends.qualcomm.passes.remove_redundancy import RemoveRedundancy +from executorch.backends.qualcomm.passes.replace_index_put_input import ( + ReplaceIndexPutInput, +) from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( _soc_info_table, QcomChipset, @@ -56,6 +62,7 @@ convert_to_option, ) from executorch.backends.qualcomm.utils.constants import QCOM_QNN_COMPILE_SPEC + from executorch.exir import ExirExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.lowered_backend_module import LoweredBackendModule @@ -63,9 +70,74 @@ from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions from torch.export.exported_program import ExportedProgram from torch.fx import passes +from torch.fx.passes.operator_support import OperatorSupportBase from torch.library import Library +class _AnnotationSkipper(OperatorSupportBase): + """ + Class used to partition out unwanted graph nodes. + e.g. - nodes are prevented from quantization annotation + - nodes have been grouped together as a submodule + + Attributes + ---------- + fp_node_id_set : set + a set contains nodes' name to be left in fp precision + fp_node_op_set : set + a set contains nodes' target (aten dialect) to be left in fp precision + skip_annotated_submodule : bool + flag to skip annotated submodule or not + + Methods + ------- + should_delegate(n: torch.fx.Node) + identify the residual nodes haven't be lowered with fixed-precision + should_skip(n: torch.fx.Node) + identify the nodes should be kept out with fixed-precision or not + is_node_supported(_, node: torch.fx.Node) + overridden method for graph partitioning + """ + + def __init__( + self, + fp_node_id_set: set = None, + fp_node_op_set: set = None, + skip_annotated_submodule: bool = False, + ): + self.fp_node_id_set = fp_node_id_set + self.fp_node_op_set = fp_node_op_set + self.skip_annotated_submodule = skip_annotated_submodule + + def should_delegate(self, n: torch.fx.Node): + return n.op == "call_function" and n.target != operator.getitem + + def should_skip(self, n: torch.fx.Node): + return n.name in self.fp_node_id_set or n.target in self.fp_node_op_set + + def is_node_supported(self, _, node: torch.fx.Node) -> bool: + if self.skip_annotated_submodule: + if node.op == "get_attr": + return all(self.should_delegate(user) for user in node.users) + return self.should_delegate(node) + + if any( + [ + node.op in ("placeholder", "output"), + self.should_skip(node), + # check if parameters belong to fallbacked operator + ( + node.op == "get_attr" + and all(self.should_skip(user) for user in node.users) + ), + ] + ): + print(f"[QNN Quantizer Annotation]: {node.name} | Skipped") + return False + + return True + + def qnn_capture_config(): return exir.CaptureConfig(enable_aot=True) @@ -121,6 +193,7 @@ def replace_linear(module: torch.nn.Module): def canonicalize_program( exported_program: ExportedProgram | List[LoweredBackendModule], + custom_buffer_size=None, ): # check if user specifies to use multi_contexts # this is a generic approach in case there exists multiple backends @@ -140,7 +213,12 @@ def process_exported_program(prog): return max_sf_buf_size, 
module_map def process_lowered_module(module): - return len(module.processed_bytes), { + spill_fill_size = ( + len(module.processed_bytes) + if custom_buffer_size is None + else custom_buffer_size + ) + return spill_fill_size, { module: convert_to_option(module.compile_specs[0].value) } @@ -178,8 +256,10 @@ def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]: # The below super ops are supported by QNN remove_decompositions = [ torch.ops.aten.pixel_shuffle.default, + torch.ops.aten.pixel_unshuffle.default, torch.ops.aten.hardsigmoid.default, torch.ops.aten.hardswish.default, + torch.ops.aten._safe_softmax.default, ] for key in remove_decompositions: @@ -195,6 +275,7 @@ def _transform(edge_program: ExportedProgram) -> None: graph_module = edge_program.graph_module RemoveRedundancy()(graph_module) RecomposePixelUnshuffle()(graph_module) + RecomposeRmsNorm()(graph_module) ConvertToLinear()(graph_module) ConvertPReLU(edge_program)(graph_module) ConvertBmmToMatmul()(graph_module) @@ -205,6 +286,14 @@ def _transform(edge_program: ExportedProgram) -> None: AnnotateDecomposed(edge_program)(graph_module) FoldQDQ()(graph_module) LayoutTransform(edge_program)(graph_module) + ReplaceIndexPutInput(edge_program)(graph_module) + + # Since QDQ nodes are stripped, update graph signature again to validate program + edge_program._graph_signature = _get_updated_graph_signature( + edge_program.graph_signature, + edge_program.graph_module, + ) + edge_program._validate() def capture_program( @@ -222,16 +311,291 @@ def capture_program( core_ep.transform(ConvertBinaryOpsWithScalar()) edge_ep = core_ep.to_edge(qnn_edge_config()) _transform(edge_ep.exported_program) - # Since QDQ nodes are stripped, update graph signature again to validate program - edge_ep.exported_program._graph_signature = _get_updated_graph_signature( - edge_ep.exported_program.graph_signature, - edge_ep.exported_program.graph_module, - ) - edge_ep.exported_program._validate() return edge_ep -def from_context_binary(ctx_path: str, op_name: str): +def _partition_graph_into_submodules(gm, subgm_tag, subgm_cb, ptn): + from torch.fx.passes.utils.fuser_utils import ( + erase_nodes, + fuse_as_graphmodule, + insert_subgm, + legalize_graph, + topo_sort, + ) + + partitions = ptn.propose_partitions() + # insert meta for each partition group + for i, partition in enumerate(partitions): + for node in partition.nodes: + node.meta[subgm_tag] = i + + for i in range(len(partitions)): + # find nodes with same group id in current graph + node_list = [ + node for node in gm.graph.nodes if node.meta.get(subgm_tag, "") == i + ] + # fuse group nodes into submodule + sorted_nodes = topo_sort(node_list) + submodule_name = f"{subgm_tag}_{i}" + subgm, orig_inputs, orig_outputs = fuse_as_graphmodule( + gm, sorted_nodes, submodule_name + ) + # insert submodule & trim group nodes + gm = insert_subgm( + gm, + subgm_cb(subgm, submodule_name), + orig_inputs, + orig_outputs, + ) + erase_nodes(gm, sorted_nodes) + legalize_graph(gm) + + gm.recompile() + return gm + + +def _canonicalize_graph_with_lowered_module(gm, subgm_tag, ptn): + from executorch.exir.backend.backend_api import to_backend + + # return lowered program for user to debug + exported_progs = [] + # partition each submodule which went through convert_pt2e + for node in gm.graph.nodes: + if node.op == "call_module" and subgm_tag in node.name: + # obtain sample inputs through meta + subgm_input = [ + torch.ones(arg.meta["val"].shape, dtype=arg.meta["val"].dtype) + for arg in node.args + ] + # program 
meets QNN backend requirement + sub_prog = capture_program(gm.get_submodule(node.name), tuple(subgm_input)) + # start lowering with given partitioner + exported_progs.append(to_backend(sub_prog.exported_program, ptn)) + # replace submodule with lowered module + gm.set_submodule( + node.name, + exported_progs[-1].graph_module, + ) + # if node has multiple outputs, getitems will be default generated + if all(n.target != operator.getitem for n in node.users): + with gm.graph.inserting_after(node): + getitem_node = gm.graph.call_function( + operator.getitem, + (node, 0), + ) + getitem_node.meta = node.meta + node.replace_all_uses_with( + replace_with=getitem_node, + delete_user_cb=lambda user: user.target != operator.getitem, + ) + + gm.recompile() + return gm, exported_progs + + +def skip_annotation( + nn_module: torch.nn.Module, + quantizer, + partitioner, + sample_input: Tuple[torch.Tensor, ...], + calibration_cb: Callable[[torch.fx.GraphModule], None], + fp_node_id_set: set = None, + fp_node_op_set: set = None, + fallback_to_cpu: bool = True, +): + r""" + Exclude speific operators from quantizer annotation. + Skipped operators will defaultly stay in CPU, set 'fallback_to_cpu' + to False for trying to delegate them with FP16 precision. + + e.g.: consider following graph: + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \_______ _______/ + add_1 + (torch.ops.aten.add.default) + | + output + + If user wants to skip convolution op by names with + 'skip_node_id_set' = {"conv2d_1"} + "bias_1 / weight_1 / input_1 / input_2 / conv2d_1" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 input_2 + | (placeholder) | | + \ | / | + \ | / | + \ | / | + conv2d_1 | + \ / + \ / + \ / + lowered_module_1 + (QNN fixed precision) + | + output + + If user wants to skip convolution op by target with + 'skip_node_op_set' = {torch.ops.aten.conv2d.default} + "bias_1 / weight_1 / input_1 / conv2d_1, + bias_2 / weight_2 / input_2 / conv2d_2" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \__ __/ + lowered_module_1 + (QNN fixed precision) + | + output + + If user wants to delegate the skipped conv2d from above graph + with 'fallback_to_cpu' = False: + + [Generated graph] + input_1 input_2 + (placeholder) (placeholder) + | | + \ / + lowered_module_2 + (QNN fp16 precision) + | + | + lowered_module_1 + (QNN fixed precision) + | + output + + Args: + nn_module (torch.nn.Module): The module to be lowered. + quantizer (QnnQuantizer): Instance of QnnQuantizer. + partitioner (QnnPartitioner): Instance of QnnPartitioner. + sample_input ((torch.Tensor, ...)): Sample input tensors for graph exporting. + calibration_cb (callable): Callback function for user-defined calibration. + fp_node_id_set ({str, ...}): Set of operator names to be left in fp precision. + fp_node_op_set ({torch.ops.aten.xxx, ...}): Set of operator targets to be left in fp precision. + fallback_to_cpu (bool): Whether to lower skipped nodes to fp16 or not. + + Returns: + exported_programs: List of programs lowered to QnnBackend (quantized graphs only). 
+ """ + from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( + QnnExecuTorchHtpPrecision, + ) + from executorch.backends.qualcomm.serialization.qnn_compile_spec_serialize import ( + convert_to_option, + ) + from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner + + def prepare_subgm(subgm, subgm_name): + # prepare current submodule for quantization annotation + subgm_prepared = prepare_pt2e(subgm, quantizer) + # overwrite this attribute or name will be set to "GraphModule" + # we could not identify each submodule if action is not performed + subgm_prepared.__class__.__name__ = subgm_name + return subgm_prepared + + fp_node_id_set = fp_node_id_set if fp_node_id_set is not None else set() + fp_node_op_set = fp_node_op_set if fp_node_op_set is not None else set() + graph_module = torch.export.export(nn_module, sample_input).module() + # define node support type + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(fp_node_id_set, fp_node_op_set), + allows_single_node_partition=True, + ) + subgm_tag = "annotated_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=prepare_subgm, + ptn=capability_partitioner, + ) + # perform calibration + calibration_cb(graph_module) + # convert sub modules which went through prepare_pt2e + for node in graph_module.graph.nodes: + if node.op == "call_module": + graph_module.set_submodule( + node.name, convert_pt2e(graph_module.get_submodule(node.name)) + ) + # canonicalize graph for lowering again + graph_module, exported_progs = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + + if not fallback_to_cpu: + try: + from executorch.exir.backend.partitioner import DelegationSpec + + # change HTP compiler spec for hardware to enable fp16 + qnn_option = generate_qnn_executorch_option( + partitioner.compiler_specs_snapshot + ) + compile_option = convert_to_option(qnn_option) + htp_options = compile_option.backend_options.htp_options + htp_options.precision = QnnExecuTorchHtpPrecision.kHtpFp16 + partitioner.delegation_spec = DelegationSpec( + "QnnBackend", + [ + CompileSpec( + QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(compile_option) + ) + ], + ) + except: + print( + "Failed to change HTP compiler spec with 'use_fp16' as True," + " skipped operators will fallback to cpu," + ) + return graph_module, exported_progs + + # try lowering skipped operator into fp16 + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(skip_annotated_submodule=True), + allows_single_node_partition=True, + ) + subgm_tag = "skipped_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=lambda subgm, _: subgm, + ptn=capability_partitioner, + ) + graph_module, exported_progs_fp = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + exported_progs.extend(exported_progs_fp) + + return graph_module, exported_progs + + +def from_context_binary( + ctx_path: str, op_name: str, soc_model: QcomChipset = QcomChipset.SM8650 +): def implement_op(custom_op, op_name, outputs): @torch.library.impl( custom_op, str(op_name), dispatch_key="CompositeExplicitAutograd" @@ -282,7 +646,7 @@ def build_tensor(tensors, dtype_map): # dummy compiler spec would be fine, since we're not compiling 
backend_options = generate_htp_compiler_spec(use_fp16=False) compiler_specs = generate_qnn_executorch_compiler_spec( - soc_model=QcomChipset.SM8650, + soc_model=soc_model, backend_options=backend_options, is_from_context_binary=True, ) @@ -371,7 +735,7 @@ def generate_qnn_executorch_compiler_spec( debug: bool = False, saver: bool = False, online_prepare: bool = False, - tensor_dump_output_path: str = "", + dump_intermediate_outputs: bool = False, profile: bool = False, shared_buffer: bool = False, is_from_context_binary: bool = False, @@ -393,10 +757,8 @@ def generate_qnn_executorch_compiler_spec( saver: Instead of compiling the model, run QNN Saver. Please check documents of Qualcomm AI Engine Direct SDK. This feature is usually for debugging purpose. - tensor_dump_output_path: If a path is given, Delegate would write - outputs of each OP there in runtime. In ALL cases, - we don't recommend to set this option. This option exist just - for debugging some accuracy issues. + dump_intermediate_outputs: If tensor dump is enabled, all intermediate tensors output will be dumped. + This option exists for debugging accuracy issues profile: Enable profile the performance of per operator. Note that for now only support kProfileDetailed to profile the performance of each operator with cycle unit. @@ -414,6 +776,13 @@ def generate_qnn_executorch_compiler_spec( if soc_model not in _supported_soc_models: raise ValueError(f"unknown SoC model for QNN: {soc_model}") + if profile and dump_intermediate_outputs: + warnings.warn( + "It is not recommended to turn on both profiling and dump_intermediate_outputs the same time" + ", because dump_intermediate_outputs will cause performance drop.", + stacklevel=1, + ) + qnn_executorch_options = QnnExecuTorchOptions( _soc_info_table[soc_model], backend_options ) @@ -424,12 +793,11 @@ def generate_qnn_executorch_compiler_spec( else QnnExecuTorchLogLevel.kLogLevelWarn ) + qnn_executorch_options.dump_intermediate_outputs = dump_intermediate_outputs + if saver: qnn_executorch_options.library_path = "libQnnSaver.so" - if len(tensor_dump_output_path.strip()) != 0: - qnn_executorch_options.tensor_dump_output_path = tensor_dump_output_path - if profile: qnn_executorch_options.profile_level = ( QnnExecuTorchProfileLevel.kProfileDetailed diff --git a/backends/transforms/TARGETS b/backends/transforms/TARGETS index 88de8a84a6..df50e45f09 100644 --- a/backends/transforms/TARGETS +++ b/backends/transforms/TARGETS @@ -73,6 +73,35 @@ runtime.python_library( ], ) +runtime.python_library( + name = "fuse_dequant_linear", + srcs = ["fuse_dequant_linear.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir:sym_util", + "//executorch/exir/dialects:lib", + ], +) + +runtime.python_library( + name = "view_copy_to_squeeze_unsqueeze", + srcs = ["view_copy_to_squeeze_unsqueeze.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir/dialects:lib", + ], +) + runtime.python_library( name = "fuse_view_copy", srcs = ["fuse_view_copy.py"], diff --git a/backends/transforms/addmm_mm_to_linear.py b/backends/transforms/addmm_mm_to_linear.py index 7855de617b..358cbb7ac1 100644 --- a/backends/transforms/addmm_mm_to_linear.py +++ b/backends/transforms/addmm_mm_to_linear.py @@ -130,7 +130,7 @@ def replace_addmm_mm_with_linear(graph: torch.fx.Graph) -> torch.fx.Graph: "call_function", ops.aten.linear.default, 
args ) node.replace_all_uses_with(linear_node) - output_val = linear_node.target( + output_val = linear_node.target( # pyre-fixme[29] args[0].meta["val"], args[1].meta["val"], args[2].meta["val"] ) else: @@ -147,7 +147,7 @@ def replace_addmm_mm_with_linear(graph: torch.fx.Graph) -> torch.fx.Graph: "call_function", ops.aten.linear.default, args ) node.replace_all_uses_with(linear_node) - output_val = linear_node.target( + output_val = linear_node.target( # pyre-fixme[29] args[0].meta["val"], args[1].meta["val"] ) linear_node.meta = node.meta diff --git a/backends/transforms/decompose_sdpa.py b/backends/transforms/decompose_sdpa.py index 6dbbf564f5..329dab96df 100644 --- a/backends/transforms/decompose_sdpa.py +++ b/backends/transforms/decompose_sdpa.py @@ -34,7 +34,7 @@ def call( # refer to pytorch/test/test_decomp.py decomposed_module = make_fx( node.target, - decomposition_table=get_decompositions( + decomposition_table=get_decompositions( # pyre-fixme[6] [ torch.ops.aten._scaled_dot_product_flash_attention_for_cpu.default, ] diff --git a/backends/transforms/fuse_dequant_linear.py b/backends/transforms/fuse_dequant_linear.py new file mode 100644 index 0000000000..235715ac74 --- /dev/null +++ b/backends/transforms/fuse_dequant_linear.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import torch + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class FuseDequantLinearPass(ExportPass): + """ + Fuses weight dequantize_per_channel nodes with linear nodes into + weight_int8pack_mm nodes, for 8-bit weight-only quantization. 
+ + Replaces dq(weight) -> linear(activation, dq) with weight_int8pack_mm + Replaces dq(weight) -> linear(activation, dq, bias) with weight_int8pack_mm -> add + """ + + def fuse_dequant_with_linear( + self, + graph_module: torch.fx.GraphModule, + dequant_node: torch.fx.Node, + linear_node: torch.fx.Node, + ) -> None: + activations = linear_node.args[0] + bias = None + if len(linear_node.args) > 2: + bias = linear_node.args[2] + quant_weight = dequant_node.args[0] + scale = dequant_node.args[1] + + with graph_module.graph.inserting_before(linear_node): + weight_int8pack_mm_node = graph_module.graph.create_node( + "call_function", + exir_ops.edge.aten._weight_int8pack_mm.default, + (activations, quant_weight, scale), + ) + if bias: + add_node = graph_module.graph.create_node( + "call_function", + exir_ops.edge.aten.add.Tensor, + (weight_int8pack_mm_node, bias), + ) + linear_node.replace_all_uses_with(add_node) + else: + linear_node.replace_all_uses_with(weight_int8pack_mm_node) + graph_module.graph.erase_node(linear_node) + graph_module.graph.erase_node(dequant_node) + + def is_node_target( + self, node: torch.fx.Node, target: torch._ops.OperatorBase + ) -> bool: + return node.op == "call_function" and node.target == target + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + for node in graph_module.graph.nodes: + if self.is_node_target(node, exir_ops.edge.aten.linear.default): + weight_node = node.args[1] + if self.is_node_target( + weight_node, + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, + ): + # only fuse if weight tensor is int8 packed + quant_weight = weight_node.args[0] + if quant_weight.meta["val"].dtype != torch.int8: + continue + self.fuse_dequant_with_linear(graph_module, weight_node, node) + + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, True) diff --git a/backends/transforms/view_copy_to_squeeze_unsqueeze.py b/backends/transforms/view_copy_to_squeeze_unsqueeze.py new file mode 100644 index 0000000000..094ec6a334 --- /dev/null +++ b/backends/transforms/view_copy_to_squeeze_unsqueeze.py @@ -0,0 +1,128 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +from typing import List, Optional, Union + +import torch + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ViewCopyToSqueezeUnsqueezePass(ExportPass): + """ + Replaces view_copy nodes with squeeze_copy.dims nodes if the view node reduces dims of size 1. + Replaces view_copy nodes with unsqueeze_copy.default nodes if the view node adds a dim of size 1. 
+ """ + + def __init__(self) -> None: + super().__init__() + self.view_copy_op: torch._ops.OpOverload = exir_ops.edge.aten.view_copy.default + self.squeeze_op: torch._ops.OpOverload = exir_ops.edge.aten.squeeze_copy.dims + self.unsqueeze_op: torch._ops.OpOverload = ( + exir_ops.edge.aten.unsqueeze_copy.default + ) + + def is_node_target( + self, node: torch.fx.Node, target: torch._ops.OperatorBase + ) -> bool: + return node.op == "call_function" and node.target == target + + def find_squeeze_dims( + self, + input_shape: List[int], + view_shape: List[int], + ) -> Optional[List[int]]: + # view_shape should be a subset of input_shape + if len(input_shape) <= len(view_shape): + return None + + # check that all dims are equal except the removed dims + i = 0 + j = 0 + idx = [] + while i < len(input_shape): + if input_shape[i] != view_shape[j]: + if input_shape[i] == 1: + idx.append(i) + j -= 1 + # continue to check remaining dims are equal + else: + return None + i += 1 + j += 1 + return idx + + def find_unsqueeze_dim( + self, + input_shape: List[int], + view_shape: List[int], + ) -> Optional[int]: + # unsqueeze should increase the length of input_shape by 1 + if len(view_shape) - len(input_shape) != 1: + return None + + # check that all dims are equal except the added dim + i = 0 + j = 0 + idx = -1 + while j < len(view_shape): + if input_shape[i] != view_shape[j]: + if view_shape[j] == 1: + idx = j + i -= 1 + # continue to check remaining dims are equal + else: + return None + i += 1 + j += 1 + return idx + + def replace_view_copy_node( + self, + graph_module: torch.fx.GraphModule, + view_node: torch.fx.Node, + op: torch._ops.OpOverload, + arg: Union[List[int], int], + ) -> None: + with graph_module.graph.inserting_before(view_node): + new_node = graph_module.graph.create_node( + "call_function", + op, + (view_node.args[0], arg), + ) + new_node.meta = view_node.meta + view_node.replace_all_uses_with(new_node) + graph_module.graph.erase_node(view_node) + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + modified = False + for node in graph_module.graph.nodes: + if self.is_node_target(node, self.view_copy_op): + input_node = node.args[0] + input_shape = input_node.meta["val"].shape + view_shape = node.args[1] + squeeze_dims = self.find_squeeze_dims(input_shape, view_shape) + if squeeze_dims: + self.replace_view_copy_node( + graph_module, node, self.squeeze_op, squeeze_dims + ) + modified = True + continue + unsqueeze_dim = self.find_unsqueeze_dim(input_shape, view_shape) + if unsqueeze_dim: + self.replace_view_copy_node( + graph_module, node, self.unsqueeze_op, unsqueeze_dim + ) + modified = True + continue + + if modified: + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + return PassResult(graph_module, modified) diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index 49dc27056a..b44736d20d 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -50,8 +50,8 @@ function(gen_vulkan_shader_lib_cpp shaders_path) execute_process( COMMAND "${PYTHON_EXECUTABLE}" - ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py - --glsl-path ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} + ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py --glsl-path + ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} --glslc-path=${GLSLC_PATH} --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH} --env ${VULKAN_GEN_ARG_ENV} RESULT_VARIABLE 
error_code diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index aaff7a7a72..8570859ed3 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -94,8 +94,9 @@ binary using the Android NDK toolchain. cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_VULKAN=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DPYTHON_EXECUTABLE=python \ diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py index 26436a0eb9..ca7ce72cae 100644 --- a/backends/vulkan/partitioner/supported_ops.py +++ b/backends/vulkan/partitioner/supported_ops.py @@ -8,6 +8,11 @@ import operator +from executorch.backends.vulkan.passes.custom_ops_defs import ( # noqa + conv_with_clamp_op, + grid_priors_op, +) + from executorch.exir.dialects._ops import ops as exir_ops @@ -82,6 +87,7 @@ def __contains__(self, op): CONVOLUTION_OPS = [ exir_ops.edge.aten.convolution.default, + exir_ops.edge.et_vk.conv_with_clamp.default, ] REDUCTION_OPS = [ @@ -129,6 +135,7 @@ def __contains__(self, op): exir_ops.edge.aten.upsample_nearest2d.vec, exir_ops.edge.aten.zeros.default, exir_ops.edge.aten.zeros_like.default, + exir_ops.edge.et_vk.grid_priors.default, ] diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index 4d24877b63..103297bc75 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -38,6 +38,9 @@ torch.ops.aten.upsample_nearest2d.vec, ] +logger: logging.Logger = logging.getLogger("") +logger.setLevel(logging.INFO) + class VulkanSupportedOperators(OperatorSupportBase): _ops: OpList = enumerate_supported_ops() @@ -110,7 +113,7 @@ def is_node_supported( ) -> bool: r = self._is_node_supported(submodules, node) if not r and node.op == "call_function": - logging.info(f"Skipping node in Vulkan partitioning: {node.format_node()}") + logger.info(f"Skipping node in Vulkan partitioning: {node.format_node()}") return r def _is_node_supported( @@ -179,9 +182,9 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: pl = len(partition_list) if pl == 0: - logging.warning("No Vulkan subgraphs can be partitioned!") + logger.warning("No Vulkan subgraphs can be partitioned!") else: - logging.info(f"Found {pl} Vulkan subgraphs to be partitioned.") + logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.") tag_constant_data(exported_program) diff --git a/backends/vulkan/passes/custom_ops_defs.py b/backends/vulkan/passes/custom_ops_defs.py index 67e7db828a..fd586b665a 100644 --- a/backends/vulkan/passes/custom_ops_defs.py +++ b/backends/vulkan/passes/custom_ops_defs.py @@ -48,15 +48,55 @@ def conv_with_clamp_impl( conv_with_clamp_op = getattr(getattr(torch.ops, namespace), name) +def conv_with_clamp_out_impl( + input, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + transposed=False, + output_padding=0, + groups=1, + output_min=-float("inf"), + output_max=float("inf"), + out=None, +): + out = conv_with_clamp_impl( + input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + output_min, + output_max, + ) + return out + + +name = 
"conv_with_clamp.out" +lib.define( + f"{name}(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, Scalar? output_min, Scalar? output_max, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.impl(name, conv_with_clamp_out_impl, "CompositeExplicitAutograd") + + +# The dimension of x should be larger than 1 def grid_priors_impl( - height, - width, + x, stride, offset, ): - shift_x = (torch.arange(0, width) + offset) * stride - shift_y = (torch.arange(0, height) + offset) * stride - shift_xx, shift_yy = torch.meshgrid(shift_y, shift_x) + height, width = x.shape[-2:] + # Need to specify device of torch.arange to avoid executorch exporting error + shift_x = (torch.arange(0, width, device=x.device) + offset) * stride + shift_y = (torch.arange(0, height, device=x.device) + offset) * stride + # Need to specify indexing parameter ('ij' is the default value) to avoid executorch exporting error + shift_xx, shift_yy = torch.meshgrid([shift_y, shift_x], indexing="ij") shift_xx = shift_xx.reshape(-1) shift_yy = shift_yy.reshape(-1) shifts = torch.stack((shift_yy, shift_xx), dim=-1) @@ -64,6 +104,24 @@ def grid_priors_impl( name = "grid_priors" -lib.define(f"{name}(int height, int width, int stride, float offset) -> Tensor") -lib.impl(name, grid_priors_impl) +lib.define(f"{name}(Tensor self, int stride, float offset) -> Tensor") +lib.impl(name, grid_priors_impl, "CompositeExplicitAutograd") grid_priors_op = getattr(getattr(torch.ops, namespace), name) + + +# When lowering to executorch, ops are converted from default to out variant. Hence, custom ops define both variants. +def grid_priors_out_impl( + x, + stride, + offset, + out, +): + out = grid_priors_impl(x, stride, offset) + return out + + +name = "grid_priors_out" +lib.define( + f"{name}(Tensor self, int stride, float offset, *, Tensor(a!) 
out) -> Tensor(a!)" +) +lib.impl(name, grid_priors_out_impl, "CompositeExplicitAutograd") diff --git a/backends/vulkan/passes/test_custom_ops.py b/backends/vulkan/passes/test_custom_ops.py index a1a3a40f67..c68dd6d679 100644 --- a/backends/vulkan/passes/test_custom_ops.py +++ b/backends/vulkan/passes/test_custom_ops.py @@ -97,14 +97,15 @@ class GridPriors(torch.nn.Module): def __init__(self): super().__init__() - def forward(self, height, width, stride, offset): - return torch.ops.et_vk.grid_priors(height, width, stride, offset) + def forward(self, x, stride, offset): + return torch.ops.et_vk.grid_priors(x, stride, offset) model = GridPriors() - sample_input = (2, 3, 4, 0.5) + sample_input = (torch.rand(2, 5, 2, 3), 4, 0.5) custom_out = model(*sample_input) - def calculate_expected_output(height, width, stride, offset): + def calculate_expected_output(x, stride, offset): + height, width = x.shape[-2:] shift_x = (torch.arange(0, width) + offset) * stride shift_y = (torch.arange(0, height) + offset) * stride shift_xx, shift_yy = torch.meshgrid(shift_y, shift_x) diff --git a/backends/vulkan/quantizer/TARGETS b/backends/vulkan/quantizer/TARGETS new file mode 100644 index 0000000000..7cc5b79eb2 --- /dev/null +++ b/backends/vulkan/quantizer/TARGETS @@ -0,0 +1,13 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +oncall("executorch") + +python_library( + name = "vulkan_quantizer", + srcs = [ + "vulkan_quantizer.py", + ], + deps = [ + "//caffe2:torch", + ], +) diff --git a/backends/vulkan/quantizer/vulkan_quantizer.py b/backends/vulkan/quantizer/vulkan_quantizer.py new file mode 100644 index 0000000000..451f18977e --- /dev/null +++ b/backends/vulkan/quantizer/vulkan_quantizer.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
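[Editorial aside, not part of the patch] Since grid_priors now takes a tensor and infers height/width from its last two dimensions, a small eager-mode reference (a sketch mirroring grid_priors_impl above, not the registered et_vk op itself) illustrates the output layout.

```python
# Eager-mode sketch mirroring grid_priors_impl above: the op returns one
# (x_offset, y_offset) pair per spatial location of the input's H x W grid.
import torch


def grid_priors_reference(x: torch.Tensor, stride: int, offset: float) -> torch.Tensor:
    height, width = x.shape[-2:]
    shift_x = (torch.arange(0, width) + offset) * stride
    shift_y = (torch.arange(0, height) + offset) * stride
    ys, xs = torch.meshgrid([shift_y, shift_x], indexing="ij")
    # Matches the (shift_yy, shift_xx) stacking order used in the impl above.
    return torch.stack((xs.reshape(-1), ys.reshape(-1)), dim=-1)


x = torch.rand(1, 1, 2, 3)  # only the trailing H=2, W=3 matter
print(grid_priors_reference(x, stride=4, offset=0.5))
# tensor([[ 2.,  2.],
#         [ 6.,  2.],
#         [10.,  2.],
#         [ 2.,  6.],
#         [ 6.,  6.],
#         [10.,  6.]])
```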
+ +# pyre-strict + +from __future__ import annotations + +import functools +from typing import Any, Callable, Dict, Optional + +import torch +from torch.ao.quantization.observer import MinMaxObserver, PerChannelMinMaxObserver +from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor +from torch.ao.quantization.quantizer import QuantizationSpec, Quantizer +from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import ( + _convert_scalars_to_attrs, + OP_TO_ANNOTATOR, + propagate_annotation, + QuantizationConfig, +) +from torch.fx import Node + + +__all__ = [ + "VulkanQuantizer", + "get_weight_quantization_config", +] + + +@functools.lru_cache +def get_weight_quantization_config( + is_per_channel: bool = True, + weight_qmin: int = -128, + weight_qmax: int = 127, +) -> QuantizationConfig: + + weight_qscheme = ( + torch.per_channel_symmetric if is_per_channel else torch.per_tensor_symmetric + ) + weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = ( + PerChannelMinMaxObserver if is_per_channel else MinMaxObserver + ) + extra_args: Dict[str, Any] = {"eps": 2**-12} + + weight_quantization_spec = QuantizationSpec( + dtype=torch.int8, + quant_min=weight_qmin, + quant_max=weight_qmax, + qscheme=weight_qscheme, + ch_axis=0, + is_dynamic=False, + observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args( + **extra_args + ), + ) + + quantization_config = QuantizationConfig( + input_activation=None, + output_activation=None, + weight=weight_quantization_spec, + bias=None, + is_qat=False, + ) + return quantization_config + + +_SUPPORTED_OPS = [ + "linear", +] + + +class VulkanQuantizer(Quantizer): + + def __init__(self) -> None: + super().__init__() + self.global_config: Optional[QuantizationConfig] = None + + def set_global(self, quantization_config: QuantizationConfig) -> VulkanQuantizer: + self.global_config = quantization_config + return self + + def transform_for_annotation( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + """Transforms scalar values to tensor attributes""" + return _convert_scalars_to_attrs(model) + + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + # currently only support static quant on Vulkan + model = self._annotate_for_static_quantization_config(model) + propagate_annotation(model) + return model + + def _annotate_all_static_patterns( + self, + model: torch.fx.GraphModule, + quantization_config: Optional[QuantizationConfig], + filter_fn: Optional[Callable[[Node], bool]] = None, + ) -> torch.fx.GraphModule: + if quantization_config is None: + return model + + for op in _SUPPORTED_OPS: + OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn) + return model + + def _annotate_for_static_quantization_config( + self, model: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + self._annotate_all_static_patterns( + model, + self.global_config, + ) + return model + + def validate(self, model: torch.fx.GraphModule) -> None: + pass diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index bb4d3e6603..7ed9469f77 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -412,7 +412,7 @@ void maybe_resize_output( // VulkanBackend class // -class VulkanBackend final : public PyTorchBackendInterface { +class VulkanBackend final : public ::executorch::runtime::BackendInterface { public: ~VulkanBackend() override = default; @@ -421,7 +421,7 @@ class VulkanBackend final : public 
PyTorchBackendInterface { return true; } - __ET_NODISCARD Error + ET_NODISCARD Error compileModel(const void* buffer_pointer, ComputeGraph* compute_graph) const { Result header = VulkanDelegateHeader::parse(buffer_pointer); @@ -485,7 +485,7 @@ class VulkanBackend final : public PyTorchBackendInterface { } Error execute( - __ET_UNUSED BackendExecutionContext& context, + ET_UNUSED BackendExecutionContext& context, DelegateHandle* handle, EValue** args) const override { EXECUTORCH_SCOPE_PROF("VulkanBackend::execute"); diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index b94e242df6..4d2a854de3 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -95,9 +95,9 @@ vkapi::DescriptorSet Context::get_descriptor_set( pipeline_layout_cache().retrieve(shader_layout); vkapi::SpecVarList spec_constants = { - SV(local_workgroup_size.data[0u]), - SV(local_workgroup_size.data[1u]), - SV(local_workgroup_size.data[2u])}; + SV(local_workgroup_size[0u]), + SV(local_workgroup_size[1u]), + SV(local_workgroup_size[2u])}; spec_constants.append(additional_constants); @@ -119,11 +119,11 @@ void Context::register_shader_dispatch( const utils::uvec3& global_workgroup_size) { // Adjust the global workgroup size based on the output tile size uint32_t global_wg_w = utils::div_up( - global_workgroup_size.data[0u], shader_descriptor.out_tile_size.data[0u]); + global_workgroup_size[0u], shader_descriptor.out_tile_size[0u]); uint32_t global_wg_h = utils::div_up( - global_workgroup_size.data[1u], shader_descriptor.out_tile_size.data[1u]); + global_workgroup_size[1u], shader_descriptor.out_tile_size[1u]); uint32_t global_wg_d = utils::div_up( - global_workgroup_size.data[2u], shader_descriptor.out_tile_size.data[2u]); + global_workgroup_size[2u], shader_descriptor.out_tile_size[2u]); // Submitting a global work group size of 0 is undefined behaviour. If this is // detected then submit a single workgroup instead. diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h index de77c57fb0..0f496a4af8 100644 --- a/backends/vulkan/runtime/api/api.h +++ b/backends/vulkan/runtime/api/api.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h new file mode 100644 index 0000000000..6f67ae8a64 --- /dev/null +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName + +#include + +#include + +#include + +namespace vkcompute { +namespace api { + +class StagingBuffer final { + private: + Context* context_p_; + vkapi::ScalarType dtype_; + size_t numel_; + size_t nbytes_; + vkapi::VulkanBuffer vulkan_buffer_; + + void* mapped_data_; + + public: + StagingBuffer( + Context* context_p, + const vkapi::ScalarType dtype, + const size_t numel) + : context_p_(context_p), + dtype_(dtype), + numel_(numel), + nbytes_(element_size(dtype_) * numel_), + vulkan_buffer_( + context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)), + mapped_data_(nullptr) {} + + StagingBuffer(const StagingBuffer&) = delete; + StagingBuffer& operator=(const StagingBuffer&) = delete; + + StagingBuffer(StagingBuffer&&) = default; + StagingBuffer& operator=(StagingBuffer&&) = default; + + ~StagingBuffer() { + context_p_->register_buffer_cleanup(vulkan_buffer_); + } + + inline vkapi::ScalarType dtype() { + return dtype_; + } + + inline vkapi::VulkanBuffer& buffer() { + return vulkan_buffer_; + } + + inline void* data() { + if (!mapped_data_) { + mapped_data_ = vulkan_buffer_.allocation_info().pMappedData; + } + return mapped_data_; + } + + inline size_t numel() { + return numel_; + } + + inline size_t nbytes() { + return nbytes_; + } + + inline void copy_from(const void* src, const size_t nbytes) { + VK_CHECK_COND(nbytes <= nbytes_); + memcpy(data(), src, nbytes); + vmaFlushAllocation( + vulkan_buffer_.vma_allocator(), + vulkan_buffer_.allocation(), + 0u, + VK_WHOLE_SIZE); + } + + inline void copy_to(void* dst, const size_t nbytes) { + VK_CHECK_COND(nbytes <= nbytes_); + vmaInvalidateAllocation( + vulkan_buffer_.vma_allocator(), + vulkan_buffer_.allocation(), + 0u, + VK_WHOLE_SIZE); + memcpy(dst, data(), nbytes); + } + + inline void set_staging_zeros() { + memset(data(), 0, nbytes_); + } +}; + +} // namespace api +} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/StorageBuffer.h b/backends/vulkan/runtime/api/containers/StorageBuffer.h deleted file mode 100644 index 17c3470605..0000000000 --- a/backends/vulkan/runtime/api/containers/StorageBuffer.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -namespace vkcompute { -namespace api { - -class StorageBuffer final { - private: - Context* context_p_; - vkapi::ScalarType dtype_; - size_t numel_; - size_t nbytes_; - vkapi::VulkanBuffer vulkan_buffer_; - - public: - StorageBuffer( - Context* context_p, - const vkapi::ScalarType dtype, - const size_t numel, - const bool gpuonly = false) - : context_p_(context_p), - dtype_(dtype), - numel_(numel), - nbytes_(element_size(dtype_) * numel_), - vulkan_buffer_(context_p_->adapter_ptr()->vma().create_storage_buffer( - nbytes_, - gpuonly)) {} - - StorageBuffer(const StorageBuffer&) = delete; - StorageBuffer& operator=(const StorageBuffer&) = delete; - - StorageBuffer(StorageBuffer&&) = default; - StorageBuffer& operator=(StorageBuffer&&) = default; - - ~StorageBuffer() { - context_p_->register_buffer_cleanup(vulkan_buffer_); - } - - inline vkapi::ScalarType dtype() { - return dtype_; - } - - inline vkapi::VulkanBuffer& buffer() { - return vulkan_buffer_; - } - - inline size_t numel() { - return numel_; - } - - inline size_t nbytes() { - return nbytes_; - } -}; - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 85656f791a..fb93c7a03b 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -13,36 +13,140 @@ namespace vkcompute { namespace api { +/* + * Given the strides of a buffer-backed tensor, estimate the equivalent memory + * layout enum value by identifying the fastest moving dimension. + */ +utils::GPUMemoryLayout estimate_memory_layout( + const std::vector& dim_order) { + int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back(); + if (fastest_dim_whcn >= 0 && fastest_dim_whcn < 3) { + return utils::GPUMemoryLayout(fastest_dim_whcn); + } + + // TODO(ssjia) find a way to gracefully recover from this case by i.e. adding + // a UNKOWN GPUMemoryLayout. This is not high priority though because we don't + // expect this to ever come up in practice. + VK_THROW("No compatible GPUMemoryLayout value"); +} + +std::vector calculate_dim_order( + const size_t ndim, + const utils::GPUMemoryLayout memory_layout) { + // Special case for zero dim tensors + if (ndim == 0) { + return {0}; + } + std::vector dim_order(ndim); + int64_t last_dim = + ndim - utils::to_packed_dim_nchw_offset(memory_layout); + + int64_t cur_dim = 0; + for (int d = 0; d < ndim; ++d) { + if (d == last_dim) { + cur_dim++; + } + dim_order[d] = cur_dim; + cur_dim++; + } + if (last_dim >= 0) { + dim_order[ndim - 1] = last_dim; + } + + return dim_order; +} + std::vector calculate_strides( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const bool texel_strides) { - const int64_t dim_offset = - utils::to_packed_dim_nchw_offset(memory_layout); - const int64_t last_dim = sizes.size() - dim_offset; - VK_CHECK_COND(last_dim >= 0); + const std::vector& dim_order) { + // For zero dim tensors + if (sizes.size() == 0) { + return {1}; + } size_t ndim = sizes.size(); std::vector strides(ndim); - const int64_t last_dim_size = - texel_strides ? 
utils::div_up_4(sizes.at(last_dim)) : sizes.at(last_dim); - - for (int stride_d = ndim - 1; stride_d >= 0; stride_d--) { - strides.at(stride_d) = 1; - if (stride_d == last_dim) { - continue; - } - strides.at(stride_d) = last_dim_size; - for (int size_d = ndim - 1; size_d > stride_d; size_d--) { - if (size_d != last_dim) { - strides.at(stride_d) *= sizes.at(size_d); - } + strides[dim_order[ndim - 1]] = 1; + for (int32_t i = ndim - 2; i >= 0; --i) { + if (sizes[dim_order[i + 1]] == 0) { + strides[dim_order[i]] = strides[dim_order[i + 1]]; + } else { + strides[dim_order[i]] = + strides[dim_order[i + 1]] * sizes[dim_order[i + 1]]; } } + return strides; } +/* + * Axis mapping is somewhat analogous to strides for texture backed tensors. + * + * The axis mapping is normalized to 4 dimensions, similar to the padded sizes. + * The first 3 values of the axis mapping indicate the (X,Y,Z) image texture + * axis that corresponds to the width, height, and channels dimension of the + * tensor. Thus the axis mapping can be considered to be in WHCN dimension + * order. + * + * The last value `axis_map.at(3)` indicates the WHCN index of the tensor + * dimension along which batches will be concatenated. This dimension can be + * referred to as the "inner dimension" To determine which image texture axis is + * used for the concatenation, a double lookup will need to be performed + * (axis_map.at(axis_map.at(3))). + * + * The reason for strucuring axis mapping this way is because for the batch dim, + * two things need to be easily derived: + * + * 1. The dim idx of the inner dimension, so that the size of the inner + * dimension can be easily determined. + * 2. The texture axis used to concatenate batches + * + * By storing the dim index of the inner dimension instead of the texture axis + * it maps to, both pieces of information are readily available. + * + * The axis mapping allows for permuted views of texture-backed tensors. + */ +std::vector default_axis_map() { + // Currently, all compute shaders have an assumption that the channels dim is + // used to combine with the batch dim of a tensor. However, once dim mapping + // is integrated into the tensor indexing logic for each compute shader, we + // can be more flexible with mapping the batch dim to different texture axes + // in order to improve performance or memory footprint. + return {0, 1, 2, 2}; +} + +bool dim_order_is_valid(const std::vector& dim_order) { + int64_t sum = 0; + for (size_t i = 0; i < dim_order.size(); ++i) { + if (dim_order[i] < 0 || dim_order[i] >= dim_order.size()) { + return false; + } + sum += dim_order[i]; + } + int64_t n = static_cast(dim_order.size() - 1); + // Sanity check that the sum of the indices in the vector is equal to the sum + // of 0 + 1 + 2 + ... 
+ (ndim - 1) + return sum == n * (n + 1) / 2; +} + +std::vector unsqueeze_strides( + const std::vector& strides, + const int64_t numel) { + const size_t ndim = strides.size(); + const size_t ndim_up4 = utils::align_up_4(strides.size()); + std::vector unsqueezed_strides(ndim_up4); + for (int32_t i = 1; i <= ndim; ++i) { + int64_t dim_stride = strides.at(ndim - i); + unsqueezed_strides.at(ndim_up4 - i) = dim_stride; + } + + for (int32_t i = ndim + 1; i <= ndim_up4; ++i) { + unsqueezed_strides.at(ndim_up4 - i) = numel; + } + return unsqueezed_strides; +} + std::vector calculate_padded_sizes( const std::vector& sizes, const utils::GPUMemoryLayout memory_layout) { @@ -69,230 +173,44 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const std::vector& axis_map, const utils::GPUMemoryLayout memory_layout) { VK_CHECK_COND(padded_sizes.size() == 4); + VK_CHECK_COND(axis_map.size() == 4); + + utils::uvec3 extents({1, 1, 1}); + // First three elements of axis_map indicate which (X,Y,Z) image axis the + // width, height, and channels dim of the tensor maps to. + for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { + const int64_t axis = axis_map.at(whcn_dim); + const int64_t dim = padded_sizes.size() - 1 - whcn_dim; + extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); + } - uint32_t N = utils::safe_downcast(padded_sizes.at(0)); - uint32_t C = utils::safe_downcast(padded_sizes.at(1)); - uint32_t H = utils::safe_downcast(padded_sizes.at(2)); - uint32_t W = utils::safe_downcast(padded_sizes.at(3)); + // axis_map[3] indicates the WHCN index of the dimension used for batch + // concatenation. Thus a double lookup is required to determine the image axis + // used for batch concatenation. + const int64_t concatted_whcn_dim = axis_map.at(3); + const int64_t batch_axis = axis_map.at(concatted_whcn_dim); + // Multiply the extents of the batch axis by the batch size. 
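To make the axis-map based extent calculation assembled here concrete, below is a small standalone sketch (illustrative only, not part of this change); the helper name `image_extents_sketch` is invented, and it assumes the default axis map {0, 1, 2, 2}, i.e. W maps to X, H to Y, C to Z, with batches concatenated along the channels dim.

```cpp
// Standalone sketch of the axis-map based extent calculation (illustrative).
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

std::array<uint32_t, 3> image_extents_sketch(
    const std::vector<int64_t>& padded_sizes, // NCHW, already padded to 4 dims
    const std::array<int64_t, 4>& axis_map,
    const int packed_whcn_dim) { // 0 = width, 1 = height, 2 = channels
  std::array<uint32_t, 3> extents = {1, 1, 1};
  for (int whcn = 0; whcn < 3; ++whcn) {
    extents[axis_map[whcn]] =
        static_cast<uint32_t>(padded_sizes[padded_sizes.size() - 1 - whcn]);
  }
  // Batches are concatenated along the image axis that the "inner dim" maps to.
  extents[axis_map[axis_map[3]]] *= static_cast<uint32_t>(padded_sizes[0]);
  // Four elements of the packed dim are stored per texel.
  extents[axis_map[packed_whcn_dim]] /= 4;
  return extents;
}

int main() {
  // Padded sizes {N=2, C=4, H=6, W=8}, channels packed.
  const auto e = image_extents_sketch({2, 4, 6, 8}, {0, 1, 2, 2}, 2);
  std::cout << e[0] << " " << e[1] << " " << e[2] << "\n"; // prints: 8 6 2
  return 0;
}
```

With these inputs the sketch reproduces the {W, H, (C / 4) * N} = {8, 6, 2} result of the formula being replaced, while leaving room for non-default axis maps.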
+ extents[batch_axis] *= padded_sizes.at(0); switch (memory_layout) { case utils::kWidthPacked: - VK_CHECK_COND(W % 4 == 0); - W /= 4; + VK_CHECK_COND(extents[axis_map.at(0)] % 4 == 0); + extents[axis_map.at(0)] /= 4; break; case utils::kHeightPacked: - VK_CHECK_COND(H % 4 == 0); - H /= 4; + VK_CHECK_COND(extents[axis_map.at(1)] % 4 == 0); + extents[axis_map.at(1)] /= 4; break; case utils::kChannelsPacked: - VK_CHECK_COND(C % 4 == 0); - C /= 4; - break; - } - - return {W, H, C * N}; -} - -// -// vTensor -// - -vTensor::vTensor( - Context* const context, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const bool allocate_memory) - : dtype_(dtype), - memory_layout_(memory_layout), - // Calculate sizes and strides - sizes_(sizes.begin(), sizes.end()), - padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, - texture_limits_{{0, 0, 0}}, - // Utility Uniform Buffers that can be passed to shaders as arguments - sizes_uniform_(), - texture_limits_uniform_(), - texel_strides_uniform_(), - ntexels_uniform_(), - // Construct Tensor storage - storage_( - context, - storage_type, - memory_layout_, - padded_sizes_, - dtype_, - allocate_memory) { - if (storage_type != utils::kBuffer) { - texture_limits_.limits = utils::ivec3{ - utils::safe_downcast(storage_.image_extents_.data[0]), - utils::safe_downcast(storage_.image_extents_.data[1]), - utils::safe_downcast(storage_.image_extents_.data[2])}; - } - - if (dtype == vkapi::kHalf) { - VK_CHECK_COND( - api::context()->adapter_ptr()->has_16bit_storage(), - "Half dtype is only available if the physical device supports float16 " - "storage buffers!"); - } -} - -vkapi::VulkanImage& vTensor::image( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage) & { - storage_.transition(pipeline_barrier, stage, vkapi::MemoryAccessType::READ); - return storage_.image_; -} - -vkapi::VulkanImage& vTensor::image( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage, - const vkapi::MemoryAccessFlags access) & { - storage_.transition(pipeline_barrier, stage, access); - return storage_.image_; -} - -vkapi::VulkanBuffer& vTensor::buffer( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage) & { - storage_.transition(pipeline_barrier, stage, vkapi::MemoryAccessType::READ); - return storage_.buffer_; -} - -vkapi::VulkanBuffer& vTensor::buffer( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage, - const vkapi::MemoryAccessFlags access) & { - storage_.transition(pipeline_barrier, stage, access); - return storage_.buffer_; -} - -const vkapi::BufferBindInfo vTensor::sizes_ubo() { - if (!sizes_uniform_.buffer()) { - sizes_uniform_ = - ParamsBuffer(storage_.context_, utils::make_whcn_ivec4(sizes_)); - } - return vkapi::BufferBindInfo(sizes_uniform_.buffer()); -} - -const vkapi::BufferBindInfo vTensor::texture_limits_ubo() { - if (!texture_limits_uniform_.buffer()) { - texture_limits_uniform_ = ParamsBuffer(storage_.context_, texture_limits_); - } - return vkapi::BufferBindInfo(texture_limits_uniform_.buffer()); -} - -const vkapi::BufferBindInfo vTensor::texel_strides_ubo() { - if (!texel_strides_uniform_.buffer()) { - texel_strides_uniform_ = ParamsBuffer( - storage_.context_, - utils::make_whcn_ivec4( - calculate_strides(padded_sizes_, memory_layout_))); - } - return vkapi::BufferBindInfo(texel_strides_uniform_.buffer()); -} - -const 
vkapi::BufferBindInfo vTensor::ntexels_ubo() { - if (!ntexels_uniform_.buffer()) { - ntexels_uniform_ = ParamsBuffer(storage_.context_, texel_numel()); - } - return vkapi::BufferBindInfo(ntexels_uniform_.buffer()); -} - -VmaAllocationCreateInfo vTensor::get_allocation_create_info() const { - switch (storage_type()) { - case utils::kBuffer: - return storage_.buffer_.allocation_create_info(); - case utils::kTexture2D: - case utils::kTexture3D: - return storage_.image_.allocation_create_info(); - } - return {}; -} - -VkMemoryRequirements vTensor::get_memory_requirements() const { - switch (storage_type()) { - case utils::kBuffer: - return storage_.buffer_.get_memory_requirements(); - case utils::kTexture2D: - case utils::kTexture3D: - return storage_.image_.get_memory_requirements(); - } - return {}; -} - -void vTensor::bind_allocation(const vkapi::Allocation& allocation) { - switch (storage_type()) { - case utils::kBuffer: - storage_.buffer_.bind_allocation(allocation); + VK_CHECK_COND(extents[axis_map.at(2)] % 4 == 0); + extents[axis_map.at(2)] /= 4; break; - case utils::kTexture2D: - case utils::kTexture3D: - storage_.image_.bind_allocation(allocation); - break; - } -} - -void vTensor::update_size_metadata(const std::vector& new_sizes) { - sizes_ = new_sizes; - padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_); - - // Calculate the extents of the image texture that would have been required - // for a tensor of the new sizes. - utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); - - // Update the texture limits to reflect the new virtual extents. - texture_limits_.limits = utils::ivec3{ - utils::safe_downcast(virtual_extents.data[0]), - utils::safe_downcast(virtual_extents.data[1]), - utils::safe_downcast(virtual_extents.data[2])}; - - if (sizes_uniform_.buffer()) { - sizes_uniform_.update(utils::make_whcn_ivec4(sizes_)); - } - if (texture_limits_uniform_.buffer()) { - texture_limits_uniform_.update(texture_limits_); - } - if (texel_strides_uniform_.buffer()) { - texel_strides_uniform_.update(utils::make_whcn_ivec4( - calculate_strides(padded_sizes_, memory_layout_))); - } - if (ntexels_uniform_.buffer()) { - ntexels_uniform_.update(texel_numel()); - } -} - -void vTensor::reallocate(const std::vector& new_sizes) { - update_size_metadata(new_sizes); - storage_.discard_and_reallocate( - calculate_padded_sizes(new_sizes, memory_layout_), - memory_layout_, - dtype_); -} - -void vTensor::virtual_resize(const std::vector& new_sizes) { - if (storage_type() != utils::kBuffer) { - // For texture storage check that the current texture is large enough for - // the new sizes of the tensor. 
- utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); - - bool valid_resize = virtual_extents.data[0] <= image_extents().data[0]; - valid_resize = - valid_resize && virtual_extents.data[1] <= image_extents().data[1]; - valid_resize = - valid_resize && virtual_extents.data[2] <= image_extents().data[2]; - - VK_CHECK_COND( - valid_resize, - "Cannot use virtual resize if new sizes requires a larger texture."); } - update_size_metadata(new_sizes); + return extents; } // @@ -361,20 +279,23 @@ vkapi::VulkanBuffer allocate_buffer( } return adapter_ptr->vma().create_storage_buffer( - element_size(dtype) * numel, /*gpu_only = */ true, allocate_memory); + element_size(dtype) * numel, allocate_memory); } vTensorStorage::vTensorStorage( Context* const context, const utils::StorageType storage_type, const utils::GPUMemoryLayout gpu_memory_layout, + const std::vector& axis_map, const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory) : context_(context), storage_type_{storage_type}, - image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)), + image_extents_( + calculate_image_extents(padded_sizes, axis_map, gpu_memory_layout)), buffer_length_{utils::multiply_integers(padded_sizes)}, + buffer_offset_{0}, image_(allocate_image( context_, image_extents_, @@ -389,6 +310,18 @@ vTensorStorage::vTensorStorage( allocate_memory)), last_access_{} {} +vTensorStorage::vTensorStorage( + const vTensorStorage& other, + const int64_t buffer_offset) + : context_(other.context_), + storage_type_{other.storage_type_}, + image_extents_(other.image_extents_), + buffer_length_{other.buffer_length_}, + buffer_offset_{buffer_offset}, + image_(other.image_), + buffer_(other.buffer_, buffer_offset), + last_access_{other.last_access_} {} + vTensorStorage::~vTensorStorage() { flush(); } @@ -456,26 +389,361 @@ void vTensorStorage::transition( last_access_.access = cur_access; } -void vTensorStorage::discard_and_reallocate( - const std::vector& padded_sizes, - const utils::GPUMemoryLayout gpu_memory_layout, - const vkapi::ScalarType dtype) { - const bool image_owns_memory = image_.owns_memory(); - const bool buffer_owns_memory = buffer_.owns_memory(); +bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { + if (storage_type_ == utils::kBuffer) { + return buffer_.is_copy_of(other.buffer_); + } + return image_.is_copy_of(other.image_); +} - flush(); +// +// vTensor +// + +vTensor::vTensor( + Context* const context, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, + const bool allocate_memory) + : dtype_(dtype), + memory_layout_(memory_layout), + // Calculate tensor metadata + sizes_(sizes.begin(), sizes.end()), + dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)), + axis_map_(default_axis_map()), + strides_(calculate_strides(sizes, dim_order_)), + numel_(utils::multiply_integers(sizes_)), + padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, + unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, + padded_numel_(utils::multiply_integers(padded_sizes_)), + logical_limits_{{0, 0, 0}}, + // Utility Uniform Buffers that can be passed to shaders as arguments + sizes_uniform_(), + strides_uniform_(), + numel_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), + // Construct Tensor storage + storage_( + context, + storage_type, + memory_layout_, + axis_map_, + padded_sizes_, + dtype_, + allocate_memory) { 
+ VK_CHECK_COND( + dim_order_is_valid(dim_order_), "computed dim order is invalid"); + + if (storage_type != utils::kBuffer) { + set_logical_limits(storage_.image_extents_); + } + + if (dtype == vkapi::kHalf) { + VK_CHECK_COND( + api::context()->adapter_ptr()->has_16bit_storage(), + "Half dtype is only available if the physical device supports float16 " + "storage buffers!"); + } +} + +vTensor::vTensor(const vTensor& other) + : dtype_(other.dtype_), + memory_layout_(other.memory_layout_), + // Copy tensor size metadata + sizes_(other.sizes_.begin(), other.sizes_.end()), + dim_order_(other.dim_order_.begin(), other.dim_order_.end()), + axis_map_(other.axis_map_.begin(), other.axis_map_.end()), + strides_(other.strides_.begin(), other.strides_.end()), + numel_(other.numel_), + padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, + unsqueezed_strides_{ + other.unsqueezed_strides_.begin(), + other.unsqueezed_strides_.end()}, + padded_numel_(other.padded_numel_), + logical_limits_{other.logical_limits_}, + // Empty initialize Utility Uniform Buffers + sizes_uniform_(), + strides_uniform_(), + numel_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), + // Copy Tensor storage + storage_(other.storage_) {} + +vTensor::vTensor( + const vTensor& other, + const std::vector& sizes, + const std::vector& dim_order, + const int64_t offset_numel) + : dtype_(other.dtype_), + memory_layout_(estimate_memory_layout(dim_order)), + // Copy tensor size metadata + sizes_(sizes.begin(), sizes.end()), + dim_order_(dim_order.begin(), dim_order.end()), + axis_map_(default_axis_map()), + strides_(calculate_strides(sizes_, dim_order_)), + numel_(utils::multiply_integers(sizes_)), + padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, + unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, + padded_numel_(utils::multiply_integers(padded_sizes_)), + logical_limits_(other.logical_limits_), + // Empty initialize Utility Uniform Buffers + sizes_uniform_(), + strides_uniform_(), + numel_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), + // Copy Tensor storage + storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { + VK_CHECK_COND( + dim_order_is_valid(dim_order_), "new dim order provided is invalid"); + VK_CHECK_COND( + offset_numel + numel_ <= other.numel(), + "Tensor alias cannot access more elements than available in the original" + "tensor"); +} + +vkapi::VulkanImage& vTensor::image( + vkapi::PipelineBarrier& pipeline_barrier, + const vkapi::PipelineStageFlags stage) & { + storage_.transition(pipeline_barrier, stage, vkapi::MemoryAccessType::READ); + return storage_.image_; +} + +vkapi::VulkanImage& vTensor::image( + vkapi::PipelineBarrier& pipeline_barrier, + const vkapi::PipelineStageFlags stage, + const vkapi::MemoryAccessFlags access) & { + storage_.transition(pipeline_barrier, stage, access); + return storage_.image_; +} + +vkapi::VulkanBuffer& vTensor::buffer( + vkapi::PipelineBarrier& pipeline_barrier, + const vkapi::PipelineStageFlags stage) & { + storage_.transition(pipeline_barrier, stage, vkapi::MemoryAccessType::READ); + return storage_.buffer_; +} + +vkapi::VulkanBuffer& vTensor::buffer( + vkapi::PipelineBarrier& pipeline_barrier, + const vkapi::PipelineStageFlags stage, + const vkapi::MemoryAccessFlags access) & { + storage_.transition(pipeline_barrier, stage, access); + return storage_.buffer_; +} + +void vTensor::set_logical_limits(const utils::uvec3& image_extents) { + logical_limits_.limits[0] = image_extents[axis_map_.at(0)]; + 
logical_limits_.limits[1] = image_extents[axis_map_.at(1)]; + logical_limits_.limits[2] = image_extents[axis_map_.at(2)]; +} + +const vkapi::BufferBindInfo vTensor::sizes_ubo() { + if (!sizes_uniform_.buffer()) { + sizes_uniform_ = + ParamsBuffer(storage_.context_, utils::make_whcn_ivec4(sizes_)); + } + return vkapi::BufferBindInfo(sizes_uniform_.buffer()); +} + +const vkapi::BufferBindInfo vTensor::strides_ubo() { + if (!strides_uniform_.buffer()) { + strides_uniform_ = ParamsBuffer( + storage_.context_, utils::make_whcn_ivec4(unsqueezed_strides_)); + } + return vkapi::BufferBindInfo(strides_uniform_.buffer()); +} + +const vkapi::BufferBindInfo vTensor::axis_map_ubo() { + if (!axis_map_uniform_.buffer()) { + axis_map_uniform_ = + ParamsBuffer(storage_.context_, utils::make_ivec4(axis_map_)); + } + return vkapi::BufferBindInfo(axis_map_uniform_.buffer()); +} + +const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { + if (!logical_limits_uniform_.buffer()) { + logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_); + } + return vkapi::BufferBindInfo(logical_limits_uniform_.buffer()); +} + +const vkapi::BufferBindInfo vTensor::numel_ubo() { + if (!numel_uniform_.buffer()) { + numel_uniform_ = ParamsBuffer(storage_.context_, numel_); + } + return vkapi::BufferBindInfo(numel_uniform_.buffer()); +} + +size_t vTensor::staging_buffer_numel() const { + const bool is_int8 = dtype_ == vkapi::kChar; + const bool int8_supported = + storage_.context_->adapter_ptr()->has_full_int8_buffers_support(); + if (is_int8 && !int8_supported) { + return utils::align_up_4(numel_); + } + if (storage_type() == utils::kBuffer) { + return numel_; + } + return padded_numel_; +} + +VkMemoryRequirements vTensor::get_memory_requirements() const { + switch (storage_type()) { + case utils::kBuffer: + return storage_.buffer_.get_memory_requirements(); + case utils::kTexture2D: + case utils::kTexture3D: + return storage_.image_.get_memory_requirements(); + } + return {}; +} + +void vTensor::bind_allocation(const vkapi::Allocation& allocation) { + switch (storage_type()) { + case utils::kBuffer: + storage_.buffer_.bind_allocation(allocation); + break; + case utils::kTexture2D: + case utils::kTexture3D: + storage_.image_.bind_allocation(allocation); + break; + } +} + +void vTensor::update_metadata() { + strides_ = calculate_strides(sizes_, dim_order_); + // Only update the memory layout for buffer-backed tensors. Strides are + // meaningless for texture-backed tensors and do not impact the memory layout. + if (storage_type() == utils::kBuffer) { + memory_layout_ = estimate_memory_layout(dim_order_); + } + numel_ = utils::multiply_integers(sizes_); - image_extents_ = calculate_image_extents(padded_sizes, gpu_memory_layout); - image_ = allocate_image( - context_, - image_extents_, - storage_type_, - to_vkformat(dtype), - image_owns_memory); - - buffer_length_ = utils::multiply_integers(padded_sizes); - buffer_ = allocate_buffer( - context_, buffer_length_, storage_type_, dtype, buffer_owns_memory); + padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_); + unsqueezed_strides_ = unsqueeze_strides(strides_, numel_); + padded_numel_ = utils::multiply_integers(padded_sizes_); + + // Calculate the image extents that would have been used to allocate a texture + // withthe current sizes, and use that to set the logical limits. 
+ set_logical_limits( + calculate_image_extents(padded_sizes_, axis_map_, memory_layout_)); + + if (sizes_uniform_.buffer()) { + sizes_uniform_.update(utils::make_whcn_ivec4(sizes_)); + } + if (strides_uniform_.buffer()) { + strides_uniform_.update(utils::make_whcn_ivec4(unsqueezed_strides_)); + } + if (numel_uniform_.buffer()) { + numel_uniform_.update(numel_); + } + if (axis_map_uniform_.buffer()) { + axis_map_uniform_.update(utils::make_ivec4(axis_map_)); + } + if (logical_limits_uniform_.buffer()) { + logical_limits_uniform_.update(logical_limits_); + } +} + +void vTensor::check_sizes(const std::vector& sizes) const { + if (storage_type() != utils::kBuffer) { + // For texture storage check that the current texture is large enough for + // the new sizes of the tensor. + utils::uvec3 virtual_extents = + calculate_image_extents(padded_sizes_, axis_map_, memory_layout_); + + bool valid_resize = virtual_extents[0] <= storage_.image_extents_[0]; + valid_resize = + valid_resize && virtual_extents[1] <= storage_.image_extents_[1]; + valid_resize = + valid_resize && virtual_extents[2] <= storage_.image_extents_[2]; + + VK_CHECK_COND( + valid_resize, + "tensor sizes requires a larger texture than the current one."); + } else { + // For buffer storage check that the current buffer is large enough for the + // new sizes of the tensor. + int64_t numel = utils::multiply_integers(sizes); + bool valid_resize = + numel + storage_.buffer_offset_ <= storage_.buffer_length_; + VK_CHECK_COND( + valid_resize, + "tensor sizes requires a larger buffer than the current one."); + } +} + +void vTensor::virtual_reconfigure( + const std::vector& new_sizes, + const std::vector& new_dim_order) { + VK_CHECK_COND( + storage_type() == utils::kBuffer, + "virtual_reconfigure is only applicable for buffer backed tensors"); + VK_CHECK_COND(new_sizes.size() == new_dim_order.size()); + VK_CHECK_COND(dim_order_is_valid(new_dim_order)); + + check_sizes(new_sizes); + sizes_ = new_sizes; + dim_order_ = new_dim_order; + update_metadata(); +} + +void vTensor::virtual_resize(const std::vector& new_sizes) { + VK_CHECK_COND( + new_sizes.size() == dim_order_.size(), + "new sizes cannot modify the dimensionality of the tensor "); + + check_sizes(new_sizes); + sizes_ = new_sizes; + update_metadata(); +} + +/* + * Transposing the dim order is a bit unintuitive. dim0 and dim1 have swapped + * their "identities", so we need to swap the values of dim0 and dim1 wherever + * they appear in the dim order vector. Compare this to just swapping the + * elements at dim0 and dim1 in the `sizes` vectors. 
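A concrete instance of the rule described in this comment (a standalone sketch, not the library code): a channels-packed tensor has dim order {0, 2, 3, 1}, and transposing dims 1 (C) and 2 (H) must swap the values 1 and 2 wherever they appear, which is not the same as swapping the elements at indices 1 and 2.

```cpp
// Standalone sketch: swapping the *values* 1 and 2 in {0, 2, 3, 1} yields
// {0, 1, 3, 2}; swapping the *elements* at indices 1 and 2 would instead
// yield {0, 3, 2, 1}, which does not describe the transposed data.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> dim_order = {0, 2, 3, 1};
  const int64_t dim0 = 1, dim1 = 2;
  for (int64_t& d : dim_order) {
    if (d == dim0) {
      d = dim1;
    } else if (d == dim1) {
      d = dim0;
    }
  }
  for (int64_t d : dim_order) {
    std::cout << d << " "; // prints: 0 1 3 2
  }
  std::cout << "\n";
  return 0;
}
```

For sizes {2, 3, 4, 5} the channels-packed strides are {60, 1, 15, 3} in NCHW order; recomputing strides from the transposed sizes {2, 4, 3, 5} and the new dim order {0, 1, 3, 2} gives {60, 15, 1, 3}, which is exactly the original strides with positions 1 and 2 swapped, as expected for a transposed view.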
+ */ +void transpose_dim_order_inplace( + std::vector& dim_order, + const int64_t dim0, + const int64_t dim1) { + for (int i = 0; i < dim_order.size(); ++i) { + if (dim_order[i] == dim0) { + dim_order[i] = dim1; + } else if (dim_order[i] == dim1) { + dim_order[i] = dim0; + } + } +} + +void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { + std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1); + if (storage_type() == utils::kBuffer) { + transpose_dim_order_inplace(dim_order_, dim0, dim1); + } else { + const int dim0_whcn = sizes_.size() - 1 - dim0; + const int dim1_whcn = sizes_.size() - 1 - dim1; + // Cannot transpose batch dimension for texture storage + VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3); + + std::iter_swap( + axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn); + + if (packed_dim_whcn_idx() == dim0_whcn) { + memory_layout_ = utils::GPUMemoryLayout(dim1_whcn); + } + if (packed_dim_whcn_idx() == dim1_whcn) { + memory_layout_ = utils::GPUMemoryLayout(dim0_whcn); + } + } + update_metadata(); } } // namespace api diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 6ec5ba5b09..6327a0e8fd 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -20,20 +20,25 @@ namespace vkcompute { namespace api { /* - * Given the sizes of a tensor and the GPU memory layout, calculate the strides - * of the tensor in NCHW dimension order. The GPU memory layout will be used to - * determine which dimension is packed along a texel; that dimension will be - * used as the "fasted moving" dimension with a stride of 1. - * - * If texel_strides is true, then the strides will be calculated for a texel - * buffer (i.e. the size of the packed dimension will be modified by the - * div_up_4 function before being used in calculations). Otherwise, the strides - * will be calculated assuming a contiguous scalar buffer. + * Given a GPUMemoryLayout value, produce a dim order vector that matches the + * given memory layout. The produced dim order vector will be in the NCHW + * dimension order + */ +std::vector calculate_dim_order( + const size_t ndim, + const utils::GPUMemoryLayout memory_layout); + +/* + * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) + * dimension order, calculate the strides of the tensor. */ std::vector calculate_strides( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const bool texel_strides = true); + const std::vector& dim_order); + +std::vector unsqueeze_strides( + const std::vector& strides, + const int64_t numel); /* * When stored on the GPU, tensor data is stored using texels (i.e. a vector of @@ -55,11 +60,11 @@ std::vector calculate_padded_sizes( const utils::GPUMemoryLayout memory_layout); /* - * Given the padded sizes of a tensor and the GPU memory layout, calculate the - * 3D image extents required to store the tensor data as an image texture. + * Calculate the image extents required of a texture backed tensor. 
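As a worked illustration of `calculate_dim_order` and `calculate_strides` declared earlier in this header, the following standalone sketch mirrors the dim-order to strides logic (illustrative only; `strides_from_dim_order` is an invented name, not an ExecuTorch API).

```cpp
// Standalone sketch of the dim-order -> strides computation (illustrative).
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> strides_from_dim_order(
    const std::vector<int64_t>& sizes,       // NCHW
    const std::vector<int64_t>& dim_order) { // slowest to fastest moving dim
  std::vector<int64_t> strides(sizes.size());
  strides[dim_order.back()] = 1;
  for (int32_t i = static_cast<int32_t>(sizes.size()) - 2; i >= 0; --i) {
    const int64_t next = dim_order[i + 1];
    // Zero-size dims inherit the stride of the next-faster dim.
    strides[dim_order[i]] =
        sizes[next] == 0 ? strides[next] : strides[next] * sizes[next];
  }
  return strides;
}

int main() {
  const std::vector<int64_t> sizes = {2, 3, 4, 5};
  // Width packed: dim order {0, 1, 2, 3}    -> strides {60, 20, 5, 1}
  // Channels packed: dim order {0, 2, 3, 1} -> strides {60, 1, 15, 3}
  for (const auto& dim_order :
       {std::vector<int64_t>{0, 1, 2, 3}, std::vector<int64_t>{0, 2, 3, 1}}) {
    for (int64_t s : strides_from_dim_order(sizes, dim_order)) {
      std::cout << s << " ";
    }
    std::cout << "\n";
  }
  return 0;
}
```

`unsqueeze_strides` then pads the result to four entries; for example, a 2D tensor with sizes {3, 4} and strides {4, 1} becomes {12, 12, 4, 1}, with the extra leading entries set to the element count.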
*/ utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const std::vector& axis_map, const utils::GPUMemoryLayout memory_layout); struct LastAccess { @@ -85,11 +90,24 @@ class vTensorStorage final { Context* context, const utils::StorageType storage_type, const utils::GPUMemoryLayout gpu_memory_layout, - const std::vector& sizes, + const std::vector& axis_map, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); - vTensorStorage(const vTensorStorage& other) = delete; + protected: + /* + * This allows for creation of tensors that use the same underlying storage + * as another tensor. Note that this functionality is currently enabled for + * tensors that have buffer storage only. The created tensor will not have + * ownership of the underlying VkBuffer. This constructor is marked protected + * because this behaviour is unsafe, since the original tensor may be + * destroyed before the copy is destroyed. + */ + vTensorStorage(const vTensorStorage& other, const int64_t buffer_offset = 0); + + public: + // To discourage creating copies, the assignment operator is still deleted. vTensorStorage& operator=(const vTensorStorage& other) = delete; vTensorStorage(vTensorStorage&& other) = default; @@ -108,6 +126,7 @@ class vTensorStorage final { // Resource sizings utils::uvec3 image_extents_{}; int64_t buffer_length_{}; + int64_t buffer_offset_{}; // GPU Storage mutable vkapi::VulkanImage image_; @@ -134,10 +153,10 @@ class vTensorStorage final { return image_.format(); } - void discard_and_reallocate( - const std::vector& padded_sizes, - const utils::GPUMemoryLayout gpu_memory_layout, - const vkapi::ScalarType dtype); + /* + * Used for checking if this vTensorStorage is a copy of another instance + */ + bool is_copy_of(const vTensorStorage& other) const; }; class vTensor final { @@ -157,24 +176,108 @@ class vTensor final { const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked, const bool allocate_memory = true); - vTensor(const vTensor& other) = delete; + /* + * This constructor allows for the creation of a vTensor that references the + * same buffer resource of another vTensor, with the same sizes and strides + * metadata. The created vTensor will not own the underlying resource. This is + * only applicable for buffer backed tensors at the moment. + * + * Once created, the sizes and strides of the aliased vTensor can be changed + * using the `virtual_reconfigure` member function. + */ + vTensor(const vTensor& other); + + /* + * This constructor allows for the creation of a vTensor that references the + * same buffer resource of another vTensor, but with different sizes and + * strides metatdata. The created vTensor will not own the underlying + * resource. This is only applicable for buffer backed tensors at the moment. + * + * Note that dim order is used as the source of truth regarding the strides, + * and the new strides are computed from the new sizes and new dim order. + * Thus only the dim order is provided as an argument to this function. + * + * The offset_numel argument allows the aliased tensor's memory region to + * begin at an offset of N elements from the start of the original tensor's + * buffer. 
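A hypothetical usage fragment for this aliasing constructor (not compilable on its own: `ctx` stands for an existing api::Context*, the tensor names are invented, and vkapi::kFloat is assumed to be the float scalar type). It reinterprets the second half of a buffer-backed {4, 6} tensor as a contiguous {2, 6} tensor.

```cpp
// Hypothetical fragment; ctx, base, and view are placeholder names.
api::vTensor base(
    ctx, {4, 6}, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked);
api::vTensor view(
    base,
    /*sizes=*/{2, 6},
    /*dim_order=*/{0, 1},
    /*offset_numel=*/12); // 12 + 12 elements <= base.numel() == 24
```

The offset plus the alias's element count must stay within the original tensor's numel, which is exactly the check performed by the constructor.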
+ */ + vTensor( + const vTensor& other, + const std::vector& sizes, + const std::vector& dim_order, + const int64_t offset_numel = 0); + + // To discourage making copies, the copy assignment operator is still deleted vTensor& operator=(const vTensor& other) = delete; vTensor(vTensor&& other) = default; vTensor& operator=(vTensor&& other) = default; private: + /* + * "Core" tensor metadata. They are the minimum amount of information required + * to construct a tensor. + */ + + // Whether the tensor has elements of type float, int, etc. vkapi::ScalarType dtype_; + // Describes which dimension is "tightly packed". For texture backed tensors, + // this describes which dimension is packed along a texel. For buffer backed + // tensors, this describes which dimension has a stride of 1 (i.e. is last in + // the dim order). utils::GPUMemoryLayout memory_layout_; - // sizes of the tensor in NCHW dimension order std::vector sizes_; + + /* + * "Layout" metadata. These describe with further detail how tensor data is + * laid out in memory. However, they are considered secondary to the "core" + * metadata members above because defaults can be assumed based on a given + * memory layout. When permuting the tensor without performing a copy, these + * metadata members are the ones that will be changed. All other metadata is + * derived from a combination of sizes, memory layout, and the below members. + */ + + // dim order of the tensor; dimension indices are in NCHW dimension order + // i.e. 0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger + // strides precede the dims with smaller strides in the dim order. The last + // dim is always the fastest moving dim with a stride of 1. + std::vector dim_order_; + // Describes which axis of an image texture each dimension of the tensor maps + // to. The axis mapping allows texture based tensors to be permuted and + // transposed without modifying the underlying texture storage. For a more in + // depth explanation of axis mapping, see the `default_axis_map()` + // function. + std::vector axis_map_; + + /* + * The below can be consider "layout" metadata as well, but are derived from + * the above data members. + */ + + // strides of the tensor in NCHW dimension order + std::vector strides_; + // Contains the number of elements in the tensor according to the canonical + // sizes. + size_t numel_; + + /* + * The below metadata members are derived from the above, and are typically + * to i.e. pass tensor metadata to compute shaders. + */ + // padded sizes of the tensor in NCHW dimension order. See the - // calculate_padded_sizes() function for more context. + // calculate_padded_sizes() function for more context. Note that padded sizes + // are only used for texture storage, and not for buffer storage. std::vector padded_sizes_; - // Contains the "virtual" texture extents of the tensor. See the - // texture_limits() function for more context. - TextureLimits texture_limits_; + // Contains the strides of the tensor, with the dimensionality padded to the + // nearest multiple of 4. Unsqueezed dims will have a stride of int32_t max. + std::vector unsqueezed_strides_; + // Contains the number of elements in the tensor according to the padded + // sizes. + size_t padded_numel_; + // See the comments documenting logical_limits() for more context. + TextureLimits logical_limits_; /* * Utility GPU buffers that can be passed to shaders in order to convey tensor @@ -186,9 +289,10 @@ class vTensor final { * context about the data contained in each buffer. 
*/ ParamsBuffer sizes_uniform_; - ParamsBuffer texture_limits_uniform_; - ParamsBuffer texel_strides_uniform_; - ParamsBuffer ntexels_uniform_; + ParamsBuffer strides_uniform_; + ParamsBuffer numel_uniform_; + ParamsBuffer axis_map_uniform_; + ParamsBuffer logical_limits_uniform_; vTensorStorage storage_; @@ -235,8 +339,29 @@ class vTensor final { return storage_.storage_type_ == utils::kBuffer; } - inline const utils::uvec3& image_extents() const { - return storage_.image_extents_; + private: + void set_logical_limits(const utils::uvec3& image_extents); + + public: + /* + * The logical limits of the tensor are derived from the image extents of the + * image texture used to store the tensor, but with two key differences. + * + * First, the image extents are permuted according to the axis map. This + * makes it so that the first element of the logical limit is the limit of the + * texture axis corresponding to the width dimension of the tensor, the next + * element is the limit of the texture axis corresponding to the height + * dimension and the last element is the limit of the texture axis that + * corresponds to the channels dimension of the tensor. + * + * Second, the logical limits may use smaller extents than the actual image + * extents of the image texture. This is due to dynamic shape; if the tensor's + * `virtual_resize()` function is called, then the logical limits will reflect + * the extents that would be needed to support a tensor with the updated sizes + * instead of the original sizes. + */ + inline const utils::ivec3& logical_limits() const { + return logical_limits_.limits; } /* @@ -266,6 +391,22 @@ class vTensor final { return sizes_.size(); } + inline const std::vector& dim_order() const { + return dim_order_; + } + + inline const std::vector& axis_map() const { + return axis_map_; + } + + inline const std::vector& strides() const { + return strides_; + } + + inline const std::vector& unsqueezed_strides() const { + return unsqueezed_strides_; + } + /* * Returns a GPU buffer containing the sizes of the tensor in WHCN order. * Note that dimensions that are not present in the tensor's sizes are set to @@ -274,33 +415,32 @@ class vTensor final { const vkapi::BufferBindInfo sizes_ubo(); /* - * Returns a GPU buffer containing the virtual image extents of the tensor. - * Since a tensor can be resized with the virtual_resize() function, this - * GPU buffer contains the image extents of the tensor calculated using the - * virtual_resize() function. This allows shaders to exit early if they are - * working outside the limits of the texture. - * - * This buffer should only be used to + * Returns a GPU buffer containing the strides of the tensor in WHCN order. + * Note that the strides are extended to a dimensionality that is a multiple + * of 4, thus dimensions that are not present in the tensor's sizes are set to + * have a stride equal to the stride of the "slowest moving" dimension. */ - const vkapi::BufferBindInfo texture_limits_ubo(); + const vkapi::BufferBindInfo strides_ubo(); /* - * Returns the strides of the texel buffer used to store the tensor, as - * calculated by calculate_strides(). + * Returns a GPU buffer containing the texture axis mapping for each dimension + * of the tensor, in WHCN dimension order. */ - const vkapi::BufferBindInfo texel_strides_ubo(); + const vkapi::BufferBindInfo axis_map_ubo(); /* - * Returns the number of texels in the texel buffer used to store the tensor. + * Returns a GPU buffer containing the logical limits of the tensor. 
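The permutation described for the logical limits is small but easy to get backwards, so here is a standalone sketch (illustrative only): logical limit i is read from the extent of the image axis that WHCN dim i maps to, shown for an axis map of {1, 0, 2, 2} as produced by a width/height transpose of a texture-backed tensor.

```cpp
// Standalone sketch of the permutation performed by set_logical_limits().
#include <array>
#include <cstdint>
#include <iostream>

int main() {
  const std::array<uint32_t, 3> extents = {8, 6, 2};    // X, Y, Z
  const std::array<int64_t, 4> axis_map = {1, 0, 2, 2}; // after a W/H transpose
  std::array<int32_t, 3> limits;
  for (int i = 0; i < 3; ++i) {
    limits[i] = static_cast<int32_t>(extents[axis_map[i]]);
  }
  // The default axis map {0, 1, 2, 2} would give {8, 6, 2}; here: {6, 8, 2}.
  std::cout << limits[0] << " " << limits[1] << " " << limits[2] << "\n";
  return 0;
}
```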
See the + * comments for logical_limits() for more context. */ - const vkapi::BufferBindInfo ntexels_ubo(); + const vkapi::BufferBindInfo logical_limits_ubo(); - inline const utils::ivec3 texture_limits() const { - return texture_limits_.limits; - } + /* + * Returns the number of elements in the buffer used to store the tensor. + */ + const vkapi::BufferBindInfo numel_ubo(); inline size_t numel() const { - return utils::multiply_integers(sizes()); + return numel_; } inline size_t nbytes() const { @@ -310,23 +450,14 @@ class vTensor final { /* * Returns numel but based on padded_sizes_ instead of sizes_ */ - inline size_t gpu_numel() const { - return utils::multiply_integers(padded_sizes_); + inline size_t padded_numel() const { + return padded_numel_; } - /* - * Returns the number of texels in the image texture or texel buffer used to - * store the tensor's data. - */ - inline int32_t texel_numel() const { - return utils::safe_downcast(gpu_numel() / 4); - } + size_t staging_buffer_numel() const; - /* - * Return nbytes but based on padded_sizes_ instead of sizes_ - */ - inline VkDeviceSize gpu_nbytes() const { - return element_size(dtype()) * gpu_numel(); + inline size_t staging_buffer_nbytes() const { + return element_size(dtype()) * staging_buffer_numel(); } /* @@ -346,24 +477,49 @@ class vTensor final { private: /* - * Update the size metadata of the vTensor to be new sizes. Should not be used - * directly, reallocate() or virtual_resize() should be used instead. + * Assuming sizes, dim order, or axis mapping was modified, recompute all + * derived metadata and update metadata UBO with new values. */ - void update_size_metadata(const std::vector& new_sizes); + void update_metadata(); + + /* + * Check that tensor sizes are valid given the current storage resource's + * limits. + */ + void check_sizes(const std::vector& sizes) const; public: /* - * Discard the underlying VkImage or VkBuffer and re-allocate based on new - * tensor sizes + * Change how the tensor should be interpreted by compute shaders via updating + * the size and dim order of the tensor. The new sizes and dim order may have + * different dimensionality than the current dimensionality of the tensor. + * + * This function can only be used for buffer-backed tensors, since texture + * backed buffers cannot change dimensionality or memory layout. */ - void reallocate(const std::vector& new_sizes); + void virtual_reconfigure( + const std::vector& new_sizes, + const std::vector& new_dim_order); /* * Perform a virtual resize of the vTensor by modifying the size metadata that * gets used in compute shaders. This allows the shader to treat the - * underlying resource as if it were a different size. + * underlying resource as if it were a different size. The new sizes cannot + * modify the dimensionality of the tensor. */ void virtual_resize(const std::vector& new_sizes); + + /* + * Transpose the tensor in-place by updating its metadata. 
+ */ + void virtual_transpose(const int64_t dim0, const int64_t dim1); + + /* + * Check if this vTensor instance is a view of another vTensor instance + */ + inline bool is_view_of(const vTensor& other) const { + return storage_.is_copy_of(other.storage_); + } }; } // namespace api diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index c734ed395e..6ee29d45f1 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -38,6 +38,10 @@ # Basic configuration settings for shaders DEFAULT_ENV: Dict[str, Any] = { "PRECISION": "highp", + # B is shorthand for "binding". This is used to automatically increment the + # layout binding index when declaring layout bindings. Note that a container + # type is used because integers are immutable in Python. + "B": [0], } # Establishes relationships between different tensor types and different GLSL types @@ -179,8 +183,14 @@ def get_access_qualifier(access_type: Optional[str]) -> str: raise AssertionError(f"Invalid access type: {access_type}") +def get_slot_val(slot: Union[int, List[int]]) -> int: + if isinstance(slot, list): + return slot[0] + return slot + + def layout_declare_buffer( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -192,15 +202,18 @@ def layout_declare_buffer( array_type = buffer_scalar_type(dtype) out_str = f""" -layout(set = 0, binding = {slot}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ +layout(set = 0, binding = {get_slot_val(slot)}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ {array_type} {var_name}[]; }}; """ + + if isinstance(slot, list): + slot[0] = slot[0] + 1 return out_str def layout_declare_image( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -209,11 +222,16 @@ def layout_declare_image( ) -> str: image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] image_type = TYPE_MAPPINGS["IMAGE_T"][image_ndim][dtype] - return f"layout(set = 0, binding = {slot}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" + + ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" + + if isinstance(slot, list): + slot[0] = slot[0] + 1 + return ret_str def layout_declare_sampler( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -222,16 +240,21 @@ def layout_declare_sampler( image_ndim: int = 3, ) -> str: sampler_type = TYPE_MAPPINGS["SAMPLER_T"][image_ndim][dtype] - return f"layout(set = 0, binding = {slot}) uniform {precision} {sampler_type} {var_name};" + + ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} {sampler_type} {var_name};" + + if isinstance(slot, list): + slot[0] = slot[0] + 1 + return ret_str def layout_declare_tensor( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, storage_type: str, - is_scalar_array: bool = False, + is_scalar_array: bool = True, precision: str = "PRECISION", ) -> str: assert storage_type.lower() in ["buffer", "texture3d", "texture2d"] @@ -262,7 +285,9 @@ def layout_declare_tensor( ) -def layout_declare_ubo(slot: int, *args, precision: str = "PRECISION") -> str: +def layout_declare_ubo( + slot: Union[int, List[int]], *args, precision: str = "PRECISION" +) -> str: assert len(args) % 2 
== 0 var_list = list(zip(args[::2], args[1::2])) @@ -272,12 +297,14 @@ def layout_declare_ubo(slot: int, *args, precision: str = "PRECISION") -> str: ubo_name += var_name + "_" out_str = f""" -layout(set = 0, binding = {slot}) uniform {precision} restrict readonly {ubo_name}UBO {{ +layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} restrict readonly {ubo_name}UBO {{ """ for type_name, var_name in var_list: out_str += f"{type_name} {var_name};\n" out_str += "};" + if isinstance(slot, list): + slot[0] = slot[0] + 1 return out_str diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 2046e78e88..e5028082ff 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -38,14 +38,81 @@ namespace vkcompute { VALUE_PTR_CLASS_IMPL(vTensorPtr, api::vTensor, Tensor) VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) -VALUE_PTR_CLASS_IMPL(StagingPtr, api::StorageBuffer, Staging) +VALUE_PTR_CLASS_IMPL(StagingPtr, api::StagingBuffer, Staging) VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) VALUE_PTR_CLASS_IMPL(ValueListPtr, std::vector, ValueList) +VALUE_PTR_CLASS_IMPL(SymIntPtr, SymInt, SymInt) #undef VALUE_PTR_CLASS_IMPL +// +// TmpTensor +// + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor( + sizes, + dtype, + storage_type, + memory_layout, + sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, storage_type, sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, memory_layout, sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, sobj_idx)) {} + +TmpTensor::~TmpTensor() { + // Lifetime of this temporary tensor is expired; return the shared object to + // the pool, as long as the sobj index is valid + if (sobj_idx >= 0) { + graph_p->tmp_shared_object_idxs_.emplace(sobj_idx); + } +} + +int64_t TmpTensor::get_sobj_idx() { + int64_t sobj_idx; + // If no available temporary shared objects, request a new one to be created + if (graph_p->tmp_shared_object_idxs_.empty()) { + sobj_idx = graph_p->shared_objects_.size(); + } else { + // Get the first available shared object idx + sobj_idx = graph_p->tmp_shared_object_idxs_.top(); + graph_p->tmp_shared_object_idxs_.pop(); + } + return sobj_idx; +} + // // ComputeGraph // @@ -203,6 +270,24 @@ ValueRef ComputeGraph::add_tensor( sizes, dtype, suggested_memory_layout(sizes), shared_object_idx); } +ValueRef ComputeGraph::add_tensor_view(const ValueRef vref) { + const vTensorPtr t = get_tensor(vref); + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(api::vTensor(*t)); 
+ return idx; +} + +ValueRef ComputeGraph::add_tensor_view( + const ValueRef vref, + const std::vector& sizes, + const std::vector& strides, + const size_t offset_numel) { + const vTensorPtr t = get_tensor(vref); + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(api::vTensor(*t, sizes, strides, offset_numel)); + return idx; +} + ValueRef ComputeGraph::add_tensorref( const std::vector& sizes, const vkapi::ScalarType dtype, @@ -218,7 +303,7 @@ ValueRef ComputeGraph::add_staging( const size_t numel) { ValueRef idx(static_cast(values_.size())); check_no_active_value_ptrs(); - values_.emplace_back(api::StorageBuffer(context(), dtype, numel)); + values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); return idx; } @@ -243,13 +328,22 @@ ValueRef ComputeGraph::add_string(std::string&& str) { return idx; } +ValueRef ComputeGraph::add_symint(const int32_t val) { + ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); + values_.emplace_back(SymInt(context(), val)); + return idx; +} + ValueRef ComputeGraph::set_input_tensor( const ValueRef idx, const bool use_staging) { if (use_staging) { vkapi::ScalarType dtype = get_tensor(idx)->dtype(); - size_t gpu_numel = get_tensor(idx)->gpu_numel(); - ValueRef staging_idx = add_staging(dtype, gpu_numel); + // For texture storage, the buffer size needs to account for the zero + // padding applied by unused texel elements. + size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); + ValueRef staging_idx = add_staging(dtype, buf_numel); add_staging_to_tensor_node(*this, staging_idx, idx); inputs_.push_back({idx, staging_idx}); return staging_idx; @@ -263,12 +357,14 @@ ValueRef ComputeGraph::set_output_tensor( const bool use_staging) { if (use_staging) { vkapi::ScalarType dtype = get_tensor(idx)->dtype(); - size_t gpu_numel = get_tensor(idx)->gpu_numel(); - ValueRef staging_idx = add_staging(dtype, gpu_numel); + // For texture storage, the buffer size needs to account for the zero + // padding applied by unused texel elements. + size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); + ValueRef staging_idx = add_staging(dtype, buf_numel); // We only run this when the tensor is non-empty. When the underlying - // tensor is empty (e.g. gpu_numel == 0), we do not allocate a VkImage to + // tensor is empty (e.g. padded_numel == 0), we do not allocate a VkImage to // tensor, we will not be able to bind the node for execution. 
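A worked example of the sizing difference mentioned in these comments (standalone sketch, not the library code), assuming channels packing pads the channels dim up to the next multiple of 4 as required by the texture layout.

```cpp
// Why a texture-backed tensor's staging buffer is sized by padded_numel.
#include <cstdint>
#include <iostream>

int64_t align_up_4(const int64_t v) {
  return (v + 3) & ~int64_t(3);
}

int main() {
  // Channels-packed tensor with sizes {N=1, C=3, H=4, W=4}.
  const int64_t numel = 1 * 3 * 4 * 4;                    // 48
  const int64_t padded_numel = 1 * align_up_4(3) * 4 * 4; // 64
  // Texture storage: the staging buffer must cover the zero-padded texels.
  std::cout << numel << " vs " << padded_numel << "\n";   // prints: 48 vs 64
  return 0;
}
```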
- if (gpu_numel > 0) { + if (buf_numel > 0) { add_tensor_to_staging_node(*this, idx, staging_idx); } outputs_.push_back({idx, staging_idx}); @@ -278,6 +374,22 @@ ValueRef ComputeGraph::set_output_tensor( return idx; } +vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( + const ValueRef idx) { + if (values_.at(idx).isInt()) { + const int32_t val = extract_scalar(idx); + create_params_buffer(val); + } else if (values_.at(idx).isSymInt()) { + SymIntPtr symint = get_symint(idx); + return vkapi::BufferBindInfo(symint->gpu_buffer.buffer()); + } + VK_THROW("Cannot create a int param buffer for the given value"); +} + +void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) { + get_symint(idx)->set(val); +} + SharedObject& ComputeGraph::get_shared_object(const int64_t idx) { if (idx >= shared_objects_.size()) { shared_objects_.resize(static_cast(idx + 1)); @@ -314,48 +426,48 @@ void ComputeGraph::update_descriptor_counts( utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) { if (is_buffer_storage(idx)) { - return {uint32_t(texel_numel_of(idx)), 1u, 1u}; + return {uint32_t(numel_of(idx)), 1u, 1u}; } - return image_extents_of(idx); + return logical_limits_of(idx); } -utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) { +utils::uvec3 ComputeGraph::create_local_wg_size( + const utils::uvec3 global_wg_size) { if (config_.enable_local_wg_size_override) { return config_.local_wg_size_override; } - if (is_buffer_storage(idx)) { - return {64u, 1u, 1u}; - } - - const utils::uvec3 image_extents = image_extents_of(idx); utils::uvec3 local_group_size = {4, 4, 4}; - if (image_extents.data[2u] == 1) { - if (image_extents.data[1u] == 1) { - local_group_size.data[0u] = 64; - local_group_size.data[1u] = 1; - local_group_size.data[2u] = 1; - } else if (image_extents.data[1u] < 8) { - local_group_size.data[0u] = 16; - local_group_size.data[1u] = 4; - local_group_size.data[2u] = 1; + if (global_wg_size[2u] == 1) { + if (global_wg_size[1u] == 1) { + local_group_size[0u] = 64; + local_group_size[1u] = 1; + local_group_size[2u] = 1; + } else if (global_wg_size[1u] < 8) { + local_group_size[0u] = 16; + local_group_size[1u] = 4; + local_group_size[2u] = 1; } else { - local_group_size.data[0u] = 8; - local_group_size.data[1u] = 8; - local_group_size.data[2u] = 1; + local_group_size[0u] = 8; + local_group_size[1u] = 8; + local_group_size[2u] = 1; } } return local_group_size; } +utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) { + return create_local_wg_size(create_global_wg_size(idx)); +} + void ComputeGraph::copy_into_staging( const ValueRef idx, const void* data, const size_t numel) { StagingPtr staging = get_staging(idx); size_t nbytes = numel * vkapi::element_size(staging->dtype()); - copy_ptr_to_staging(data, *staging, nbytes); + staging->copy_from(data, nbytes); } void ComputeGraph::copy_from_staging( @@ -364,7 +476,7 @@ void ComputeGraph::copy_from_staging( const size_t numel) { StagingPtr staging = get_staging(idx); size_t nbytes = numel * vkapi::element_size(staging->dtype()); - copy_staging_to_ptr(*staging, data, nbytes); + staging->copy_to(data, nbytes); } void ComputeGraph::prepare() { diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 5237a7746d..2e550340ac 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -11,6 +11,7 @@ // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName #include +#include #include @@ 
-58,14 +59,88 @@ class ComputeGraph; DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor) DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) -DECL_VALUE_PTR_CLASS(StagingPtr, api::StorageBuffer) +DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer) DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) DECL_VALUE_PTR_CLASS(ValueListPtr, std::vector) +DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt); #undef DECL_VALUE_PTR_CLASS +// +// TmpTensor +// + +/* + * This struct is used to recycle the memory of temporary tensors that are + * created during the execution of a node. Upon construction, this struct will + * check the `tmp_shared_object_idxs_` of the provided `ComputeGraph` instance + * if any shared objects are available; if not, then a new one is created. A + * tensor value is then added to the `ComputeGraph` instance with the requested + * specifications. Upon destruction, the shared object index of the temporary + * tensor is returned to `tmp_shared_object_idxs_`. + * + * Note that instances of this struct can be used as if they were `ValueRef` due + * to implementation of a custom casting operator. + * + * This class should only be used to create tensors whose lifetimes exist only + * in a well defined scope (i.e. within a function). + */ +struct TmpTensor { + ComputeGraph* graph_p; + int64_t sobj_idx; + ValueRef vref; + + // + // Match all available overloads of `add_tensor` + // + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype); + + // No copy construction or assignment + TmpTensor(TmpTensor& other) = delete; + TmpTensor& operator=(TmpTensor& other) = delete; + + // No move construction or assignment + TmpTensor(TmpTensor&& other) = delete; + TmpTensor& operator=(TmpTensor&& other) = delete; + + // Custom cast to ValueRef + operator ValueRef() const { + return vref; + }; + + ~TmpTensor(); + + private: + // Helper function to get first available shared object index or request a new + // one to be created. + int64_t get_sobj_idx(); +}; + // // ComputeGraph // @@ -93,7 +168,12 @@ class ComputeGraph final { vkapi::DescriptorPoolConfig execute_descriptor_counts_; std::unique_ptr context_; + std::vector shared_objects_; + // This stack is used by `TmpTensor` instances to recycle shared objects + // for temporary tensors. 
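A hypothetical sketch of how `TmpTensor` is meant to be used (the function name, `graph`, and the use of vkapi::kFloat are assumptions for illustration): the temporary's shared object index is pushed back onto the recycling stack when it goes out of scope, so a later `TmpTensor` can reuse the same memory.

```cpp
// Hypothetical op helper; not part of this change.
void add_example_node(ComputeGraph& graph, const std::vector<int64_t>& sizes) {
  TmpTensor scratch(&graph, sizes, vkapi::kFloat, utils::kTexture3D);
  // TmpTensor converts to ValueRef, so it can be passed wherever a tensor
  // value reference is expected, e.g. as the output of an intermediate
  // dispatch within this op.
  const ValueRef scratch_ref = scratch;
  (void)scratch_ref;
}
```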
See the comments of `TmpTensor` for more details + std::stack tmp_shared_object_idxs_; + std::vector values_; std::vector param_ubos_; @@ -154,6 +234,7 @@ class ComputeGraph final { GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(DoubleListPtr, double_list, DoubleList) GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(BoolListPtr, bool_list, BoolList) GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ValueListPtr, value_list, ValueList) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(SymIntPtr, symint, SymInt); #undef GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS @@ -180,18 +261,35 @@ class ComputeGraph final { return values_.at(idx).type(); } - // Get Tensor Property + // + // Tensor Properties Accessors + // std::vector sizes_of(const ValueRef idx) const; + /* + * Returns the size of the tensor at `idx` along the specified dimension. + * Negative indexing is allowed. + */ + template + T size_at(const int64_t dim, const ValueRef idx) const { + const Value& val = values_.at(idx); + if (val.isTensor()) { + return static_cast(utils::val_at(dim, val.toConstTensor().sizes())); + } else if (val.isTensorRef()) { + return static_cast(utils::val_at(dim, val.toConstTensorRef().sizes)); + } + VK_THROW("Could not get sizes of value with type ", val.type()); + } + vkapi::ScalarType dtype_of(const ValueRef idx) const; - inline utils::uvec3 image_extents_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().image_extents(); + inline const utils::ivec3& logical_limits_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().logical_limits(); } - inline int32_t texel_numel_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().texel_numel(); + inline int32_t numel_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().numel(); } inline utils::StorageType storage_type_of(const ValueRef idx) const { @@ -202,6 +300,13 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().has_buffer_storage(); } + inline bool val_is_view_of(const ValueRef maybe_view, const ValueRef base) + const { + return values_.at(maybe_view) + .toConstTensor() + .is_view_of(values_.at(base).toConstTensor()); + } + inline utils::GPUMemoryLayout memory_layout_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().gpu_memory_layout(); } @@ -214,19 +319,25 @@ class ComputeGraph final { return values_.at(idx).toTensor().sizes_ubo(); } - inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().texture_limits_ubo(); + inline vkapi::BufferBindInfo strides_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().strides_ubo(); } - inline vkapi::BufferBindInfo texel_strides_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().texel_strides_ubo(); + inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().numel_ubo(); } - inline vkapi::BufferBindInfo ntexels_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().ntexels_ubo(); + inline vkapi::BufferBindInfo axis_map_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().axis_map_ubo(); } + inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().logical_limits_ubo(); + } + + // // Scalar Value Extraction + // template T extract_scalar(const ValueRef idx) { @@ -347,6 +458,24 @@ class ComputeGraph final { const ValueRef vref, const utils::GPUMemoryLayout memory_layout); + /* + * Use the copy constructor of `api::vTensor` to create a "view" of the + * `vTensor` value at `vref`. 
See the copy constructor of `api::vTensor` for + * more details. + */ + ValueRef add_tensor_view(const ValueRef vref); + + /* + * Use the copy constructor of `api::vTensor` to create a "view" of the + * `vTensor` value at `vref` with different sizes and dim order. See the copy + * constructor of `api::vTensor` for more details. + */ + ValueRef add_tensor_view( + const ValueRef vref, + const std::vector& sizes, + const std::vector& dim_order, + const size_t offset_numel = 0); + /* * Add a `TensorRef` value to the graph with the specific properties. A * `TensorRef` is a reference to a `api::vTensor` whose data is stored in an @@ -378,15 +507,28 @@ class ComputeGraph final { ValueRef add_string(std::string&& str); + ValueRef add_symint(const int32_t val); + ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true); ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true); template - const vkapi::BufferBindInfo create_params_buffer(const Block& data) { + vkapi::BufferBindInfo create_params_buffer(const Block& data) { param_ubos_.emplace_back(api::ParamsBuffer(context_.get(), data)); return vkapi::BufferBindInfo(param_ubos_.back().buffer()); } + /* + * Given a ValueRef, do the following depending on the type of the Value: + * - If it is a SymInt, return the BufferBindInfo of the ParamsBuffer object + * backing the SymInt. + * - If it is a regular Int, create a new ParamsBuffer using the integer value + * and return the BufferBindInfo of the created ParamsBuffer. + */ + vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx); + + void set_symint(const ValueRef idx, const int32_t val); + /* * Convenience function to add an input tensor along with its staging buffer */ @@ -459,9 +601,7 @@ class ComputeGraph final { utils::uvec3 create_global_wg_size(const ValueRef idx); /* - * Suggest a local workgroup size for a given `api::vTensor` value, assuming - * that every shader invocation calculates one texel element of the output - * tensor. + * Suggest a local workgroup size for a given global workgroup size. * * The local workgroup size will be formed to try and minimize the number of * inactive invocations. @@ -469,6 +609,13 @@ class ComputeGraph final { * Currently, the local workgroup size is hard-coded to contain a total of 64 * shader invocations. In the future, this value can be configured. */ + utils::uvec3 create_local_wg_size(const utils::uvec3 global_wg_size); + + /* + * Convenience function to suggest a local workgroup size for a given + * `api::vTensor` value, assuming that every shader invocation calculates one + * texel element of the output tensor. + */ utils::uvec3 create_local_wg_size(const ValueRef idx); // @@ -500,6 +647,17 @@ class ComputeGraph final { void resize_input(const int64_t idx, const std::vector& new_sizes); void propagate_resize(); + // + // Miscellaneous Utilities + // + + /* + * Check whether the GPU supports 8 bit buffers. 
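+   *
+   * Editor's note (illustrative, not part of this change): ops that rely on
+   * int8 buffer storage are expected to guard on this, e.g.
+   *   if (!graph.int8_buffers_enabled()) { fall back to a texture kernel }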
+ */ + inline bool int8_buffers_enabled() const { + return context_->adapter_ptr()->has_full_int8_buffers_support(); + } + // // Debug support (implemented in Logging.cpp) // @@ -517,6 +675,9 @@ class ComputeGraph final { friend class DoubleListPtr; friend class BoolListPtr; friend class ValueListPtr; + friend class SymIntPtr; + + friend struct TmpTensor; }; template diff --git a/backends/vulkan/runtime/graph/Logging.cpp b/backends/vulkan/runtime/graph/Logging.cpp index 2e4833bfc6..e05fa4e487 100644 --- a/backends/vulkan/runtime/graph/Logging.cpp +++ b/backends/vulkan/runtime/graph/Logging.cpp @@ -71,8 +71,8 @@ void ComputeGraph::print_readable() { << std::setfill(' ') << std::endl; std::cout << std::setw(6) << "idx" << std::setw(10) << "type" << std::setw(20) - << "sizes" << std::setw(10) << "node_type" << std::setw(10) - << "so_idx" << std::endl; + << "sizes" << std::setw(10) << "node_type" << std::setw(15) + << "storage_bytes" << std::setw(10) << "so_idx" << std::endl; size_t value_idx = 0; for (Value& val : values_) { @@ -108,6 +108,16 @@ void ComputeGraph::print_readable() { } } + // Actual storage bytes used + std::cout << std::setw(15); + if (val.isTensor()) { + const api::vTensor& v_tensor = val.toTensor(); + auto memory_reqs = v_tensor.get_memory_requirements(); + std::cout << memory_reqs.size; + } else { + std::cout << ""; + } + std::cout << std::setw(10); if (value_ref_to_shared_object_idx.count(value_idx) > 0) { size_t shared_obj_idx = value_ref_to_shared_object_idx.at(value_idx); diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.cpp b/backends/vulkan/runtime/graph/containers/SharedObject.cpp index 0d8b77a5b7..f2474da667 100644 --- a/backends/vulkan/runtime/graph/containers/SharedObject.cpp +++ b/backends/vulkan/runtime/graph/containers/SharedObject.cpp @@ -15,10 +15,7 @@ namespace vkcompute { void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { vTensorPtr t = graph->get_tensor(idx); - // // Aggregate Memory Requirements - // - const VkMemoryRequirements mem_reqs = t->get_memory_requirements(); aggregate_memory_requirements.size = std::max(mem_reqs.size, aggregate_memory_requirements.size); @@ -26,27 +23,6 @@ void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { std::max(mem_reqs.alignment, aggregate_memory_requirements.alignment); aggregate_memory_requirements.memoryTypeBits |= mem_reqs.memoryTypeBits; - // - // Aggregate Allocation Create Info - // - - const VmaAllocationCreateInfo create_info = t->get_allocation_create_info(); - // Clear out CREATE_STRATEGY bit flags in case of conflict - VmaAllocationCreateFlags clear_mask = ~VMA_ALLOCATION_CREATE_STRATEGY_MASK; - VmaAllocationCreateFlags create_flags = create_info.flags & clear_mask; - // Use the default allocation strategy - aggregate_create_info.flags = - create_flags | vkapi::DEFAULT_ALLOCATION_STRATEGY; - - // Set the usage flag if it is currently not set - if (aggregate_create_info.usage == VMA_MEMORY_USAGE_UNKNOWN) { - aggregate_create_info.usage = create_info.usage; - } - // Otherwise check that there is no conflict regarding usage - VK_CHECK_COND(aggregate_create_info.usage == create_info.usage); - aggregate_create_info.requiredFlags |= create_info.requiredFlags; - aggregate_create_info.preferredFlags |= create_info.preferredFlags; - users.emplace_back(idx); } @@ -54,8 +30,12 @@ void SharedObject::allocate(ComputeGraph* const graph) { if (aggregate_memory_requirements.size == 0) { return; } + + VmaAllocationCreateInfo alloc_create_info = + 
graph->context()->adapter_ptr()->vma().gpuonly_resource_create_info(); + allocation = graph->context()->adapter_ptr()->vma().create_allocation( - aggregate_memory_requirements, aggregate_create_info); + aggregate_memory_requirements, alloc_create_info); } void SharedObject::bind_users(ComputeGraph* const graph) { diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.h b/backends/vulkan/runtime/graph/containers/SharedObject.h index 37e80257f4..bd77f6f39b 100644 --- a/backends/vulkan/runtime/graph/containers/SharedObject.h +++ b/backends/vulkan/runtime/graph/containers/SharedObject.h @@ -28,7 +28,6 @@ struct SharedObject { explicit SharedObject() = default; VkMemoryRequirements aggregate_memory_requirements; - VmaAllocationCreateInfo aggregate_create_info; std::vector users; vkapi::Allocation allocation; diff --git a/backends/vulkan/runtime/graph/containers/SymInt.cpp b/backends/vulkan/runtime/graph/containers/SymInt.cpp new file mode 100644 index 0000000000..c91db84b78 --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/SymInt.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { + +SymInt::SymInt(api::Context* context_p, const int32_t val) + : gpu_buffer(context_p, val){}; + +void SymInt::set(const int32_t val) { + gpu_buffer.update(val); +} + +void SymInt::operator=(const int32_t val) { + gpu_buffer.update(val); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SymInt.h b/backends/vulkan/runtime/graph/containers/SymInt.h new file mode 100644 index 0000000000..0c9fbee5fe --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/SymInt.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace vkcompute { + +/* + * Represents a symbolic integer whose value can be variable. It is implemented + * as a thin wrapper around a `ParamsBuffer` object that holds the value of the + * integer. The `ParamsBuffer` object allows the value of the symbolic integer + * to be changed from the CPU and have those changes be visible to all shaders + * that use the symbolic integer; it also allows the value of the symbolic + * integer to be the result of a compute shader. + * + * Regular scalar types represented by `TypeTag::INT` cannot be used for + * symbolic integers because their value is assumed to be constant; therefore + * the `Value` instance holding the value of the scalar does not contain + * any reference to the GPU buffers used to pass its value into compute shaders. + * Therefore, updating the value of the scalar does not impact the value seen + * by compute shaders. 
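+ *
+ * Editor's note, a minimal usage sketch (names match this header; the snippet
+ * itself is not part of the diff):
+ *
+ *   SymInt seq_len(context_p, 0); // backed by a ParamsBuffer on the GPU
+ *   seq_len.set(128);             // CPU-side update, visible to shaders
+ *   seq_len = 256;                // equivalent update via operator=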
+ */ +struct SymInt final { + api::ParamsBuffer gpu_buffer; + + explicit SymInt(api::Context* context_p, const int32_t val); + + void set(const int32_t val); + + void operator=(const int32_t val); +}; + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Types.cpp b/backends/vulkan/runtime/graph/containers/Types.cpp index c5ffc65add..e7a8951a55 100644 --- a/backends/vulkan/runtime/graph/containers/Types.cpp +++ b/backends/vulkan/runtime/graph/containers/Types.cpp @@ -29,6 +29,7 @@ std::ostream& operator<<(std::ostream& out, const TypeTag& tag) { PRINT_CASE(BOOLLIST) PRINT_CASE(VALUELIST) PRINT_CASE(STRING) + PRINT_CASE(SYMINT) } return out; } diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h index 79edbd50d3..5840d1695e 100644 --- a/backends/vulkan/runtime/graph/containers/Types.h +++ b/backends/vulkan/runtime/graph/containers/Types.h @@ -36,6 +36,7 @@ enum class TypeTag : uint32_t { // Special Type VALUELIST, STRING, + SYMINT, }; std::ostream& operator<<(std::ostream& out, const TypeTag& tag); diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index ba82213c6f..8773f0c0b0 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -13,6 +13,7 @@ #include #include +#include #include namespace vkcompute { @@ -28,6 +29,11 @@ inline bool is_valid(ValueRef value_ref) { struct IOValueRef { ValueRef value; ValueRef staging; + + // Custom cast to ValueRef + operator ValueRef() const { + return value; + }; }; /* @@ -53,7 +59,7 @@ struct Value final { } u; api::vTensor as_tensor; - api::StorageBuffer as_staging; + api::StagingBuffer as_staging; TensorRef as_tensorref; std::vector as_int_list; @@ -67,6 +73,8 @@ struct Value final { std::string as_string; + SymInt as_symint; + Payload() : u() {} // NOLINTNEXTLINE ~Payload(){}; @@ -108,7 +116,7 @@ struct Value final { CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSOR, api::vTensor, as_tensor, vTensor); CASE_MOVE_MOVEABLE_TYPE( - TypeTag::STAGING, api::StorageBuffer, as_staging, StorageBuffer); + TypeTag::STAGING, api::StagingBuffer, as_staging, StagingBuffer); CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); // Scalar lists @@ -123,6 +131,7 @@ struct Value final { TypeTag::VALUELIST, std::vector, as_value_list, vector); CASE_MOVE_MOVEABLE_TYPE( TypeTag::STRING, std::string, as_string, basic_string); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::SYMINT, SymInt, as_symint, SymInt); case TypeTag::NONE: clearToNone(); @@ -152,7 +161,7 @@ struct Value final { payload.as_tensor.~vTensor(); break; case TypeTag::STAGING: - payload.as_staging.~StorageBuffer(); + payload.as_staging.~StagingBuffer(); break; case TypeTag::TENSORREF: payload.as_tensorref.~TensorRef(); @@ -172,6 +181,9 @@ struct Value final { case TypeTag::STRING: payload.as_string.~basic_string(); break; + case TypeTag::SYMINT: + payload.as_symint.~SymInt(); + break; // Manually list out the types so that if a type here is added later and // not handled the compiler can catch it. 
case TypeTag::NONE: @@ -247,7 +259,7 @@ struct Value final { as_tensor); SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - api::StorageBuffer, + api::StagingBuffer, Staging, TypeTag::STAGING, as_staging); @@ -288,6 +300,8 @@ struct Value final { TypeTag::STRING, as_string); + SUPPORT_TRIVIALLY_MOVEABLE_TYPE(SymInt, SymInt, TypeTag::SYMINT, as_symint); + #undef SUPPORT_TRIVIALLY_COPYABLE_TYPE #undef SUPPORT_TRIVIALLY_MOVEABLE_TYPE diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp index 3b2a826f87..2cb00ba65a 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -35,7 +35,22 @@ ExecuteNode::ExecuteNode( graph.update_descriptor_counts(shader, /*execute = */ true); } +ExecuteNode::ExecuteNode( + const ResizeFunction& resize_fn, + const std::vector& resize_args) + : shader_(), + global_workgroup_size_({0u, 0u, 0u}), + local_workgroup_size_({0u, 0u, 0u}), + args_(), + params_(), + spec_vars_(), + resize_fn_(resize_fn), + resize_args_(resize_args) {} + void ExecuteNode::encode(ComputeGraph* graph) { + if (!shader_) { + return; + } api::Context* const context = graph->context(); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 1fff14e020..dece9ddb50 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -48,7 +48,7 @@ class ExecuteNode final { const std::vector&, const std::vector&)>; - ExecuteNode( + explicit ExecuteNode( ComputeGraph& graph, const vkapi::ShaderInfo& shader, const utils::uvec3& global_workgroup_size, @@ -59,6 +59,15 @@ class ExecuteNode final { const ResizeFunction& resize_fn = nullptr, const std::vector& resize_args = {}); + /* + * This overload of the ExecuteNode constructor is used to register ops which + * update a tensor view. No shader is dispatched, but the node still needs to + * update the view's sizes and strides after a resize. + */ + explicit ExecuteNode( + const ResizeFunction& resize_fn = nullptr, + const std::vector& resize_args = {}); + ~ExecuteNode() = default; void encode(ComputeGraph* graph); @@ -83,6 +92,11 @@ class ExecuteNode final { const vkapi::SpecVarList spec_vars_; const ResizeFunction resize_fn_; const std::vector resize_args_; + + public: + operator bool() const { + return shader_; + } }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index b77c62920d..61b24cd409 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -45,24 +45,23 @@ PrepackNode::PrepackNode( graph.update_descriptor_counts(noop_shader_, /*execute = */ false); } -api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { +api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { vTensorPtr packed = graph->get_tensor(packed_); // If no TensorRef is provided, create a staging buffer of zeros according to // the vkapi::vTensor metadata. 
if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); - api::StorageBuffer staging(graph->context(), packed->dtype(), numel); - size_t nbytes = numel * vkapi::element_size(packed->dtype()); - set_staging_zeros(staging, nbytes); + api::StagingBuffer staging(graph->context(), packed->dtype(), numel); + staging.set_staging_zeros(); return staging; } TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); - api::StorageBuffer staging(graph->context(), tref->dtype, numel); + api::StagingBuffer staging(graph->context(), tref->dtype, numel); size_t nbytes = numel * vkapi::element_size(tref->dtype); - copy_ptr_to_staging(tref->data, staging, nbytes); + staging.copy_from(tref->data, nbytes); return staging; } @@ -70,7 +69,7 @@ void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); vTensorPtr packed = graph->get_tensor(packed_); - api::StorageBuffer staging = create_staging_buffer(graph); + api::StagingBuffer staging = create_staging_buffer(graph); std::unique_lock cmd_lock = context->dispatch_lock(); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index c3ac8b963f..3e713303c3 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -56,7 +56,7 @@ class PrepackNode final { const vkapi::SpecVarList spec_vars_; private: - api::StorageBuffer create_staging_buffer(ComputeGraph* graph); + api::StagingBuffer create_staging_buffer(ComputeGraph* graph); }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/glsl/activations.h b/backends/vulkan/runtime/graph/ops/glsl/activations.h new file mode 100644 index 0000000000..94c9e1274d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/activations.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +float hardswish(float x) { + if (x <= -3) { + return 0; + } else if (x >= 3) { + return x; + } else { + return x * (x + 3) / 6; + } +} + +vec4 hardswish(vec4 tex) { + return vec4( + hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.w)); +} + +float hardshrink(float x, float lambda, float neg_lambda) { + return x * (float(x > lambda) + float(x < neg_lambda)); +} + +vec4 hardshrink(vec4 tex, float lambda, float neg_lambda) { + return tex * + (vec4(greaterThan(tex, vec4(lambda))) + + vec4(lessThan(tex, vec4(neg_lambda)))); +} + +float hardsigmoid(float x) { + return mix(float(x >= 0.0), x / 6 + 0.5, float(abs(x) <= 3.0)); +} + +vec4 hardsigmoid(vec4 tex) { + return vec4( + hardsigmoid(tex.x), + hardsigmoid(tex.y), + hardsigmoid(tex.z), + hardsigmoid(tex.w)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl deleted file mode 100644 index dbc87eb794..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { - ivec4 in_sizes; -}; - -layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes { - ivec3 self_sizes; -}; - -layout(set = 0, binding = 7) uniform PRECISION restrict AddmmParams { - float alpha; - float beta; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - vec4 texel = vec4(0); - - $if MAT1_PACKING == "W_packed": - $if MAT2_PACKING == "H_packed": - ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z); - texel = matmul_naive_W_packed_H_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $elif MAT2_PACKING == "W_packed": - texel = matmul_naive_W_packed_W_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $else: - $raise Exception("Unsupported value for MAT2_PACKING") - $else: - $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING") - - vec4 self_texel = get_texel_W_packed( - im_self, - pos, - self_sizes.x == 1, - self_sizes.y == 1); - - texel = beta * self_texel + alpha * texel; - imageStore(im_out, pos, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml deleted file mode 100644 index 48db85cb56..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -addmm_naive: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - MAT1_PACKING: W_packed - MAT2_PACKING: H_packed - MAT2_IS_TRANSPOSED: false - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: addmm_naive_W_packed_H_packed - - NAME: addmm_naive_W_packed_W_packed - MAT2_PACKING: W_packed - - NAME: linear_naive_W_packed_W_packed - MAT2_PACKING: W_packed - MAT2_IS_TRANSPOSED: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl new file mode 100644 index 0000000000..2104f7d796 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl @@ -0,0 +1,174 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +$if MAT2_IS_TRANSPOSED: + #define MAT2_IS_TRANSPOSED + +$if HAS_BIAS: + #define HAS_BIAS + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_map")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_map")} + ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int out_packed_dim = C_DIM; +layout(constant_id = 4) const int mat1_packed_dim = W_DIM; +layout(constant_id = 5) const int mat2_packed_dim = H_DIM; +layout(constant_id = 6) const int bias_packed_dim = W_DIM; + +#ifdef HAS_BIAS +vec4 get_bias_texel_W_packed(ivec3 logical_pos) { + ivec3 bias_pos = ivec3(0); + if (bias_sizes.y == 1) { + bias_pos[bias_axis_map.y] = 0; + } else { + bias_pos[bias_axis_map.y] = logical_pos.y; + } + if (bias_sizes.x == 1) { + bias_pos[bias_axis_map.x] = 0; + vec4 bias_texel = texelFetch(bias_tensor, bias_pos, 0); + // Only the first value is valid, the rest is 0 padding + return vec4(bias_texel.x); + } else { + bias_pos[bias_axis_map.x] = logical_pos.x; + } + + return texelFetch(bias_tensor, bias_pos, 0); +} +#endif // HAS_BIAS + +vec4 matmul_naive_k_dim_packed(const ivec3 out_lpos) { + ivec3 mat1_pos; + mat1_pos[mat1_axis_map.x] = 0; + mat1_pos[mat1_axis_map.y] = out_lpos.y; + mat1_pos[mat1_axis_map.z] = out_lpos.z; +#ifdef MAT2_IS_TRANSPOSED + const int mat2_k_axis = mat2_axis_map.x; + const int mat2_row_axis = mat2_axis_map.y; +#else + const int mat2_k_axis = mat2_axis_map.y; + const int mat2_row_axis = mat2_axis_map.x; +#endif // MAT2_IS_TRANSPOSED + + vec4 texel = vec4(0); + const int K = divup4(mat1_sizes.x); + + for (int i = 0; i < K; ++i) { + const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); + + vec4 sums; + for (int r = 0; r < 4; ++r) { + // On-demand construction of mat2_pos appears to provide the lowest + // latency. Surprisingly, this doesn't translate to mat1_pos. 
+ ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_k_axis] = i; + mat2_pos[mat2_row_axis] = out_lpos.x * 4 + r; +#ifndef MAT2_IS_TRANSPOSED + mat2_pos[mat2_axis_map.z] = out_lpos.z; +#endif // MAT2_IS_TRANSPOSED + sums[r] = dot(mat1_tex, texelFetch(mat2_tensor, mat2_pos, 0)); + } + + texel += sums; + + mat1_pos[mat1_axis_map.x]++; + } + + return texel; +} + +vec4 matmul_naive_k_dim_packed_row_dim_packed(const ivec3 out_lpos) { + ivec3 mat1_pos; + mat1_pos[mat1_axis_map.x] = 0; + mat1_pos[mat1_axis_map.y] = out_lpos.y; + mat1_pos[mat1_axis_map.z] = out_lpos.z; + + ivec3 mat2_pos; + mat2_pos[mat2_axis_map.x] = out_lpos.x; + mat2_pos[mat2_axis_map.y] = 0; + mat2_pos[mat2_axis_map.z] = out_lpos.z; + + ivec3 mat2_pos_offset = ivec3(0); + mat2_pos_offset[mat2_axis_map.y] = 1; + + const int mat2_y_axis = mat2_axis_map.y; + + vec4 texel = vec4(0); + const int K = divup4(mat1_sizes.x); + + for (int i = 0; + i < K; + ++i, mat1_pos[mat1_axis_map.x]++, mat2_pos[mat2_axis_map.y]+=4) { + const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); + + for (int r = 0; r < 4; ++r) { + // On-demand construction of mat2_pos appears to provide the lowest + // latency. Surprisingly, this doesn't translate to mat1_pos. + ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_axis_map.x] = out_lpos.x; + mat2_pos[mat2_axis_map.y] = 4 * i + r; + mat2_pos[mat2_axis_map.z] = out_lpos.z; + + vec4 mat1_comp_vec = vec4(mat1_tex[r]); + texel = fma(mat1_comp_vec, texelFetch(mat2_tensor, mat2_pos, 0), texel); + } + } + + return texel; +} + +void main() { + const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(out_lpos, out_limits))) { + return; + } + + vec4 texel = vec4(0); + +#ifdef MAT2_IS_TRANSPOSED + if (mat2_packed_dim == W_DIM) { + texel = matmul_naive_k_dim_packed(out_lpos); + } else { + texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); + } +#else + if (mat2_packed_dim == W_DIM) { + texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); + } else { + texel = matmul_naive_k_dim_packed(out_lpos); + } +#endif // MAT2_IS_TRANSPOSED + +#ifdef HAS_BIAS + vec4 bias_texel = get_bias_texel_W_packed(out_lpos); + texel = beta * bias_texel + alpha * texel; +#endif // HAS_BIAS + + imageStore(out_tensor, lpos_to_pos(out_lpos, out_axis_map), texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml new file mode 100644 index 0000000000..33b617eed1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml @@ -0,0 +1,24 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
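+# Editor's note (not part of this change): each shader_variants entry below is
+# assumed to expand once per DTYPE, e.g. addmm_naive_texture3d_float and
+# addmm_naive_texture3d_half.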
+ +addmm_naive_texture3d: + parameter_names_with_default_values: + DTYPE: float + MAT2_IS_TRANSPOSED: false + HAS_BIAS: true + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: addmm_naive_texture3d + - NAME: matmul_naive_texture3d + HAS_BIAS: false + - NAME: linear_naive_texture3d + MAT2_IS_TRANSPOSED: true + - NAME: matmul_transposed_naive_texture3d + MAT2_IS_TRANSPOSED: true + HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl index 1698efb0b1..ad794d6db4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl @@ -16,90 +16,219 @@ $if MAT2_IS_TRANSPOSED: $if BATCH_MODE: #define BATCH_MODE -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 +$if HAS_BIAS: + #define HAS_BIAS #include "indexing_utils.h" -#include "matmul.h" -// addmm will have additional arguments compared to regular mm -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self; +${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_map")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_map")} + ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(set = 0, binding = 5) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; +layout(constant_id = 3) const int out_packed_dim = C_DIM; -layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes { - ivec4 self_sizes; -}; +// To convince the SPIR-V compiler to unroll the loops optimally, need this +// macro +#define FOUR 4 -layout(set = 0, binding = 7) uniform PRECISION restrict InLimits { - ivec3 in_limits; +#define TILE_ROWS ${TILE_ROWS} + +// we avoid mat4 and vec4 usage here as they compile to much less efficient +// SPIR-V +struct FloatMatrix_2d { + float data[TILE_ROWS][FOUR]; }; -layout(set = 0, binding = 8) uniform PRECISION restrict Params { - float alpha; - float beta; +struct FloatMatrix_3d { + float data[TILE_ROWS][FOUR][FOUR]; }; -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +#ifdef BATCH_MODE + #define FloatMatrix FloatMatrix_3d +#else + #define FloatMatrix FloatMatrix_2d +#endif // BATCH_MODE + +#ifdef HAS_BIAS +// get texel from self tensor (channel_packed) in addmm +vec4 get_texel_C_packed(const ivec2 idx) { + ivec3 bias_pos = ivec3(0); + if (bias_sizes.x > 1) { + bias_pos[bias_axis_map.x] = idx.x; + } + if (bias_sizes.y > 1) { + 
bias_pos[bias_axis_map.y] = idx.y; + } -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + return texelFetch(bias_tensor, bias_pos, 0); +} +#endif // HAS_BIAS + +FloatMatrix matmul_partial(const ivec4 out_idx_tl) { + FloatMatrix results; + for (int i = 0; i < TILE_ROWS; i++) { + for (int j = 0; j < FOUR; j++) { +#ifdef BATCH_MODE + for (int k = 0; k < FOUR; k++) { + results.data[i][j][k] = 0.0f; + } +#else + results.data[i][j] = 0.0f; +#endif // BATCH_MODE + } + } + vec4 mat1_tensor_partial_load[TILE_ROWS]; + vec4 mat2_tensor_partial_load[FOUR]; + +#ifdef MAT2_IS_TRANSPOSED + const int mat2_k_axis = mat2_axis_map.x; + const int mat2_row_axis = mat2_axis_map.y; +#else + const int mat2_k_axis = mat2_axis_map.y; + const int mat2_row_axis = mat2_axis_map.x; +#endif // MAT2_IS_TRANSPOSED + +#ifdef BATCH_MODE + for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { + if (out_idx_tl.z + batch_idx >= out_sizes.z) { + break; + } +#endif // BATCH_MODE + for (int k = 0; k < mat1_sizes.x; k+=4) { + const int k_div4 = k >> 2; + // read and cache (4 x TILE_ROWS) tile of mat1 + for (int r = 0; r < TILE_ROWS; r++) { + ivec3 mat1_pos = ivec3(0); + mat1_pos[mat1_axis_map.x] = k_div4; + mat1_pos[mat1_axis_map.y] = out_idx_tl.y + r; +#ifdef BATCH_MODE + mat1_pos[mat1_axis_map.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat1_tensor_partial_load[r] = texelFetch(mat1_tensor, mat1_pos, 0); + } - if (any(greaterThanEqual(pos, out_limits))) { - return; + // read and cache (4 x 4) tile of mat2 + for (int r = 0; r < FOUR; ++r) { + ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_k_axis] = k_div4; + mat2_pos[mat2_row_axis] = out_idx_tl.x + r; +#if defined(BATCH_MODE) && !defined(MAT2_IS_TRANSPOSED) + mat2_pos[mat2_axis_map.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat2_tensor_partial_load[r] = texelFetch(mat2_tensor, mat2_pos, 0); + } + + // perform partial dot products and add partial result to results + for (int out_row = 0; out_row < TILE_ROWS; out_row++) { + for (int out_col = 0; out_col < FOUR; out_col++) { +#ifdef BATCH_MODE + results.data[out_row][out_col][batch_idx] += +#else + results.data[out_row][out_col] += +#endif // BATCH_MODE + dot(mat1_tensor_partial_load[out_row], mat2_tensor_partial_load[out_col]); + } + } } +#ifdef BATCH_MODE + } +#endif // BATCH_MODE + + return results; +} - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - vec4 self_texel = get_texel_C_packed( - im_self, - out_pos, - self_sizes.x == 1, - self_sizes.y == 1); - - // results is in transposed order w.r.t. 
the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r][0], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][1], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][2], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); +// +// Write result matrix to output (3D matmul) +// + +void write_results_C_packed(const ivec4 out_idx_tl, FloatMatrix results) { + ivec3 out_pos = tidx_to_pos( + out_idx_tl, out_sizes, out_axis_map, out_packed_dim); + + for (int tile_c = 0; + tile_c < TILE_ROWS; + tile_c++, out_pos[out_axis_map.y]++) { + out_pos[out_axis_map.x] = out_idx_tl.x; + + for (int tile_r = 0; + tile_r < FOUR; + tile_r++, out_pos[out_axis_map.x]++) { + +#ifdef HAS_BIAS + ivec2 bias_idx; + bias_idx[bias_axis_map.x] = out_pos[out_axis_map.x]; + bias_idx[bias_axis_map.y] = out_pos[out_axis_map.y]; + float bias_val = get_texel_C_packed(bias_idx).x; +#ifdef BATCH_MODE + vec4 bias_texel = vec4(bias_val); +#else + vec4 bias_texel = vec4(bias_val, 0, 0, 0); +#endif // BATCH_MODE +#endif // HAS_BIAS + +#ifdef BATCH_MODE + vec4 out_texel = vec4( + results.data[tile_c][tile_r][0], + results.data[tile_c][tile_r][1], + results.data[tile_c][tile_r][2], + results.data[tile_c][tile_r][3]); +#else + vec4 out_texel = vec4( + results.data[tile_c][tile_r], + 0.0, + 0.0, + 0.0); +#endif // BATCH_MODE + +#ifdef HAS_BIAS + imageStore(out_tensor, out_pos, beta * bias_texel + alpha * out_texel); +#else + imageStore(out_tensor, out_pos, out_texel); +#endif // HAS_BIAS } } } + +void main() { + // Each thread is responsible for calculating a (4 x TILE_ROWS x 1) tile of + // output elements. If the input matrices are 3D, then a (4 x TILE_ROWS x 4) + // tile of output elements will be computed. Note the sizes are written in + // (W x H x C) format. 
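+  // Editor's note (illustrative): with TILE_ROWS = 4 and BATCH_MODE off, the
+  // thread with tile_idx = (1, 2, 0) computes the output tile covering
+  // columns 4..7 and rows 8..11 at channel index 0.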
+ const ivec3 tile_idx = ivec3(gl_GlobalInvocationID); + + // Calculate the tensor index of the top left element in the output tile + const ivec4 out_idx_topleft = ivec4( + tile_idx.x * 4, + tile_idx.y * TILE_ROWS, +#ifdef BATCH_MODE + tile_idx.z * 4, +#else + tile_idx.z, +#endif // BATCH_MODE + 0); + + // If the top left element is already out of range, then skip + if (any(greaterThanEqual(out_idx_topleft, out_sizes))) { + return; + } + + FloatMatrix results = matmul_partial(out_idx_topleft); + + write_results_C_packed(out_idx_topleft, results); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml index b958d3b954..c82c2003d2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml @@ -7,24 +7,37 @@ addmm_optimized: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - PACKING: C_packed MAT2_IS_TRANSPOSED: false BATCH_MODE: false - TILE_ROW: tile_row_4 + TILE_ROWS: 4 + HAS_BIAS: true generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 + TILE_ROWS: + - VALUE: 4 + SUFFIX: tile_row_4 + - VALUE: 2 + SUFFIX: tile_row_2 DTYPE: - VALUE: float - VALUE: half shader_variants: - NAME: addmm_optimized + - NAME: matmul_optimized + HAS_BIAS: false - NAME: linear_optimized MAT2_IS_TRANSPOSED: true + - NAME: matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + HAS_BIAS: false - NAME: batch_addmm_optimized BATCH_MODE: true + - NAME: batch_matmul_optimized + BATCH_MODE: true + HAS_BIAS: false - NAME: batch_linear_optimized MAT2_IS_TRANSPOSED: true BATCH_MODE: true + - NAME: batch_matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + BATCH_MODE: true + HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index ec7e1da296..bf68ea2d9a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -19,38 +19,43 @@ layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_other", DTYPE, STORAGE)} -${layout_declare_ubo(3, "ivec4", "out_sizes")} -${layout_declare_ubo(4, "ivec4", "in_sizes")} -${layout_declare_ubo(5, "ivec4", "other_sizes")} -${layout_declare_ubo(6, "ivec2", "broadcast_params")} -${layout_declare_ubo(7, "float", "alpha")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_sizes")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "other_sizes")} +${layout_declare_ubo(B, "ivec4", "other_axis_map")} +${layout_declare_ubo(B, "ivec2", "broadcast_params")} +${layout_declare_ubo(B, "float", "alpha")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, packed_dim); - if (any(greaterThanEqual(idx, out_sizes))) { + if 
(any(greaterThanEqual(tidx, out_sizes))) { return; } - ivec4 in_idx = broadcast_indices(idx, in_sizes); - VEC4_T in_texel = VEC4_T(texelFetch( + // broadcast on logical sizes + ivec4 in_idx = broadcast_indices(tidx, in_sizes); + VEC4_T in_texel = VEC4_T(load_texel( t_in, - to_texture_pos(in_idx, in_sizes, packed_dim), - 0)); + // read axis mapped texel + tidx_to_pos(in_idx, in_sizes, in_axis_map, packed_dim))); - ivec4 other_idx = broadcast_indices(idx, other_sizes); - VEC4_T other_texel = VEC4_T(texelFetch( + // broadcast on logical sizes + ivec4 other_idx = broadcast_indices(tidx, other_sizes); + VEC4_T other_texel = VEC4_T(load_texel( t_other, - to_texture_pos(other_idx, other_sizes, packed_dim), - 0)); + // read axis mapped texel + tidx_to_pos(other_idx, other_sizes, other_axis_map, packed_dim))); // Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment. if (broadcast_params.x > 0) { @@ -60,5 +65,7 @@ void main() { other_texel = other_texel.xxxx; } - imageStore(t_out, pos, VEC4_T(op(in_texel, other_texel, alpha))); + imageStore(t_out, + tidx_to_pos(tidx, out_sizes, out_axis_map, packed_dim), + VEC4_T(op(in_texel, other_texel, alpha))); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl new file mode 100644 index 0000000000..9d4b18f0d1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl @@ -0,0 +1,23 @@ +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "out_buf", DTYPE, STORAGE)} +${layout_declare_tensor(1, "r", "in_buf", DTYPE, STORAGE)} +${layout_declare_ubo(2, "int", "numel")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + int tid = int(gl_GlobalInvocationID.x); + if (tid >= numel) { + return; + } + out_buf[tid] = in_buf[tid]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml new file mode 100644 index 0000000000..8ea4cbe561 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +buffer_to_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + - VALUE: int8 + shader_variants: + - NAME: buffer_to_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl new file mode 100644 index 0000000000..201b4d1726 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl @@ -0,0 +1,35 @@ +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +#include "indexing_utils.h" + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "nchw_buf", DTYPE, STORAGE)} +${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_ubo(2, "ivec4", "in_sizes")} +${layout_declare_ubo(3, "ivec4", "in_strides")} +${layout_declare_ubo(4, "int", "numel")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// This constant is unused in this shader but is kept so that the signature is +// consistent with image_to_nchw. +layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; + +void main() { + int nchwi = int(gl_GlobalInvocationID.x); + if (nchwi >= numel) { + return; + } + + ivec4 in_tidx = nchwi_to_tidx(nchwi, in_sizes); + const int in_bufi = tidx_to_bufi(in_tidx, in_strides); + + nchw_buf[nchwi] = t_in[in_bufi]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml new file mode 100644 index 0000000000..653bda9ccc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +buffer_to_nchw: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + - VALUE: int8 + shader_variants: + - NAME: buffer_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl index 18202e4a51..49ce76423d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl index 493a614ee8..4e8bff9494 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl index d2978ffe7e..df8589e737 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl index 3adffe99bd..1a3fef2b31 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl @@ -16,34 +16,36 @@ layout(std430) buffer; #include "indexing_utils.h" -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", "int", STORAGE)} -${layout_declare_tensor(2, "r", "t_weight", DTYPE, STORAGE)} -${layout_declare_ubo(3, "ivec4", "sizes")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", "int", STORAGE)} +${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "weight_axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - if (pos_out_of_bounds(out_pos, sizes, packed_dim)) { + const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); + const ivec4 out_tidx = lpos_to_tidx(out_lpos, sizes, out_axis_map.w, packed_dim); + if (any(greaterThanEqual(out_tidx, sizes))) { return; 
} - - const ivec4 out_idx = to_tensor_idx(out_pos, sizes, packed_dim); VEC4_T out_texel; // Consider optimizing via W-packing format for t_in and t_weight. for (int i = 0; i < 4; ++i) { // Read input tensor for embedding index. - const ivec3 in_pos = ivec3(out_pos.y, out_idx.z * 4 + i, out_idx.w / 4); - const int in_texel_elem = texelFetch(t_in, in_pos, 0)[out_idx.w % 4]; + const ivec3 in_pos = lpos_to_pos(ivec3(out_tidx.y, out_tidx.z * 4 + i, out_tidx.w / 4), in_axis_map); + const int in_texel_elem = load_texel(t_in, in_pos)[out_tidx.w % 4]; // Read weight tensor for embedding. - out_texel[i] = texelFetch(t_weight, ivec3(out_pos.x, in_texel_elem, 0), 0).x; + const ivec3 weight_pos = lpos_to_pos(ivec3(out_tidx.x, in_texel_elem, 0), weight_axis_map); + out_texel[i] = load_texel(t_weight, weight_pos).x; } - imageStore(t_out, out_pos, out_texel); + imageStore(t_out, lpos_to_pos(out_lpos, out_axis_map), out_texel); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl new file mode 100644 index 0000000000..93a2c53e01 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl @@ -0,0 +1,38 @@ +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_ubo(1, "ivec4", "in_sizes")} +${layout_declare_ubo(2, "ivec4", "out_sizes")} +${layout_declare_ubo(3, "int", "stride", "float", "offset")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); + + if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { + return; + } + int width = in_sizes.x; + VEC4_T outtex; + if (pos.x == 0) { + float value = (pos.y % width + offset) * stride; + outtex = VEC4_T(value, 0, 0, 0); + } else if (pos.x == 1) { + float value = (pos.y / width + offset) * stride; + outtex = VEC4_T(value, 0, 0, 0); + } + + imageStore(t_out, pos, outtex); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml new file mode 100644 index 0000000000..654edca610 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml @@ -0,0 +1,12 @@ +grid_priors: + parameter_names_with_default_values: + NDIM: 3 + DTYPE: float + PACKING: C_packed + STORAGE: texture3d + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: grid_priors diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl new file mode 100644 index 0000000000..be3901799f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -0,0 +1,63 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define BUF_T ${buffer_scalar_type(DTYPE)} +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} + +${define_active_storage_type(STORAGE)} + +#include "indexing_utils.h" + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_buffer(B, "w", "nchw_out", DTYPE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { + const ivec4 buf_indices = tidx_to_nchwi( + tensor_idx, + sizes, + packed_dim); + + if (tensor_idx[packed_dim] < sizes[packed_dim]) { + nchw_out[buf_indices.x] = BUF_T(texel.x); + } + if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) { + nchw_out[buf_indices.y] = BUF_T(texel.y); + } + if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) { + nchw_out[buf_indices.z] = BUF_T(texel.z); + } + if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) { + nchw_out[buf_indices.w] = BUF_T(texel.w); + } +} + +void main() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); + + if (any(greaterThanEqual(tidx, sizes))) { + return; + } + + const VEC4_T intex = load_texel(t_in, lpos_to_pos(lpos, axis_map)); + write_out_texel(intex, tidx); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml new file mode 100644 index 0000000000..0898e75110 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +image_to_nchw: + parameter_names_with_default_values: + DTYPE: float + STORAGE: texture3d + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + - VALUE: int8 + STORAGE: + - VALUE: texture3d + - VALUE: texture2d + shader_variants: + - NAME: image_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl index ba60000f3d..76ec540838 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl @@ -34,18 +34,18 @@ void main() { } const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim); - const ivec4 buffer_ixs = get_texel_nchw_buffer_ixs(idx, out_sizes, packed_dim); + const ivec4 buffer_ixs = tidx_to_nchwi(idx, out_sizes, packed_dim); VEC4_T out_texel; for (int i = 0; i < 4; ++i) { - const ivec4 out_idx = from_nchw_buffer_i(buffer_ixs[i], out_sizes); - int out_channel = out_idx.z; + const ivec4 out_tidx = nchwi_to_tidx(buffer_ixs[i], out_sizes); + int out_channel = out_tidx.z; int in_channel = texelFetch(t_idx, ivec3(out_channel, 0, 0), 0).x; - ivec4 in_idx = out_idx; - in_idx.z = in_channel; + ivec4 in_tidx = out_tidx; + in_tidx.z = in_channel; - ivec4 in_elem_pos = to_texture_elem_pos(in_idx, in_sizes, packed_dim); + ivec4 in_elem_pos = to_texture_elem_pos(in_tidx, in_sizes, packed_dim); VEC4_T in_texel = texelFetch(t_in, in_elem_pos.xyz, 0); diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 0ecfb83eac..73df7cfccc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -9,94 +9,235 @@ #ifndef INDEXING_UTILS_H #define INDEXING_UTILS_H -// Width Dim Index, assuming (W, H, C, N) order +/* + * The functions defined in this header file use the following shorthand to + * represent tensor related data structures. + * + * tidx - ivec4 tensor indices, listed in WHCN order. + * + * pos - ivec3 texel position, used to fetch from an image texture via the + * texelFetch(image, pos, lod) GLSL function. + * posi - ivec4 texel element position. It is the same as pos, except with an + * additional component of the index of an element within the texel. + * lpos - ivec3 logical position, listed in WHC order. This is a permutation of + * texture position based on a tensor's axis_map. lpos.x is the position + * component that corresponds to the tensor's width dimension, lpos.y is + * the position component that corresponds to the tensor's height dim, + * and so on. + * + * bufi - int index into a GPU buffer that backs a tensor. + * nchwi - int index into a staging buffer for a tensor. The data in the + * staging buffer is stored in contiguous data layout, irrespective of + * the tensor's strides. + */ + +// Width Dim Index, assuming WHCN order #define W_DIM 0 -// Height, assuming (W, H, C, N) order +// Height, assuming WHCN order #define H_DIM 1 -// Channels, assuming (W, H, C, N) order +// Channels, assuming WHCN order #define C_DIM 2 /* - * Describes which texture axis the "batches" dimension runs along in a 4D - * texture. - * - * Currently it is set to 2 since we represent batches by concatenating along - * the channels dim, which has index 2 in (W, H, C, N) order and maps to the - * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) - * order. 
+ * Fast division by 4 using bit shifting */ -#define BATCH_AXIS 2 - -// -// Basic Indexing Utility Macros and Functions -// +#define div4(x) (x >> 2) /* * Divides input and rounds up to 4 */ -#define divup4(x) ((x + 3) / 4) +#define divup4(x) ((x + 3) >> 2) /* * Aligns input to the next multiple of 4 */ #define alignup4(x) ((x + 3) & -4) -// -// (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion -// +/* + * Find the packed dimension of a tensor given its strides. The packed dimension + * is the "fastest moving" dimension which will have a stride of 1. + */ +int find_packed_dim(const ivec4 strides) { + int packed_dim = 0; + for (int i = 0; i <= 3; i++) { + if (strides[i] == 1) { + packed_dim = i; + break; + } + } + return packed_dim; +} /* - * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim - * is packed along a texel - * Output: A ivec4 containing the buffer indices corresponding to each texel - * element. + * Get the staging buffer indices that contain the data of the texel that + * corresponds to the provided tensor index. Since the texel have 4 elements, + * 4 buffer indices will be retrieved. */ -ivec4 get_texel_nchw_buffer_ixs(ivec4 idx, ivec4 sizes, int packed_dim) { +ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) { ivec4 strides = ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z); - int base_i = idx.x * strides.x + idx.y * strides.y + idx.z * strides.z + - idx.w * strides.w; + int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + + tidx.w * strides.w; return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; } -/* - * Input: Index into a tensor's data buffer, (W, H, C, N) sizes of a tensor - * Returns: The WCHN index of the tensor that corresponds to the specified - * buffer index, assuming the buffer has contiguous memory layout - */ -ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) { +ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) { return ivec4( - buf_i % sizes.x, - (buf_i / (sizes.x)) % sizes.y, - (buf_i / (sizes.x * sizes.y)) % sizes.z, - (buf_i / (sizes.x * sizes.y * sizes.z))); + nchwi % sizes.x, + (nchwi / (sizes.x)) % sizes.y, + (nchwi / (sizes.x * sizes.y)) % sizes.z, + (nchwi / (sizes.x * sizes.y * sizes.z))); } -/* - * Input: Texel buffer index, (W, H, C, N) strides of a tensor, which dim is - * packed along a texel - * Returns: The (x, y, z, n) texel position corresponding to the first element - * of the texel at the specified buffer index - */ -ivec4 to_texel_pos(int buf_i, ivec4 strides, int packed_dim) { +int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { + return tidx.w * sizes.x * sizes.y * sizes.z + tidx.z * sizes.x * sizes.y + + tidx.y * sizes.x + tidx.x; +} + +// TODO(ssjia): make this function use dim order so that it can work with any +// dim order. Currently it assumes that the dim order is contiguous, except for +// the packed dim. 
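+// Editor's note, a worked example (not part of this change): for a contiguous
+// tensor with sizes (W, H, C, N) = (4, 3, 2, 1) packed along W, the strides
+// are (1, 4, 12, 24); bufi = 17 then maps to tidx = (1, 1, 1, 0).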
+ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) { ivec4 idx; for (int i = 3; i >= 0; i--) { if (i != packed_dim) { - idx[i] = buf_i / strides[i]; - buf_i %= strides[i]; + idx[i] = bufi / strides[i]; + bufi %= strides[i]; } } - idx[packed_dim] = buf_i; + idx[packed_dim] = bufi; return idx; } -int to_texel_idx(const ivec4 texel_pos, ivec4 strides) { - return texel_pos.x * strides.x + texel_pos.y * strides.y + - texel_pos.z * strides.z + texel_pos.w * strides.w; +// Convenience overload of the above function, which will determine the packed +// dim from the strides automatically so it doesn't have to be passed in as a +// function argument. +ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) { + int packed_dim = find_packed_dim(strides); + return bufi_to_tidx(bufi, strides, packed_dim); +} + +int tidx_to_bufi(const ivec4 tidx, ivec4 strides) { + return tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + + tidx.w * strides.w; +} + +ivec4 lpos_to_tidx( + ivec3 lpos, + ivec4 sizes, + const int batch_inner_dim, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + // Moving 1 texel along the packed dim traverses 4 tensor elements + lpos[packed_dim] *= 4; + + ivec4 tidx = ivec4(lpos, 0); + + if (sizes.w > 1) { + tidx.w = tidx[batch_inner_dim] / sizes[batch_inner_dim]; + tidx[batch_inner_dim] %= sizes[batch_inner_dim]; + } + return tidx; +} + +ivec3 tidx_to_lpos( + ivec4 tidx, + ivec4 sizes, + const int batch_inner_dim, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 lpos = tidx.xyz; + + // Adjust batch inner dim by batch index if needed + if (sizes.w > 1) { + lpos[batch_inner_dim] += tidx.w * sizes[batch_inner_dim]; + } + // Fast division by 4, since moving 1 texel along the packed dim traverses 4 + // tensor elements. + lpos[packed_dim] >>= 2; + return lpos; } +ivec3 tidx_to_pos( + ivec4 tidx, + ivec4 sizes, + const ivec4 axis_map, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 pos; + for (int dim = 0; dim < 3; ++dim) { + pos[axis_map[dim]] = tidx[dim]; + } + + // Adjust batch inner dim by batch index if needed + if (sizes.w > 1) { + pos[axis_map[axis_map.w]] += tidx.w * sizes[axis_map.w]; + } + // Fast division by 4, since moving 1 texel along the packed dim traverses 4 + // tensor elements. 
+ pos[axis_map[packed_dim]] >>= 2; + return pos; +} + +ivec4 tidx_to_posi( + ivec4 tidx, + ivec4 sizes, + const ivec4 axis_map, + const int packed_dim) { + return ivec4( + tidx_to_pos(tidx, sizes, axis_map, packed_dim), tidx[packed_dim] % 4); +} + +ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { + ivec3 pos; + pos[axis_map.x] = lpos.x; + pos[axis_map.y] = lpos.y; + pos[axis_map.z] = lpos.z; + return pos; +} + +#ifdef USING_BUFFER +#define load_texel(buf, idx) buf[idx] +#elif defined(USING_TEXTURE2D) +#define load_texel(im, pos) texelFetch(im, pos.xy, 0) +#else // defined(USING_TEXTURE3D) +#define load_texel(im, pos) texelFetch(im, pos, 0) +#endif + +#ifdef USING_BUFFER +#define write_texel(buf, idx, texel) buf[idx] = texel +#elif defined(USING_TEXTURE2D) +#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel) +#else // defined(USING_TEXTURE3D) +#define write_texel(im, pos, texel) imageStore(im, pos, texel) +#endif + +/************************ + * Deprecated Functions * + ************************/ + +// The below functions and macros are in the process of being deprecated in +// favor of newer indexing functions that account for axis mapping and have more +// explicit function names and more updated terminology. + +/* + * Describes which texture axis the "batches" dimension runs along in a 4D + * texture. + * + * Currently it is set to 2 since we represent batches by concatenating along + * the channels dim, which has index 2 in (W, H, C, N) order and maps to the + * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) + * order. + */ +#define BATCH_AXIS 2 + // // (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion // @@ -181,26 +322,6 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } -// -// Texel Access and Storage -// - -#ifdef USING_BUFFER -#define load_texel(buf, idx) buf[idx] -#elif defined(USING_TEXTURE2D) -#define load_texel(im, pos) texelFetch(im, pos.xy, 0) -#else // defined(USING_TEXTURE3D) -#define load_texel(im, pos) texelFetch(im, pos, 0) -#endif - -#ifdef USING_BUFFER -#define write_texel(buf, idx, texel) buf[idx] = texel -#elif defined(USING_TEXTURE2D) -#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel) -#else // defined(USING_TEXTURE3D) -#define write_texel(im, pos, texel) imageStore(im, pos, texel) -#endif - // // Miscellaneous Utility Functions and Macros // diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl new file mode 100644 index 0000000000..f7133dd045 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
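[Reviewer note] The `lpos`/`pos` distinction described in the header comment comes down to a permutation through the tensor's axis map. A minimal host-side sketch, assuming only what `lpos_to_pos` shows, namely that `axis_map[d]` names the texture axis (0 = x, 1 = y, 2 = z) assigned to tensor dim `d`; the `IVec3`/`AxisMap` aliases exist only for this sketch.

```cpp
// Sketch of the axis-map permutation used by lpos_to_pos.
#include <array>
#include <cassert>
#include <cstdint>

using IVec3 = std::array<int32_t, 3>;
using AxisMap = std::array<int32_t, 3>;

IVec3 lpos_to_pos(const IVec3& lpos, const AxisMap& axis_map) {
  IVec3 pos{};
  for (int dim = 0; dim < 3; ++dim) {
    // axis_map[dim] is the physical texture axis for tensor dim `dim`.
    pos[axis_map[dim]] = lpos[dim];
  }
  return pos;
}

int main() {
  // Identity mapping: W->x, H->y, C->z leaves the position unchanged.
  assert((lpos_to_pos({7, 3, 1}, {0, 1, 2}) == IVec3{7, 3, 1}));
  // Swapping the first two entries (W->y, H->x) transposes x and y.
  assert((lpos_to_pos({7, 3, 1}, {1, 0, 2}) == IVec3{3, 7, 1}));
  return 0;
}
```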
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#include "indexing_utils.h" + +layout(std430) buffer; + +#extension GL_EXT_control_flow_attributes : require + +${layout_declare_buffer(B, "w", "nchw_out", "int")} +${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")} +${layout_declare_ubo(B, "ivec4", "tensor_sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} +${layout_declare_ubo(B, "int", "out_numel")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +void main() { + const int out_buf_idx = int(gl_GlobalInvocationID.x); + // On the CPU, the number of elements is determined based on a buffer of int8 + // elements. However, on the GPU, since the int8 data type is not supported + // each group of 4 elements is interepreted as 1 int32 element. Thus each + // thread is actually writing to 4 output elements from the perspective of the + // CPU. + if (out_buf_idx * 4 >= out_numel) { + return; + } + + ivec4 values; + int in_buf_idx = 4 * out_buf_idx; + + [[unroll]] for (int i = 0; i < 4; ++i) { + const ivec4 tidx = nchwi_to_tidx(in_buf_idx, tensor_sizes); + const ivec4 texture_pos = to_texture_elem_pos( + tidx, tensor_sizes, packed_dim); + values[i] = load_texel(t_in, texture_pos.xyz)[texture_pos.w]; + in_buf_idx++; + } + + // Manually pack 4x 8-bit integers into a 32 bit integer. Note that little + // endian is assumed, since most processors use little endian. Thus the + // "later" values are placed in most significant bytes. + int packed = ((values[3] & 0xFF) << 24) + | ((values[2] & 0xFF) << 16) + | ((values[1] & 0xFF) << 8) + | ((values[0] & 0xFF)); + + nchw_out[out_buf_idx] = packed; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.h b/backends/vulkan/runtime/graph/ops/glsl/matmul.h deleted file mode 100644 index 620f1fd0e6..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul.h +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
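[Reviewer note] The int8_image_to_nchw_noint8 shader above manually packs four 8-bit values into one 32-bit word because the int8 storage type cannot be used directly. A small host-side sketch of the same little-endian packing; the function and variable names here are illustrative, not from the codebase.

```cpp
// Later elements land in the more significant bytes, so a byte-wise view on
// a little-endian host reads back the original order.
#include <cassert>
#include <cstdint>
#include <cstring>

int32_t pack_4x_int8(const int8_t v[4]) {
  const uint32_t packed = (uint32_t(uint8_t(v[3])) << 24) |
      (uint32_t(uint8_t(v[2])) << 16) | (uint32_t(uint8_t(v[1])) << 8) |
      uint32_t(uint8_t(v[0]));
  return static_cast<int32_t>(packed);
}

int main() {
  const int8_t vals[4] = {1, -2, 3, -4};
  const int32_t packed = pack_4x_int8(vals);

  // On a little-endian host (the shader's stated assumption), copying the
  // word back out byte-by-byte recovers the original values in order.
  int8_t roundtrip[4];
  std::memcpy(roundtrip, &packed, sizeof(packed));
  for (int i = 0; i < 4; ++i) {
    assert(roundtrip[i] == vals[i]);
  }
  return 0;
}
```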
- */ - -// To convince the SPIR-V compiler to unroll the loops optimally, need this -// macro -#define FOUR 4 - -#ifdef TILE_ROW_2 -#define TILE_ROWS 2 -#else -#define TILE_ROWS 4 -#endif - -// we avoid mat4 and vec4 usage here as they compile to much less efficient -// SPIR-V -struct FloatMatrix_2d { - float data[TILE_ROWS][FOUR]; -}; - -struct FloatMatrix_3d { - float data[TILE_ROWS][FOUR][FOUR]; -}; - -#ifdef MAT2_IS_TRANSPOSED -vec4 matmul_naive_W_packed_W_packed( -#else -vec4 matmul_naive_W_packed_H_packed( -#endif - const sampler3D im_mat1, - const sampler3D im_mat2, - const ivec3 out_pos, - const int width) { - ivec3 mat1_pos = ivec3(0, out_pos.y, out_pos.z); -#ifdef MAT2_IS_TRANSPOSED - ivec3 mat2_pos = ivec3(0, out_pos.x * 4, 0); -#else - ivec3 mat2_pos = ivec3(out_pos.x * 4, 0, out_pos.z); -#endif - - vec4 texel = vec4(0); - const int K = (width + 3) / 4; - - for (int i = 0; i < K; ++i) { - const vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); -#ifdef MAT2_IS_TRANSPOSED - const vec4 sums = vec4( - dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 1, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 2, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 3, 0), 0))); -#else - const vec4 sums = vec4( - dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(1, 0, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(2, 0, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(3, 0, 0), 0))); -#endif - - texel += sums; - - mat1_pos.x++; -#ifdef MAT2_IS_TRANSPOSED - mat2_pos.x++; -#else - mat2_pos.y++; -#endif - } - - return texel; -} - -#ifdef MAT2_IS_TRANSPOSED -vec4 matmul_naive_W_packed_H_packed( -#else -vec4 matmul_naive_W_packed_W_packed( -#endif - const sampler3D im_mat1, - const sampler3D im_mat2, - const ivec3 out_pos, - const int width) { - ivec3 mat1_pos = ivec3(0, out_pos.y, out_pos.z); - ivec3 mat2_pos = ivec3(out_pos.x, 0, out_pos.z); - - vec4 texel = vec4(0); - int K = divup4(width); - - for (int i = 0; i < K; ++i) { - vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); - texel = fma(mat1_tex.xxxx, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.yyyy, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.zzzz, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.wwww, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - - mat1_pos.x++; - } - - return texel; -} - -// get texel from self tensor (width_packed) in addmm -vec4 get_texel_W_packed( - sampler3D im_self, - const ivec3 pos, - const bool broadcast_at_width, - const bool broadcast_at_height) { - vec4 self_texel; - // self is of shape {1} - if (broadcast_at_width && broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(0, 0, 0), 0).xxxx; - } - // self is of shape {*, 1} - else if (broadcast_at_width) { - self_texel = texelFetch(im_self, ivec3(0, pos.y, 0), 0).xxxx; - } - // self is of shape {1, *} - else if (broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0); - } else { - self_texel = texelFetch(im_self, ivec3(pos.x, pos.y, 0), 0); - } - - return self_texel; -} - -// get texel from self tensor (channel_packed) in addmm -vec4 get_texel_C_packed( - sampler3D im_self, - const ivec3 pos, - const bool broadcast_at_width, - const bool broadcast_at_height) { - vec4 self_texel; - // self is of shape {1} - if (broadcast_at_width && 
broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(0, 0, 0), 0); - } - // self is of shape {*, 1} - else if (broadcast_at_width) { - self_texel = texelFetch(im_self, ivec3(0, pos.y, 0), 0); - } - // self is of shape {1, *} - else if (broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0); - } else { - self_texel = texelFetch(im_self, ivec3(pos.x, pos.y, 0), 0); - } - - return self_texel; -} - -FloatMatrix_2d matmul_partial_2d( - sampler3D im_mat1, - sampler3D im_mat2, - const ivec3 pos, - const int batch_size, - const int K_texel_len) { - FloatMatrix_2d results; - for (int i = 0; i < TILE_ROWS; i++) { - for (int j = 0; j < FOUR; j++) { - results.data[i][j] = 0.0f; - } - } - vec4 im_mat1_partial_load[TILE_ROWS]; - vec4 im_mat2_partial_load[FOUR]; - - for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) { - for (int offset = 0; offset < TILE_ROWS; offset++) { - // read and cache 2x4 (or 4x4) tile of im_mat1 - const int mat1_y = (TILE_ROWS * pos.y) + offset; - const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, 0); - im_mat1_partial_load[offset] = texelFetch(im_mat1, mat1_pos, 0); - // read and cache 4x4 tile of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - const int mat2_y = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#else - const int mat2_x = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat2_x, mat1_x, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#endif - } - -#ifdef TILE_ROW_2 -// column 3 and 4 of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 2, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 3, 0), 0); -#else - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 2, mat1_x, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 3, mat1_x, 0), 0); -#endif -#endif - - // perform partial dot products and add partial result to results - for (int out_row = 0; out_row < TILE_ROWS; out_row++) { - for (int out_col = 0; out_col < FOUR; out_col++) { - results.data[out_row][out_col] += - dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]); - } - } - } - return results; -} - -FloatMatrix_3d matmul_partial_3d( - sampler3D im_mat1, - sampler3D im_mat2, - const ivec3 pos, - const int batch_size, - const int K_texel_len) { - FloatMatrix_3d results; - for (int i = 0; i < TILE_ROWS; i++) { - for (int j = 0; j < FOUR; j++) { - for (int k = 0; k < FOUR; k++) { - results.data[i][j][k] = 0.0f; - } - } - } - vec4 im_mat1_partial_load[TILE_ROWS]; - vec4 im_mat2_partial_load[FOUR]; - - for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { - if (FOUR * pos.z + batch_idx >= batch_size) { - break; - } - int mat_z = FOUR * pos.z + batch_idx; - for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) { - for (int offset = 0; offset < TILE_ROWS; offset++) { - // read and cache 2x4 (or 4x4) tile of im_mat1 - const int mat1_y = (TILE_ROWS * pos.y) + offset; - const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, mat_z); - im_mat1_partial_load[offset] = texelFetch(im_mat1, mat1_pos, 0); - // read and cache 4x4 tile of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - const int mat2_y = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#else - const int mat2_x = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat2_x, 
mat1_x, mat_z); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#endif - } - -#ifdef TILE_ROW_2 -// column 3, and 4 of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 2, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 3, 0), 0); -#else - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 2, mat1_x, mat_z), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 3, mat1_x, mat_z), 0); -#endif -#endif - - // perform partial dot products and add partial result to results - for (int out_row = 0; out_row < TILE_ROWS; out_row++) { - for (int out_col = 0; out_col < FOUR; out_col++) { - results.data[out_row][out_col][batch_idx] += - dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]); - } - } - } - } - return results; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl deleted file mode 100644 index 37a9b60f3c..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict InSizes { - ivec4 in_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - vec4 texel = vec4(0); - - $if MAT1_PACKING == "W_packed": - $if MAT2_PACKING == "H_packed": - texel = matmul_naive_W_packed_H_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $elif MAT2_PACKING == "W_packed": - texel = matmul_naive_W_packed_W_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $else: - $raise Exception("Unsupported value for MAT2_PACKING") - $else: - $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING") - - imageStore(im_out, pos, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml deleted file mode 100644 index 1c4db3f0ce..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
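[Reviewer note] The removed matmul.h helpers above compute the output in TILE_ROWS x 4 tiles: per step along K, a cached slice of mat1 texels is combined with four mat2 texels via dot products. A scalar C++ sketch of that accumulation pattern, with `Vec4` standing in for a vec4 texel; all names are illustrative.

```cpp
#include <array>
#include <cassert>

constexpr int kTileRows = 4; // TILE_ROWS in the shader
using Vec4 = std::array<float, 4>;

float dot4(const Vec4& a, const Vec4& b) {
  return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}

// results[r][c] accumulates output row r, column c of the current tile.
void accumulate_tile(
    std::array<Vec4, kTileRows>& results,
    const std::array<Vec4, kTileRows>& mat1_tile,
    const std::array<Vec4, 4>& mat2_tile) {
  for (int r = 0; r < kTileRows; ++r) {
    for (int c = 0; c < 4; ++c) {
      results[r][c] += dot4(mat1_tile[r], mat2_tile[c]);
    }
  }
}

int main() {
  std::array<Vec4, kTileRows> results{}; // zero-initialized accumulators
  std::array<Vec4, kTileRows> mat1_tile{};
  std::array<Vec4, 4> mat2_tile{};
  mat1_tile[0] = {1, 2, 3, 4};
  mat2_tile[0] = {1, 1, 1, 1};
  accumulate_tile(results, mat1_tile, mat2_tile);
  assert(results[0][0] == 10.0f); // dot({1,2,3,4}, {1,1,1,1})
  return 0;
}
```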
- -matmul_naive: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - MAT1_PACKING: W_packed - MAT2_PACKING: H_packed - MAT2_IS_TRANSPOSED: false - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_naive_W_packed_H_packed - - NAME: matmul_naive_W_packed_W_packed - MAT2_PACKING: W_packed - - NAME: matmul_transposed_naive_W_packed_W_packed - MAT2_PACKING: W_packed - MAT2_IS_TRANSPOSED: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl new file mode 100644 index 0000000000..e4064eed2f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(1, "r", "t_mat1", DTYPE, "buffer")} +${layout_declare_tensor(2, "r", "t_mat2", DTYPE, "buffer")} +${layout_declare_ubo(3, "ivec4", "out_sizes")} +${layout_declare_ubo(4, "ivec4", "out_strides")} +${layout_declare_ubo(5, "ivec4", "mat1_sizes")} +${layout_declare_ubo(6, "ivec4", "mat1_strides")} +${layout_declare_ubo(7, "ivec4", "mat2_sizes")} +${layout_declare_ubo(8, "ivec4", "mat2_strides")} +${layout_declare_ubo(9, "int", "out_numel")} + +#include "indexing_utils.h" + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec4 out_bufix = ivec4( + gl_GlobalInvocationID.x, + gl_GlobalInvocationID.y, + gl_GlobalInvocationID.z % out_sizes.z, + gl_GlobalInvocationID.z / out_sizes.z); + + if (any(greaterThanEqual(out_bufix, out_sizes))) { + return; + } + + int mat1_bufi = tidx_to_bufi( + ivec4(0, out_bufix.y, out_bufix.z, out_bufix.w), mat1_strides); + int mat2_bufi = tidx_to_bufi( + ivec4(out_bufix.x, 0, out_bufix.z, out_bufix.w), mat2_strides); + + T sum = T(0.0); + for (int i = 0; i < mat1_sizes.x; ++i) { + sum += t_mat1[mat1_bufi] * t_mat2[mat2_bufi]; + + mat1_bufi += mat1_strides.x; + mat2_bufi += mat2_strides.y; + } + + const int out_bufi = tidx_to_bufi(out_bufix, out_strides); + t_out[out_bufi] = T(sum); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml new file mode 100644 index 0000000000..54eb444f73 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
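[Reviewer note] matmul_naive_buffer.glsl above produces one output element per invocation by walking mat1 along its width stride and mat2 along its height stride. A host-side sketch of that inner loop for plain row-major matrices; the function and parameter names are illustrative.

```cpp
#include <cassert>
#include <vector>

float matmul_element(
    const std::vector<float>& mat1, // M x K, row-major
    const std::vector<float>& mat2, // K x N, row-major
    int K, int N, int m, int n) {
  int mat1_bufi = m * K; // start of row m of mat1 (width stride = 1)
  int mat2_bufi = n;     // start of column n of mat2 (height stride = N)
  float sum = 0.0f;
  for (int i = 0; i < K; ++i) {
    sum += mat1[mat1_bufi] * mat2[mat2_bufi];
    mat1_bufi += 1; // mat1_strides.x
    mat2_bufi += N; // mat2_strides.y
  }
  return sum;
}

int main() {
  // 2x2 example: [[1, 2], [3, 4]] * [[5, 6], [7, 8]] = [[19, 22], [43, 50]]
  const std::vector<float> a = {1, 2, 3, 4};
  const std::vector<float> b = {5, 6, 7, 8};
  assert(matmul_element(a, b, /*K=*/2, /*N=*/2, /*m=*/0, /*n=*/0) == 19.0f);
  assert(matmul_element(a, b, 2, 2, 1, 1) == 50.0f);
  return 0;
}
```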
+ +matmul_naive_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: matmul_naive_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl deleted file mode 100644 index 8634371a7b..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -$if BATCH_MODE: - #define BATCH_MODE - -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InLimits { - ivec3 in_limits; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - // results is in transposed order w.r.t. the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - results.data[idx_c][idx_r][0], - results.data[idx_c][idx_r][1], - results.data[idx_c][idx_r][2], - results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4(results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml deleted file mode 100644 index 9268d5a25a..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -matmul_optimized: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - PACKING: C_packed - MAT2_IS_TRANSPOSED: false - BATCH_MODE: false - TILE_ROW: tile_row_4 - generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_optimized - - NAME: matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - - NAME: batch_matmul_optimized - BATCH_MODE: true - - NAME: batch_matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - BATCH_MODE: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl new file mode 100644 index 0000000000..ea4e0d300c --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -0,0 +1,35 @@ +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +#include "indexing_utils.h" + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)} +${layout_declare_ubo(2, "ivec4", "out_sizes")} +${layout_declare_ubo(3, "ivec4", "out_strides")} +${layout_declare_ubo(4, "int", "numel")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// This constant is unused in this shader but is kept so that the signature is +// consistent with nchw_to_image. +layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; + +void main() { + int out_bufi = int(gl_GlobalInvocationID.x); + if (out_bufi >= numel) { + return; + } + + ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides); + const int in_nchwi = tidx_to_nchwi(out_tidx, out_sizes); + + t_out[out_bufi] = nchw_in[in_nchwi]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml new file mode 100644 index 0000000000..6292ef9333 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +nchw_to_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + - VALUE: int8 + shader_variants: + - NAME: nchw_to_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl new file mode 100644 index 0000000000..b86a59fc23 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -0,0 +1,63 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
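[Reviewer note] nchw_to_buffer.glsl above maps each element of the strided GPU buffer back to the contiguous staging buffer via bufi -> tidx -> nchwi. The sketch below mirrors that per-element logic on the host; the channels-fastest stride set is just one example layout, and none of these aliases exist in the codebase.

```cpp
#include <array>
#include <cassert>
#include <vector>

using IVec4 = std::array<int, 4>; // WHCN order

// Mirrors bufi_to_tidx: strides are contiguous except for the packed dim,
// which has stride 1.
IVec4 bufi_to_tidx(int bufi, const IVec4& strides, int packed_dim) {
  IVec4 tidx{};
  for (int i = 3; i >= 0; --i) {
    if (i != packed_dim) {
      tidx[i] = bufi / strides[i];
      bufi %= strides[i];
    }
  }
  tidx[packed_dim] = bufi;
  return tidx;
}

// Contiguous NCHW staging index (width fastest).
int tidx_to_nchwi(const IVec4& t, const IVec4& s) {
  return ((t[3] * s[2] + t[2]) * s[1] + t[1]) * s[0] + t[0];
}

int main() {
  const IVec4 sizes = {2, 2, 3, 1};    // W, H, C, N
  const IVec4 strides = {3, 6, 1, 12}; // channels-fastest example layout
  const int packed_dim = 2;            // C has stride 1
  const int numel = 2 * 2 * 3 * 1;

  std::vector<int> staging(numel), gpu(numel);
  for (int i = 0; i < numel; ++i) {
    staging[i] = i; // contiguous NCHW data
  }
  for (int out_bufi = 0; out_bufi < numel; ++out_bufi) {
    const IVec4 tidx = bufi_to_tidx(out_bufi, strides, packed_dim);
    gpu[out_bufi] = staging[tidx_to_nchwi(tidx, sizes)];
  }
  // Element (w=1, h=0, c=2, n=0): staging index 1 + 2*(0 + 2*2) = 9,
  // strided buffer index 1*3 + 0*6 + 2*1 = 5.
  assert(gpu[5] == 9);
  return 0;
}
```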
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} +#define SCALAR_T ${texel_load_component_type(DTYPE, STORAGE)} + +${define_active_storage_type(STORAGE)} + +#include "indexing_utils.h" + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_buffer(B, "r", "nchw_in", DTYPE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +VEC4_T read_texel(ivec4 tidx) { + const ivec4 buf_indices = tidx_to_nchwi( + tidx, + sizes, + packed_dim); + + VEC4_T texel = VEC4_T(0); + if (tidx[packed_dim] < sizes[packed_dim]) { + texel.x = SCALAR_T(nchw_in[buf_indices.x]); + } + if (tidx[packed_dim] + 1 < sizes[packed_dim]) { + texel.y = SCALAR_T(nchw_in[buf_indices.y]); + } + if (tidx[packed_dim] + 2 < sizes[packed_dim]) { + texel.z = SCALAR_T(nchw_in[buf_indices.z]); + } + if (tidx[packed_dim] + 3 < sizes[packed_dim]) { + texel.w = SCALAR_T(nchw_in[buf_indices.w]); + } + return texel; +} + +void main() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); + if (any(greaterThanEqual(tidx, sizes))) { + return; + } + + write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml new file mode 100644 index 0000000000..2bf85a7492 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +nchw_to_image: + parameter_names_with_default_values: + STORAGE: texture3d + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + - VALUE: int8 + STORAGE: + - VALUE: texture3d + - VALUE: texture2d + shader_variants: + - NAME: nchw_to_image diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl new file mode 100644 index 0000000000..f3a3370f3b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl @@ -0,0 +1,75 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
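[Reviewer note] read_texel in nchw_to_image.glsl zero-pads the trailing elements of a texel when the tensor's packed-dim extent is not a multiple of 4. A simplified host-side sketch of that boundary handling, reduced to a stride-1 slice; all names are illustrative.

```cpp
#include <array>
#include <cassert>
#include <vector>

// Gather one texel of 4 consecutive elements starting at `start` along a
// dimension of length `dim_size`; out-of-range lanes are left at zero.
std::array<float, 4> read_texel(
    const std::vector<float>& data, int start, int dim_size) {
  std::array<float, 4> texel = {0.f, 0.f, 0.f, 0.f};
  for (int i = 0; i < 4; ++i) {
    if (start + i < dim_size) {
      texel[i] = data[start + i];
    }
  }
  return texel;
}

int main() {
  const std::vector<float> row = {1, 2, 3, 4, 5, 6}; // dim of length 6
  // The second texel covers elements 4..7, but only 4 and 5 exist.
  const auto texel = read_texel(row, 4, 6);
  assert((texel == std::array<float, 4>{5, 6, 0, 0}));
  return 0;
}
```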
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#include "indexing_utils.h" + +layout(std430) buffer; + +#extension GL_EXT_control_flow_attributes : require + +${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")} +${layout_declare_buffer(B, "r", "nchw_in", "int")} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int packed_dim = C_DIM; + +/* + * Extends sign of int8 + */ +int extend_sign(int x) { + if (x >> 7 == 1) { + return x | 0xFFFFFF00; + } + return x; +} + +ivec4 read_texel(ivec4 tidx) { + const ivec4 buf_indices = tidx_to_nchwi( + tidx, sizes, packed_dim); + + int shift = (1 << 8) - 1; + ivec4 masks; + // Masks used to unpack 4x 8-bit values from a 32 bit integer. Note that + // little endian is assumed, as most processors use little endian. Thus the + // most significant bytes correspond to the "latter" packed values. + masks.x = shift << (8 * (buf_indices.x % 4)); + masks.y = shift << (8 * (buf_indices.y % 4)); + masks.z = shift << (8 * (buf_indices.z % 4)); + masks.w = shift << (8 * (buf_indices.w % 4)); + + ivec4 out_tex = ivec4(0); + + [[unroll]] for (int i = 0; i < 4; ++i) { + if (tidx[packed_dim] + i < sizes[packed_dim]) { + int in_texel = nchw_in[buf_indices[i] / 4]; + int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4)); + extracted_val = extend_sign(extracted_val); + out_tex[i] = extracted_val; + } + } + + return out_tex; +} + +void main() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); + + if (any(greaterThanEqual(tidx, sizes))) { + return; + } + + write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.glsl deleted file mode 100644 index c0bbc5183a..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.glsl +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
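[Reviewer note] Because the unpacking shader above receives int8 data as packed 32-bit words, each extracted byte occupies only the low 8 bits and must be sign-extended by hand to recover negative values. An equivalent host-side sketch (the shader extracts via shifted masks; the helper below shifts the word instead, which yields the same byte). Names are illustrative.

```cpp
#include <cassert>
#include <cstdint>

int32_t extend_sign(int32_t x) {
  // x holds an 8-bit pattern in its low byte (0..255). If that byte's sign
  // bit is set, fill the upper 24 bits to recover the negative int8 value.
  if ((x >> 7) == 1) {
    return static_cast<int32_t>(static_cast<uint32_t>(x) | 0xFFFFFF00u);
  }
  return x;
}

int32_t unpack_int8(uint32_t packed_word, int byte_index) {
  const int32_t extracted = (packed_word >> (8 * byte_index)) & 0xFF;
  return extend_sign(extracted);
}

int main() {
  // 0x85 is the byte pattern of int8_t(-123); 0x7F is +127.
  assert(unpack_int8(0x00007F85u, 0) == -123);
  assert(unpack_int8(0x00007F85u, 1) == 127);
  return 0;
}
```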
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define SCALAR_T ${texel_load_component_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#include "indexing_utils.h" - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(1, "r", "nchw_in", DTYPE)} -${layout_declare_ubo(2, "ivec4", "sizes")} -$if STORAGE == "buffer": - ${layout_declare_ubo(3, "ivec4", "gpu_strides")} - ${layout_declare_ubo(4, "int", "ntexels")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -VEC4_T read_texel(ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, - sizes, - packed_dim); - - VEC4_T texel = VEC4_T(0); - if (tensor_idx[packed_dim] < sizes[packed_dim]) { - texel.x = SCALAR_T(nchw_in[buf_indices.x]); - } - if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) { - texel.y = SCALAR_T(nchw_in[buf_indices.y]); - } - if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) { - texel.z = SCALAR_T(nchw_in[buf_indices.z]); - } - if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) { - texel.w = SCALAR_T(nchw_in[buf_indices.w]); - } - return texel; -} - -#ifdef USING_BUFFER - -void main() { - const int t_id = int(gl_GlobalInvocationID.x); - if (t_id >= ntexels) { - return; - } - - ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim); - tensor_idx[packed_dim] *= 4; - t_out[t_id] = read_texel(tensor_idx); -} - -#else // USING_TEXTURE - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); - if (any(greaterThanEqual(tensor_idx, sizes))) { - return; - } - - write_texel(t_out, pos, read_texel(tensor_idx)); -} - -#endif // USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.yaml deleted file mode 100644 index 96fe55dfb4..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -nchw_to_tensor: - parameter_names_with_default_values: - STORAGE: texture3d - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int - - VALUE: int8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d - - VALUE: buffer - shader_variants: - - NAME: nchw_to_tensor diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl new file mode 100644 index 0000000000..d07d45251f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl @@ -0,0 +1,146 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#include "indexing_utils.h" + +#define PRECISION ${PRECISION} + +#define FOUR 4 + +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} +#define FLOAT_T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type(STORAGE)} + +${define_required_extensions(DTYPE)} +${define_required_extensions("int8")} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)} +${layout_declare_tensor(2, "r", "t_mat2", "int8", STORAGE)} +${layout_declare_tensor(3, "r", "t_scales_and_zeros", DTYPE, STORAGE)} + +$if STORAGE == "texture3d": + ${layout_declare_ubo(4, "ivec4", "out_sizes")} + ${layout_declare_ubo(5, "ivec4", "mat1_sizes")} + ${layout_declare_ubo(6, "ivec4", "scales_strides")} +$else: + ${layout_declare_ubo(4, "ivec4", "out_sizes")} + ${layout_declare_ubo(5, "ivec4", "out_strides")} + ${layout_declare_ubo(6, "ivec4", "mat1_sizes")} + ${layout_declare_ubo(7, "ivec4", "mat1_strides")} + ${layout_declare_ubo(8, "ivec4", "mat2_strides")} + ${layout_declare_ubo(9, "ivec4", "scales_strides")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int group_size = 1; + +void main() { + + const ivec4 out_pos = ivec4( + gl_GlobalInvocationID.x, // n = 0..N-1 + gl_GlobalInvocationID.y, // m = 0..M-1 + gl_GlobalInvocationID.z % out_sizes.z, + gl_GlobalInvocationID.z / out_sizes.z); + + if (any(greaterThanEqual(out_pos, out_sizes))) { + return; + } + + const uint K = mat1_sizes.x; + const uint n = out_pos.x; + const uint m = out_pos.y; + const uint mask = uint(0x0f); + + float rc = 0.0; + int k = 0; + + #ifdef USING_BUFFER + const uint k_block = (K + group_size - 1) / group_size; + ivec4 mat1_pos = ivec4(0, m, out_pos.z, out_pos.w); + ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); + ivec4 scale_pos = ivec4(0, n, 0, out_pos.w); + ivec4 zero_pos = ivec4(0, n, 1, out_pos.w); + + for (int kb = 0; kb < k_block; kb++) { + scale_pos.x = kb; + const int scale_bufi = tidx_to_bufi(scale_pos, scales_strides); + const float scale = float(t_scales_and_zeros[scale_bufi]); + + zero_pos.x = kb; + const int zero_bufi = tidx_to_bufi(zero_pos, scales_strides); + const float zero = float(t_scales_and_zeros[zero_bufi]) - scale * 8.0; + + for(uint idx = 0; idx < group_size && k < K; idx++, k++) { + mat1_pos.x = k; + const int mat1_bufi = tidx_to_bufi(mat1_pos, mat1_strides); + const float mat1_val = float(t_mat1[mat1_bufi]); + + mat2_pos.x = k / 2; + const int mat2_bufi = tidx_to_bufi(mat2_pos, mat2_strides); + // Bitwise op treats sign bit from int8 as a value bit instead, + // since there is no uint8_t datatype + uint mat2_val = (t_mat2[mat2_bufi] & 0xFF); + mat2_val = (k & 1) == 0 ? 
mat2_val & mask : (mat2_val >> 4); + + rc += mat1_val * (scale * float(mat2_val) + zero); + } + } + + const int out_bufi = tidx_to_bufi(out_pos, out_strides); + t_out[out_bufi] = FLOAT_T(rc); + + #else // Using texture + const uint texel_group_size = group_size / FOUR; + const uint k_block = (K + texel_group_size - 1) / texel_group_size; + ivec3 mat1_pos = ivec3(0, m, out_pos.z); + ivec3 mat2_pos = ivec3(0, n, out_pos.z); + ivec3 scale_pos = ivec3(0, n, 0); + ivec3 zero_pos = ivec3(0, n, 1); + + for (int kb = 0; kb < k_block; kb++) { + const int texel_kb = kb / FOUR; + const int kb_offset = kb % FOUR; + + scale_pos.x = texel_kb; + const VEC4_T scale_texel = load_texel(t_scales_and_zeros, scale_pos); + const float scale = float(scale_texel[kb_offset]); + + zero_pos.x = texel_kb; + const VEC4_T zero_texel = load_texel(t_scales_and_zeros, zero_pos); + const float zero = float(zero_texel[kb_offset]) - scale * 8.0; + + for(uint idx = 0; idx < texel_group_size && k < K; idx++, k++) { + mat1_pos.x = k; + const VEC4_T mat1_tex = load_texel(t_mat1, mat1_pos); + + mat2_pos.x = k / 2; + const i8vec4 mat2_tex = i8vec4(load_texel(t_mat2, mat2_pos)); + + // Every two texels of mat1 correspond to one texel of mat2 + // Even mat1 indeces correspond to first half of mat2 texel and + // odd indeces correspond to second half + const int mat2_offset = (k & 1) == 0 ? 0 : 2; + for (int texel_idx = 0; texel_idx < FOUR; texel_idx++){ + // Bitwise op treats sign bit from int8 as a value bit instead, + // since there is no uint8_t datatype + uint mat2_val = (mat2_tex[mat2_offset + texel_idx / 2] & 0xFF); + mat2_val = (texel_idx & 1) == 0 ? mat2_val & mask : (mat2_val >> 4); + rc += mat1_tex[texel_idx] * (scale * float(mat2_val) + zero); + } + } + } + write_texel(t_out, out_pos.xyz, vec4(rc, 0, 0, 0)); + + #endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml new file mode 100644 index 0000000000..fd65068080 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
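[Reviewer note] In q_4w_linear.glsl above, each weight byte holds two 4-bit codes (low nibble for even k, high nibble for odd k), and the effective zero is the stored zero value minus 8 * scale, so a nibble q dequantizes to scale * (q - 8) + zero_val. A host-side sketch of that groupwise dequantization; the names are illustrative.

```cpp
#include <cassert>
#include <cstdint>

float dequant_4bit(uint8_t packed_byte, int k, float scale, float zero_val) {
  // Even k -> low nibble, odd k -> high nibble, as in the shader.
  const uint32_t q = (k & 1) == 0 ? (packed_byte & 0x0F) : (packed_byte >> 4);
  const float zero = zero_val - scale * 8.0f;
  return scale * static_cast<float>(q) + zero;
}

int main() {
  const uint8_t byte = 0x38; // low nibble 8, high nibble 3
  // q = 8 is the center code: it dequantizes exactly to the stored zero value.
  assert(dequant_4bit(byte, /*k=*/0, /*scale=*/0.5f, /*zero_val=*/1.0f) == 1.0f);
  // q = 3 -> 0.5 * (3 - 8) + 1.0 = -1.5
  assert(dequant_4bit(byte, /*k=*/1, 0.5f, 1.0f) == -1.5f);
  return 0;
}
```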
+ +q_4w_linear: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + STORAGE: + - VALUE: buffer + - VALUE: texture3d + shader_variants: + - NAME: q_4w_linear diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl index 139c82866f..a72df89b63 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl @@ -11,6 +11,7 @@ #define PRECISION ${PRECISION} #define VEC4_T ${texel_load_type(DTYPE, STORAGE)} +#define FLOAT_T ${buffer_scalar_type(DTYPE)} ${define_active_storage_type(STORAGE)} @@ -28,9 +29,9 @@ ${layout_declare_tensor(3, "r", "t_scales", DTYPE, STORAGE)} $if STORAGE == "buffer": ${layout_declare_ubo(4, "ivec4", "out_sizes")} - ${layout_declare_ubo(5, "int", "ntexels")} - ${layout_declare_ubo(6, "ivec4", "mat1_sizes")} - ${layout_declare_ubo(7, "ivec4", "out_strides")} + ${layout_declare_ubo(5, "ivec4", "out_strides")} + ${layout_declare_ubo(6, "int", "out_numel")} + ${layout_declare_ubo(7, "ivec4", "mat1_sizes")} ${layout_declare_ubo(8, "ivec4", "mat1_strides")} ${layout_declare_ubo(9, "ivec4", "qmat2_strides")} ${layout_declare_ubo(10, "ivec4", "scales_strides")} @@ -48,15 +49,14 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #ifdef USING_BUFFER void main() { - const int t_id = int(gl_GlobalInvocationID.x); - if (t_id >= ntexels) { + const int out_bufi = int(gl_GlobalInvocationID.x); + if (out_bufi >= out_numel) { return; } - const ivec4 out_pos = to_texel_pos(t_id, out_strides, 0); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0); - VEC4_T outtex = q_8w_linear(out_pos, mat1_sizes.x); - write_texel(t_out, t_id, outtex); + t_out[out_bufi] = q_8w_linear(out_tidx, mat1_sizes.x); } #else // USING_TEXTURE diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_linear.h b/backends/vulkan/runtime/graph/ops/glsl/q_linear.h index c1411376ad..f6de1e6dcf 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_linear.h +++ b/backends/vulkan/runtime/graph/ops/glsl/q_linear.h @@ -16,36 +16,33 @@ #ifdef USING_BUFFER -VEC4_T q_8w_linear(const ivec4 out_pos, const int K) { - const VEC4_T scales = load_texel(t_scales, out_pos.x); +#ifndef FLOAT_T +#define FLOAT_T float +#endif - VEC4_T outtex = VEC4_T(0); +FLOAT_T q_8w_linear(const ivec4 out_idx, const int K) { + const FLOAT_T scale = t_scales[out_idx.x]; - // Initial mat1 pos will be (0, out_pos.y, out_pos.z, 0) - int mat1_tid = out_pos.y * mat1_strides.y + out_pos.z * qmat2_strides.z; - // Initial qmat2 pos wil be (0, out_pos.x * 4, 0, 0) - int qmat2_tid = out_pos.x * 4 * qmat2_strides.y; + FLOAT_T outval = FLOAT_T(0.0); - // TODO(ssjia): optimize memory access pattern by traversing K in inner loop - for (int i = 0; i < K; i += 4) { - const VEC4_T mat1_tex = load_texel(t_mat1, mat1_tid); + // Initial mat1 tensor idx will be (0, out_idx.y, out_idx.z, 0) + int mat1_offset = out_idx.y * mat1_strides.y + out_idx.z * qmat2_strides.z; + // Initial qmat2 tensor idx wil be (0, out_idx.x, 0, 0); note that the qmat2 + // tensor is transposed + int qmat2_offset = out_idx.x * qmat2_strides.y; - const VEC4_T sums = VEC4_T( - dot(mat1_tex, load_texel(t_qmat2, qmat2_tid) * scales.x), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_tid + qmat2_strides.y) * scales.y), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_tid + qmat2_strides.y * 2) * scales.z), - dot(mat1_tex, - load_texel(t_qmat2, qmat2_tid + 
qmat2_strides.y * 3) * scales.w)); + // TODO(ssjia): optimize memory access pattern by traversing K in inner loop + for (int i = 0; i < K; i++) { + const FLOAT_T mat1_val = t_mat1[mat1_offset]; + const FLOAT_T mat2_val = t_qmat2[qmat2_offset] * scale; - outtex += sums; + outval += mat1_val * mat2_val; - mat1_tid++; - qmat2_tid++; + mat1_offset++; + qmat2_offset++; } - return outtex; + return outval; } #else // USING_TEXTURE diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl index d1562d6576..45e6c3358e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl @@ -43,11 +43,11 @@ void main() { // we calculate the source whcn-coordinate amended with offset-ed channel // value. Then we calculate the actual texture position from the // whcn-coordinate. - const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, out_sizes, packed_dim); + const ivec4 buf_indices = tidx_to_nchwi(idx, out_sizes, packed_dim); vec4 outex; for (int i=0;i<4;i++) { - ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], out_sizes); + ivec4 user_coor = nchwi_to_tidx(buf_indices[i], out_sizes); int in_channel = user_coor.z; diff --git a/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.glsl deleted file mode 100644 index 78d8346428..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.glsl +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#include "indexing_utils.h" - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(0, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_buffer(1, "w", "nchw_out", DTYPE)} -${layout_declare_ubo(2, "ivec4", "sizes")} -$if STORAGE == "buffer": - ${layout_declare_ubo(3, "ivec4", "gpu_strides")} - ${layout_declare_ubo(4, "int", "ntexels")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, - sizes, - packed_dim); - - if (tensor_idx[packed_dim] < sizes[packed_dim]) { - nchw_out[buf_indices.x] = BUF_T(texel.x); - } - if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) { - nchw_out[buf_indices.y] = BUF_T(texel.y); - } - if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) { - nchw_out[buf_indices.z] = BUF_T(texel.z); - } - if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) { - nchw_out[buf_indices.w] = BUF_T(texel.w); - } -} - -#ifdef USING_BUFFER - -void main() { - const int t_id = int(gl_GlobalInvocationID.x); - if (t_id >= ntexels) { - return; - } - - const VEC4_T intex = t_in[t_id]; - ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim); - tensor_idx[packed_dim] *= 4; - write_out_texel(intex, tensor_idx); -} - -#else // USING_TEXTURE - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(tensor_idx, 
sizes))) { - return; - } - - const VEC4_T intex = load_texel(t_in, pos); - write_out_texel(intex, tensor_idx); -} - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.yaml deleted file mode 100644 index 93a261e1ee..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -tensor_to_nchw: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int - - VALUE: int8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d - - VALUE: buffer - shader_variants: - - NAME: tensor_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl index 0cad62d38c..b645905939 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl @@ -11,6 +11,7 @@ #define PRECISION ${PRECISION} #define VEC4_T ${texel_load_type(DTYPE, STORAGE)} +#define T ${buffer_scalar_type(DTYPE)} #define op(X, A, B) ${OPERATOR} @@ -18,46 +19,33 @@ ${define_active_storage_type(STORAGE)} #include "indexing_utils.h" -$if DTYPE == "half" and STORAGE == "buffer": - #extension GL_EXT_shader_16bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_float16: require +${define_required_extensions(DTYPE)} layout(std430) buffer; ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} $if STORAGE == "buffer": - ${layout_declare_ubo(2, "int", "ntexels")} + ${layout_declare_ubo(2, "int", "numel")} $else: ${layout_declare_ubo(2, "ivec3", "out_limits")} ${layout_declare_ubo(3, "float", "minimum")} ${layout_declare_ubo(4, "float", "maximum")} - layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -float hardswish(float x){ - if(x <= -3) { - return 0; - } - else if(x >= 3) { - return x; - } - else { - return x * (x + 3)/6; - } -} +#include "activations.h" #ifdef USING_BUFFER void main() { const int i = int(gl_GlobalInvocationID.x); - if (i >= ntexels) { + if (i >= numel) { return; } - vec4 in_texel = vec4(t_in[i]); - t_out[i] = VEC4_T(op(in_texel, minimum, maximum)); + float in_val = float(t_in[i]); + t_out[i] = T(op(in_val, minimum, maximum)); } #else diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml index f39abc2134..2b9f0032f4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml @@ -35,6 +35,8 @@ unary_op: - NAME: tanh OPERATOR: tanh(clamp(X, -15.0, 15.0)) - NAME: hardshrink - OPERATOR: X * (vec4(greaterThan(X, vec4(A))) + vec4(lessThan(X, vec4(B)))) + OPERATOR: hardshrink(X, A, B) - NAME: hardswish - OPERATOR: vec4(hardswish(X.x),hardswish(X.y),hardswish(X.z),hardswish(X.w)) + OPERATOR: hardswish(X) + - NAME: hardsigmoid + OPERATOR: hardsigmoid(X) diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.glsl b/backends/vulkan/runtime/graph/ops/glsl/view.glsl index 0b0f587d1d..8d45e65b39 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/view.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/view.glsl @@ -39,13 +39,13 @@ void main() { // 
Assume there is a virtual continous buffer in nchw format. From the output // pos, we first calculate the index in the virual buffer, and then calculate // the input position from the indx. - const ivec4 buf_indices = get_texel_nchw_buffer_ixs(out_tensor_idx, out_sizes, out_packed_dim); + const ivec4 buf_indices = tidx_to_nchwi(out_tensor_idx, out_sizes, out_packed_dim); VEC4_T value = VEC4_T(0); // Need to look up the 4 values in the output texel separately. for (int i = 0 ; i < 4; i++) { if (out_tensor_idx[out_packed_dim]++ < out_sizes[out_packed_dim]) { - ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], in_sizes); + ivec4 user_coor = nchwi_to_tidx(buf_indices[i], in_sizes); ivec4 in_pos_elem = to_texture_elem_pos(user_coor, in_sizes, in_packed_dim); VEC4_T intex = texelFetch(t_in, in_pos_elem.xyz, 0); value[i] = intex[in_pos_elem.w]; diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp index 8e346bd208..eb0f1f99a2 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp @@ -88,7 +88,7 @@ void add_native_batch_norm_node( {{out_ref, vkapi::MemoryAccessType::WRITE}, {{in_ref, arg_weight, arg_bias, arg_mean, arg_var}, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(epsilon), graph.create_params_buffer(num_texel_per_batch)})); } diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 6bab8d1911..5896297144 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -85,8 +85,11 @@ void add_binary_op_node( {{arg1, arg2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers {t_out->sizes_ubo(), + t_out->axis_map_ubo(), t_in1->sizes_ubo(), + t_in1->axis_map_ubo(), t_in2->sizes_ubo(), + t_in2->axis_map_ubo(), graph.create_params_buffer(broadcast_params), graph.create_params_buffer(alpha_val)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp index 04acec5937..a06af37bf0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp @@ -40,10 +40,10 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); - dst_offset.data[0] += range.data[0]; + dst_offset[0] += range[0]; } } else if (dim_index == kHeight4D) { @@ -52,10 +52,10 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); - dst_offset.data[1] += range.data[1]; + dst_offset[1] += range[1]; } } else if (dim_index == kBatch4D) { utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); @@ -63,10 +63,10 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); - 
dst_offset.data[2] += range.data[2]; + dst_offset[2] += range[2]; } } else if (dim_index == kChannel4D) { int32_t src_offset = 0; diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp index cef751bc7c..946a0c9f40 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp @@ -32,7 +32,7 @@ void add_clone_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo()})); + {t_out->logical_limits_ubo()})); } void clone(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index cbee886ad2..6ce905a12f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -106,7 +106,7 @@ ValueRef prepack_biases( graph.create_local_wg_size(v), vref, v, - {t->sizes_ubo()}, + {t->sizes_ubo(), t->axis_map_ubo()}, // Specialization constants {SV(t->packed_dim_whcn_idx())})); @@ -242,10 +242,8 @@ Conv2dParams create_conv2d_params( const Kernel2dParams& p, const bool transposed) { const auto& overlay_region = utils::make_ivec2({ - p.kernel_size.data[0] + - (p.kernel_size.data[0] - 1) * (p.dilation.data[0] - 1), - p.kernel_size.data[1] + - (p.kernel_size.data[1] - 1) * (p.dilation.data[1] - 1), + p.kernel_size[0] + (p.kernel_size[0] - 1) * (p.dilation[0] - 1), + p.kernel_size[1] + (p.kernel_size[1] - 1) * (p.dilation[1] - 1), }); const auto weight_sizes = graph.sizes_of(weight); const int32_t in_group_size = utils::safe_downcast( @@ -255,15 +253,13 @@ Conv2dParams create_conv2d_params( void check_conv2d_params(const Kernel2dParams& p, const bool transposed) { if (transposed) { - if (p.dilation.data[0] > 1 || p.dilation.data[1] > 1) { + if (p.dilation[0] > 1 || p.dilation[1] > 1) { VK_THROW( "aten.convolution.default: transposed = true, dilation > 1 is not supported yet!"); } } - if ((p.padding.data[0] > 0 && p.kernel_size.data[0] > 1 && - p.dilation.data[0] > 1) || - (p.padding.data[1] > 0 && p.kernel_size.data[1] > 1 && - p.dilation.data[1] > 1)) { + if ((p.padding[0] > 0 && p.kernel_size[0] > 1 && p.dilation[0] > 1) || + (p.padding[1] > 0 && p.kernel_size[1] > 1 && p.dilation[1] > 1)) { VK_THROW( "aten.convolution.default: padding > 0 while dilation, kernel_size > 1 is not supported yet!"); } @@ -295,11 +291,11 @@ utils::uvec3 create_conv2d_global_wg_size( const Conv2dMethod method, const ValueRef out) { if (method == Conv2dMethod::Pointwise) { - const utils::uvec3 image_extents = graph.image_extents_of(out); + const utils::uvec3 image_extents = graph.logical_limits_of(out); return { - utils::div_up(image_extents.data[0u], 2u), - utils::div_up(image_extents.data[1u], 2u), - image_extents.data[2u]}; + utils::div_up(image_extents[0u], 2u), + utils::div_up(image_extents[1u], 2u), + image_extents[2u]}; } else { return graph.create_global_wg_size(out); } @@ -380,7 +376,7 @@ void add_conv2d_node( {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(extra_params), @@ -478,7 +474,7 @@ void add_conv1d_node( {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + 
t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(out_params), @@ -566,6 +562,7 @@ void conv(ComputeGraph& graph, const std::vector& args) { REGISTER_OPERATORS { VK_REGISTER_OP(aten.convolution.default, conv); VK_REGISTER_OP(conv_with_clamp.default, conv); + VK_REGISTER_OP(et_vk.conv_with_clamp.default, conv); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp index be0b457b79..f21dca1490 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp @@ -48,7 +48,12 @@ void add_embedding_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {{in, weight}, vkapi::MemoryAccessType::READ}}, - {t_out->sizes_ubo()})); + { + t_out->sizes_ubo(), + t_out->axis_map_ubo(), + t_in->axis_map_ubo(), + t_weight->axis_map_ubo(), + })); } void embedding(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp b/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp new file mode 100644 index 0000000000..17b6b351db --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +namespace vkcompute { + +struct GridPriorsParam final { + int32_t stride; + float offset; +}; + +void resize_grid_priors_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr in = graph->get_tensor(extra_args[0]); + std::vector in_sizes = in->sizes(); + int64_t height = in_sizes.at(in_sizes.size() - 2); + int64_t width = in_sizes.at(in_sizes.size() - 1); + std::vector sizes = {height * width, 2}; + out->virtual_resize(sizes); +} + +void add_grid_priors_node( + ComputeGraph& graph, + const ValueRef& in, + const ValueRef& stride_ref, + const ValueRef& offset_ref, + const ValueRef& out) { + vTensorPtr t_out = graph.get_tensor(out); + vTensorPtr t_in = graph.get_tensor(in); + int32_t stride = graph.extract_scalar(stride_ref); + float offset = graph.extract_scalar(offset_ref); + + std::string kernel_name = "grid_priors"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + GridPriorsParam param = {stride, offset}; + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + graph.create_global_wg_size(out), + graph.create_local_wg_size(out), + // Inputs and Outputs + { + {out, vkapi::MemoryAccessType::WRITE}, + }, + // Shader params buffers + { + t_in->sizes_ubo(), + t_out->sizes_ubo(), + graph.create_params_buffer(param), + }, + // Specialization Constants + {}, + resize_grid_priors_node, + {in})); +} + +void grid_priors(ComputeGraph& graph, const std::vector& args) { + return add_grid_priors_node(graph, args[0], args[1], args[2], args[3]); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(et_vk.grid_priors.default, grid_priors); +} +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp index 63b60bf52f..1c8b631346 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ 
b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp @@ -100,27 +100,36 @@ void add_addmm_naive_node( std::string kernel_name = graph.get_bool(mat2_is_transposed) ? "linear_naive" : "addmm_naive"; kernel_name.reserve(kShaderNameReserve); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); + utils::uvec3 global_wg_size = graph.logical_limits_of(out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), + global_wg_size, + graph.create_local_wg_size(global_wg_size), // Inputs and Outputs {{out, vkapi::MemoryAccessType::WRITE}, {{mat1, mat2, self}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), + graph.sizes_ubo(out), + graph.logical_limits_ubo(out), + graph.axis_map_ubo(out), graph.sizes_ubo(mat1), + graph.axis_map_ubo(mat1), + graph.sizes_ubo(mat2), + graph.axis_map_ubo(mat2), graph.sizes_ubo(self), + graph.axis_map_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {}, + {graph.packed_dim_whcn_idx_of(out), + graph.packed_dim_whcn_idx_of(mat1), + graph.packed_dim_whcn_idx_of(mat2), + graph.packed_dim_whcn_idx_of(self)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); @@ -173,11 +182,20 @@ void add_addmm_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); - utils::uvec3 global_size; + utils::uvec3 global_size = graph.logical_limits_of(out); + + // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the + // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4. The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `logical_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. 
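A small standalone sketch of the tile dispatch arithmetic used by the optimized addmm/matmul paths in this hunk: the output's logical limits are divided element-wise by the tile shape to get the global workgroup size. `div_up` mirrors `utils::div_up`; the extents and the helper name are illustrative assumptions, not the graph API.

#include <array>
#include <cstdint>

// Integer ceiling division, mirroring utils::div_up.
inline uint32_t div_up(uint32_t n, uint32_t d) {
  return (n + d - 1u) / d;
}

// The patch divides the (x, y, z) logical limits by {4, 2, 1} when mat1's
// height is small (< 8) and by {4, 4, 1} otherwise, so each thread covers a
// multi-element output tile instead of a single texel.
std::array<uint32_t, 3> tiled_global_wg_size(
    const std::array<uint32_t, 3>& logical_limits,
    bool small_m) {
  const std::array<uint32_t, 3> divisor =
      small_m ? std::array<uint32_t, 3>{4u, 2u, 1u}
              : std::array<uint32_t, 3>{4u, 4u, 1u};
  return {
      div_up(logical_limits[0], divisor[0]),
      div_up(logical_limits[1], divisor[1]),
      div_up(logical_limits[2], divisor[2])};
}

// Example: logical limits (64, 6, 3) with a small M dispatch a (16, 3, 3) grid.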
+ global_size = utils::divup_vec(global_size, {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(global_size, {4, 4, 1}); } utils::uvec3 local_size = adaptive_work_group_size(global_size); @@ -191,14 +209,18 @@ void add_addmm_optimized_node( {{mat1_W_packed, mat2_packed, self}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), + graph.axis_map_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_map_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_map_ubo(mat2_packed), graph.sizes_ubo(self), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_map_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {}, + {graph.packed_dim_whcn_idx_of(out)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index d1d3ad47d7..c182f220fb 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -62,7 +62,48 @@ void resize_matmul_node( out->virtual_resize(new_out_sizes); } -void add_matmul_naive_node( +void add_matmul_naive_buffer_node( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef out, + const ValueRef mat2_is_transposed) { + ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kHeightPacked); + + std::string kernel_name = "matmul_naive_buffer"; + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + utils::uvec3 global_size = { + graph.size_at(-1, out), + graph.size_at(-2, out), + graph.size_at(-3, out) * graph.size_at(-4, out)}; + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + graph.create_local_wg_size(global_size), + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{mat1, mat2}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + { + graph.sizes_ubo(out), + graph.strides_ubo(out), + graph.sizes_ubo(mat1), + graph.strides_ubo(mat1), + graph.sizes_ubo(mat2), + graph.strides_ubo(mat2), + graph.numel_ubo(out), + }, + // Specialization Constants + {}, + // Resizing Logic + resize_matmul_node, + {mat2_is_transposed})); +} + +void add_matmul_naive_texture3d_node( ComputeGraph& graph, const ValueRef mat1, const ValueRef mat2_data, @@ -74,25 +115,32 @@ void add_matmul_naive_node( ? 
"matmul_transposed_naive" : "matmul_naive"; kernel_name.reserve(kShaderNameReserve); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); + utils::uvec3 global_wg_size = graph.logical_limits_of(out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), + global_wg_size, + graph.create_local_wg_size(global_wg_size), // Inputs and Outputs {{out, vkapi::MemoryAccessType::WRITE}, {{mat1, mat2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), + graph.sizes_ubo(out), + graph.logical_limits_ubo(out), + graph.axis_map_ubo(out), graph.sizes_ubo(mat1), + graph.axis_map_ubo(mat1), + graph.sizes_ubo(mat2), + graph.axis_map_ubo(mat2), }, // Specialization Constants - {}, + {graph.packed_dim_whcn_idx_of(out), + graph.packed_dim_whcn_idx_of(mat1), + graph.packed_dim_whcn_idx_of(mat2)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); @@ -139,12 +187,21 @@ void add_matmul_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); - utils::uvec3 global_size; + // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the + // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4. The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] + utils::uvec3 global_size = graph.logical_limits_of(out); if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `logical_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. 
+ global_size = utils::divup_vec(global_size, {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(global_size, {4, 4, 1}); } + utils::uvec3 local_size = adaptive_work_group_size(global_size); graph.execute_nodes().emplace_back(new ExecuteNode( @@ -157,12 +214,15 @@ void add_matmul_optimized_node( {{mat1_W_packed, mat2_packed}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_map_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_map_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_map_ubo(mat2_packed), }, // Specialization Constants - {}, + {graph.packed_dim_whcn_idx_of(out)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); @@ -174,12 +234,16 @@ void add_matmul_node( const ValueRef mat2_data, const ValueRef out, const ValueRef mat2_is_transposed) { - if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { + if (graph.is_buffer_storage(out)) { + add_matmul_naive_buffer_node( + graph, mat1, mat2_data, out, mat2_is_transposed); + } else if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { add_matmul_optimized_node(graph, mat1, mat2_data, out, mat2_is_transposed); } else if (graph.memory_layout_of(mat1) == utils::kWidthPacked) { - add_matmul_naive_node(graph, mat1, mat2_data, out, mat2_is_transposed); + add_matmul_naive_texture3d_node( + graph, mat1, mat2_data, out, mat2_is_transposed); } else { - VK_THROW("Input should be channel packed or width packed."); + VK_THROW("Input texture should be channel packed or width packed."); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp index 2b15d92470..d1cbf52182 100644 --- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp @@ -91,7 +91,7 @@ void add_native_layer_norm_node( std::vector in_sizes = t_input->sizes(); - utils::uvec3 global_size = t_mean->image_extents(); + utils::uvec3 global_size = t_mean->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("native_layer_norm"); @@ -109,7 +109,7 @@ void add_native_layer_norm_node( vkapi::MemoryAccessType::WRITE}, {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), graph.create_params_buffer(epsilon)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index 7baf921bf0..c6ed72dceb 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -62,7 +62,7 @@ void add_permute_node( !seen[permute_dim], "Argument dim ", permute_dim, " is repeated"); seen[permute_dim] = true; - out_dims.data[(4u - out_ndim) + i] = permute_dim + (4 - out_ndim); + out_dims[(4u - out_ndim) + i] = permute_dim + (4 - out_ndim); } std::string kernel_name = "permute"; @@ -90,7 +90,7 @@ void add_permute_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), graph.create_params_buffer(params)}, // Specialization Constants diff --git 
a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp index 8b477d3a31..33d8b77334 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp @@ -79,7 +79,7 @@ void add_max_pool2d_node( check_pool2d_args(*t_in, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("max_pool2d"); @@ -103,7 +103,7 @@ void add_max_pool2d_node( {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), }, @@ -155,7 +155,7 @@ void add_avg_pool2d_node( check_pool2d_args(*t_in, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("avg_pool2d"); @@ -176,7 +176,7 @@ void add_avg_pool2d_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(divisor_params)}, diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp index cf887b6c1a..fa88db9a5d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp @@ -87,14 +87,14 @@ void add_q_8w_linear_node( if (graph.is_buffer_storage(out)) { ubos.append( {graph.sizes_ubo(out), - graph.ntexels_ubo(out), + graph.strides_ubo(out), + graph.numel_ubo(out), graph.sizes_ubo(mat1), - graph.texel_strides_ubo(out), - graph.texel_strides_ubo(mat1), - graph.texel_strides_ubo(q_mat2), - graph.texel_strides_ubo(scales)}); + graph.strides_ubo(mat1), + graph.strides_ubo(q_mat2), + graph.strides_ubo(scales)}); } else { - ubos.append({graph.texture_limits_ubo(out), graph.sizes_ubo(mat1)}); + ubos.append({graph.logical_limits_ubo(out), graph.sizes_ubo(mat1)}); } graph.execute_nodes().emplace_back(new ExecuteNode( diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp new file mode 100644 index 0000000000..d478b7c253 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
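A side note on the shape bookkeeping enforced by the argument checks in the new QuantizedMatMul file that begins here: the packed weight stores two 4-bit values per byte along K, and scales_and_zeros holds one scale and one zero per (output column, K-group) pair. The concrete sizes below are illustrative assumptions.

#include <cassert>
#include <cstdint>

int main() {
  // Illustrative int4 weight-only matmul shapes: mat1 is [M, K], the packed
  // mat2 is [N, K / 2] bytes, and group_size must evenly divide K.
  const int64_t K = 256, N = 32, group_size = 64;

  const int64_t mat2_width = K / 2;        // bytes per output row of mat2
  const int64_t k_groups = K / group_size; // quantization groups along K

  assert(K % group_size == 0);
  assert(mat2_width * 2 == K); // mirrors the mat1_K == mat2_K check

  // scales_and_zeros is expected to be [2, N, k_groups]: plane 0 holds the
  // scales, plane 1 the zero points.
  const int64_t expected_sz_sizes[3] = {2, N, k_groups};
  assert(expected_sz_sizes[2] == 4); // 256 / 64
  return 0;
}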
+ */ + +#include +#include +#include +#include + +namespace vkcompute { + +void check_q_matmul_args( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size_data, + const ValueRef scales_and_zeros, + const ValueRef out) { + const std::vector mat1_sizes = graph.sizes_of(mat1); + const std::vector mat2_sizes = graph.sizes_of(mat2_data); + const std::vector scales_and_zeros_sizes = + graph.sizes_of(scales_and_zeros); + + const uint32_t group_size = graph.extract_scalar(group_size_data); + + VK_CHECK_COND(mat1_sizes.size() == 2); + VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); + + VK_CHECK_COND(graph.memory_layout_of(mat1) == utils::kWidthPacked); + VK_CHECK_COND(graph.memory_layout_of(mat2_data) == utils::kWidthPacked); + VK_CHECK_COND( + graph.memory_layout_of(scales_and_zeros) == utils::kWidthPacked); + + if (graph.storage_type_of(out) == utils::kBuffer) { + VK_CHECK_COND(graph.memory_layout_of(out) == utils::kWidthPacked); + } else { + VK_CHECK_COND(graph.memory_layout_of(out) == utils::kChannelsPacked); + } + + const int mat1_K = utils::val_at(-1, mat1_sizes); + const int mat2_K = utils::val_at(-1, mat2_sizes) * 2; + const int N = utils::val_at(-2, mat2_sizes); + + VK_CHECK_COND(mat1_K == mat2_K); + + VK_CHECK_COND(mat2_K % group_size == 0); + + const uint32_t k_groups = mat2_K / group_size; + + VK_CHECK_COND(scales_and_zeros_sizes.size() == 3); + VK_CHECK_COND(utils::val_at(-1, scales_and_zeros_sizes) == k_groups); + VK_CHECK_COND(utils::val_at(-2, scales_and_zeros_sizes) == N); + VK_CHECK_COND(utils::val_at(-3, scales_and_zeros_sizes) == 2); + + // Match https://fburl.com/code/6ostkknm + std::vector valid_group_sizes = {32, 64, 128, 256}; + + bool is_valid_group_size = false; + for (auto valid_group_size : valid_group_sizes) { + if (group_size == valid_group_size) { + is_valid_group_size = true; + break; + } + } + + VK_CHECK_COND(is_valid_group_size); +} + +void resize_q_matmul_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); + vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); + + const int out_cols = utils::val_at(-2, mat1->sizes()); + const int out_rows = utils::val_at(-2, mat2->sizes()); + + std::vector new_out_sizes(3); + if (mat1->sizes().size() == 2) { + new_out_sizes.resize(2); + new_out_sizes.at(0) = out_cols; + new_out_sizes.at(1) = out_rows; + } else { + new_out_sizes.at(0) = mat1->sizes().at(0); + new_out_sizes.at(1) = out_cols; + new_out_sizes.at(2) = out_rows; + } + + out->virtual_resize(new_out_sizes); +} + +void add_q_matmul_node( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size, + const ValueRef scales_and_zeros_data, + const ValueRef out) { + auto storage_type = graph.storage_type_of(out); + + ValueRef mat2; + + if (storage_type == utils::kBuffer) { + mat2 = prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); + } else { + mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); + } + + ValueRef scales_and_zeros = + prepack_if_tensor_ref(graph, scales_and_zeros_data, utils::kWidthPacked); + + std::string kernel_name = "q_4w_linear"; + + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + add_storage_type_suffix(kernel_name, storage_type); + + const uint32_t group_size_val = graph.extract_scalar(group_size); + + vkapi::ParamsBindList ubos({}); + if 
(storage_type == utils::kBuffer) { + ubos.append(graph.sizes_ubo(out)); + ubos.append(graph.strides_ubo(out)); + ubos.append(graph.sizes_ubo(mat1)); + ubos.append(graph.strides_ubo(mat1)); + ubos.append(graph.strides_ubo(mat2)); + ubos.append(graph.strides_ubo(scales_and_zeros)); + } else { + ubos.append(graph.sizes_ubo(out)); + ubos.append(graph.sizes_ubo(mat1)); + ubos.append(graph.strides_ubo(scales_and_zeros)); + } + + auto out_sizes = graph.sizes_of(out); + uint32_t N = utils::val_at(-1, out_sizes); + uint32_t M = utils::val_at(-2, out_sizes); + + utils::uvec3 global_wg_size = {N, M, 1}; + + utils::uvec3 local_wg_size = adaptive_work_group_size(global_wg_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{mat1, mat2, scales_and_zeros}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + ubos, + // Specialization Constants + {SV(group_size_val)}, + // Resizing Logic + resize_q_matmul_node, + {})); +} + +void int4pack_mm(ComputeGraph& graph, const std::vector& args) { + check_q_matmul_args(graph, args[0], args[1], args[2], args[3], args[4]); + return add_q_matmul_node( + graph, + args[0], // mat1 + args[1], // mat2 + args[2], // group_size + args[3], // scales_and_zeros + args[4] // out + ); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten._weight_int4pack_mm.default, int4pack_mm); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 0eda7d8260..6a19e27ae8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -83,8 +83,7 @@ void add_repeat_channel_node( utils::ivec4 in_whcn_sizes{in_width, in_height, in_channel, in_batch}; // Channel packed global work ids - running_range.data[2] = - out_whcn_sizes.data[3] * utils::div_up_4(out_whcn_sizes.data[2]); + running_range[2] = out_whcn_sizes[3] * utils::div_up_4(out_whcn_sizes[2]); utils::uvec3 global_size = utils::make_uvec3(running_range); utils::uvec3 local_size = adaptive_work_group_size(global_size); @@ -131,7 +130,7 @@ void add_repeat_node( // After expanding a dimension, we will update the "running_range" since we // will need to copy the "expanded" area. 
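A rough sketch of the running_range bookkeeping described in the comment above, using plain arrays as stand-ins for the graph types; the real op handles channel repeats separately, so this only mirrors the width, height, and batch stages.

#include <cstdio>

int main() {
  // Copy range of the original tensor in (x, y, z) texel extents.
  int running_range[3] = {8, 4, 2};
  const int repeats[3] = {3, 2, 2}; // width, height, batch repeat counts

  for (int axis = 0; axis < 3; ++axis) {
    // Each additional repeat copies the currently expanded area once, offset
    // by i * running_range[axis] along that axis; the range then grows so the
    // next axis copies the whole expanded block.
    for (int i = 1; i < repeats[axis]; ++i) {
      int dst_offset[3] = {0, 0, 0};
      dst_offset[axis] = i * running_range[axis];
      std::printf(
          "copy range (%d,%d,%d) to offset (%d,%d,%d)\n",
          running_range[0], running_range[1], running_range[2],
          dst_offset[0], dst_offset[1], dst_offset[2]);
    }
    running_range[axis] *= repeats[axis];
  }
  // Final running_range is (24, 8, 4), i.e. the full repeated output.
  return 0;
}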
- utils::ivec3 running_range = t_in->texture_limits(); + utils::ivec3 running_range = t_in->logical_limits(); const std::vector& in_sizes = t_in->sizes(); @@ -165,7 +164,7 @@ void add_repeat_node( graph, out, running_range, src_offset, dst_offset, out); } - running_range.data[0] = running_range.data[0] * width_repeat; + running_range[0] = running_range[0] * width_repeat; } // Height @@ -179,7 +178,7 @@ void add_repeat_node( graph, out, running_range, src_offset, dst_offset, out); } - running_range.data[1] = running_range.data[1] * height_repeat; + running_range[1] = running_range[1] * height_repeat; } // Batch @@ -187,13 +186,13 @@ void add_repeat_node( utils::ivec3 src_offset{0, 0, 0}; for (int i = 1; i < batch_repeat; ++i) { - utils::ivec3 dst_offset = {0, 0, i * running_range.data[2]}; + utils::ivec3 dst_offset = {0, 0, i * running_range[2]}; add_copy_offset_node( graph, out, running_range, src_offset, dst_offset, out); } - running_range.data[2] = running_range.data[2] * batch_repeat; + running_range[2] = running_range[2] * batch_repeat; } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp index 351db0d192..1d0be47e38 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Select.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Select.cpp @@ -112,7 +112,7 @@ void add_select_int_node( {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, // Parameter buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), // TODO: num_batches and num_texel_per_batch are provided by // t_out->sizes. Can change the following to reduce params diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp index 8b323bafed..6aed81a591 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -123,7 +123,7 @@ void add_slice_tensor_out_node( kernel_name.reserve(kShaderNameReserve); add_dtype_suffix(kernel_name, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); const struct Block final { diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp index fa4d3df944..dd2fb43e65 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp @@ -64,7 +64,7 @@ void add_softmax_node( {{out, vkapi::MemoryAccessType::WRITE}, {in_arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(utils::make_ivec2({in_dim, softmax_dim}))}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index 9e3ae2e6a7..49abd63d75 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -50,10 +50,10 @@ void add_split_with_sizes_default_node( // Doesn't need to use split_size since we have already verified that the // output tensor's size matches with the split_size. 
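For the split path that follows (the mirror image of the cat op earlier in this patch), a minimal sketch of the offset walk: each output consumes a block of the input and the source offset advances by that output's extent along the split axis. The widths here are made-up values.

#include <cstdio>
#include <vector>

int main() {
  // Logical x-extents of the split outputs when splitting along width.
  const std::vector<int> out_widths = {3, 5, 2};

  int src_offset_x = 0;
  for (const int width : out_widths) {
    // Copy a (width x H x Z) block starting at src_offset_x into this output.
    std::printf("copy width %d from source x-offset %d\n", width, src_offset_x);
    src_offset_x += width; // cat accumulates dst_offset instead
  }
  return 0;
}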
vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); - src_offset.data[0] += range.data[0]; + src_offset[0] += range[0]; } } else if (dim_index == kHeight4D) { utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); @@ -61,10 +61,10 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); - src_offset.data[1] += range.data[1]; + src_offset[1] += range[1]; } } else if (dim_index == kBatch4D) { utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); @@ -72,10 +72,10 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); - src_offset.data[2] += range.data[2]; + src_offset[2] += range[2]; } } else if (dim_index == kChannel4D) { int32_t src_offset = 0; diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 2e5e9addfb..047e0d0f1f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -21,15 +21,17 @@ void add_staging_to_tensor_node( const ValueRef out_tensor) { VK_CHECK_COND(graph.val_is_staging(in_staging)); - vkapi::ShaderInfo shader = - get_nchw_to_tensor_shader(*graph.get_tensor(out_tensor)); + vkapi::ShaderInfo shader = get_nchw_to_tensor_shader( + *graph.get_tensor(out_tensor), graph.int8_buffers_enabled()); - vkapi::ParamsBindList ubos({graph.sizes_ubo(out_tensor)}); + vkapi::ParamsBindList ubos; if (graph.is_buffer_storage(out_tensor)) { - ubos.append({ - graph.texel_strides_ubo(out_tensor), - graph.ntexels_ubo(out_tensor), - }); + ubos.append( + {graph.sizes_ubo(out_tensor), + graph.strides_ubo(out_tensor), + graph.numel_ubo(out_tensor)}); + } else { + ubos.append({graph.sizes_ubo(out_tensor), graph.axis_map_ubo(out_tensor)}); } graph.execute_nodes().emplace_back(new ExecuteNode( @@ -55,25 +57,43 @@ void add_tensor_to_staging_node( const ValueRef out_staging) { VK_CHECK_COND(graph.val_is_staging(out_staging)); - vkapi::ShaderInfo shader = - get_tensor_to_nchw_shader(*graph.get_tensor(in_tensor)); + vkapi::ShaderInfo shader = get_tensor_to_nchw_shader( + *graph.get_tensor(in_tensor), graph.int8_buffers_enabled()); + + utils::uvec3 global_wg_size = graph.create_global_wg_size(in_tensor); - vkapi::ParamsBindList ubos({graph.sizes_ubo(in_tensor)}); + vkapi::ParamsBindList ubos; if (graph.is_buffer_storage(in_tensor)) { - ubos.append({ - graph.texel_strides_ubo(in_tensor), - graph.ntexels_ubo(in_tensor), - }); + ubos.append( + {graph.sizes_ubo(in_tensor), + graph.strides_ubo(in_tensor), + graph.numel_ubo(in_tensor)}); + } else { + ubos.append({graph.sizes_ubo(in_tensor), graph.axis_map_ubo(in_tensor)}); + } + + // Normally, the image_to_nchw shader is structured so that each thread reads + // one texel from the input texture and writes each component of the texel + // into the corresponding location in the output buffer. 
However, this shader + // is structured slightly differently in that each thread writes out a + // complete 32 bit integer (containing 4 packed 8-bit integers) into the + // output buffer. Therefore, the global work group size for this shader will + // be the number of elements in the output buffer divided by 4, as opposed to + // the extents of the input texture. + if (shader.kernel_name == "int8_image_to_nchw_noint8") { + uint32_t buffer_len = graph.get_staging(out_staging)->numel() / 4; + global_wg_size = {buffer_len, 1, 1}; + ubos.append({graph.numel_ubo(in_tensor)}); } graph.execute_nodes().emplace_back(new ExecuteNode( graph, shader, - graph.create_global_wg_size(in_tensor), - graph.create_local_wg_size(in_tensor), + global_wg_size, + graph.create_local_wg_size(global_wg_size), // Input and Outputs - {{in_tensor, vkapi::MemoryAccessType::READ}, - {out_staging, vkapi::MemoryAccessType::WRITE}}, + {{out_staging, vkapi::MemoryAccessType::WRITE}, + {in_tensor, vkapi::MemoryAccessType::READ}}, // Parameter Buffers ubos, // Specialization Constants @@ -86,14 +106,14 @@ ValueRef prepack( const utils::GPUMemoryLayout layout) { ValueRef v = graph.add_tensor_like(vref, layout); - vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(*graph.get_tensor(v)); + vkapi::ShaderInfo shader = get_nchw_to_tensor_shader( + *graph.get_tensor(v), graph.int8_buffers_enabled()); - vkapi::ParamsBindList ubos({graph.sizes_ubo(v)}); + vkapi::ParamsBindList ubos; if (graph.is_buffer_storage(v)) { - ubos.append({ - graph.texel_strides_ubo(v), - graph.ntexels_ubo(v), - }); + ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)}); + } else { + ubos.append({graph.sizes_ubo(v), graph.axis_map_ubo(v)}); } graph.prepack_nodes().emplace_back(new PrepackNode( @@ -112,6 +132,33 @@ ValueRef prepack( return v; } +ValueRef prepack_buffer( + ComputeGraph& graph, + const ValueRef vref, + const utils::GPUMemoryLayout layout) { + ValueRef v = graph.add_tensor_like(vref, utils::kBuffer, layout); + + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR("buffer_to_buffer"); + + vkapi::ParamsBindList ubos; + ubos.append({graph.numel_ubo(v)}); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + shader, + graph.create_global_wg_size(v), + graph.create_local_wg_size(v), + // Input and Outputs + vref, + v, + // Parameter Buffers + ubos, + // Specialization Constants + {})); + + return v; +} + ValueRef prepack_if_tensor_ref( ComputeGraph& graph, const ValueRef v, @@ -123,6 +170,17 @@ ValueRef prepack_if_tensor_ref( } } +ValueRef prepack_buffer_if_tensor_ref( + ComputeGraph& graph, + const ValueRef v, + const utils::GPUMemoryLayout layout) { + if (graph.val_is_tref(v)) { + return prepack_buffer(graph, v, layout); + } else { + return v; + } +} + ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { if (graph.val_is_tref(v)) { utils::GPUMemoryLayout layout = @@ -133,4 +191,14 @@ ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { } } +ValueRef prepack_buffer_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { + if (graph.val_is_tref(v)) { + utils::GPUMemoryLayout layout = + graph.suggested_memory_layout(graph.get_tref(v)->sizes); + return prepack_buffer(graph, v, layout); + } else { + return v; + } +} + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h index fc875de80d..88a9630239 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ 
b/backends/vulkan/runtime/graph/ops/impl/Staging.h @@ -29,6 +29,13 @@ ValueRef prepack_if_tensor_ref( const ValueRef v, const utils::GPUMemoryLayout layout); +ValueRef prepack_buffer_if_tensor_ref( + ComputeGraph& graph, + const ValueRef v, + const utils::GPUMemoryLayout layout); + ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v); +ValueRef prepack_buffer_if_tensor_ref(ComputeGraph& graph, const ValueRef v); + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp index b466f404ad..b65845c223 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp @@ -85,7 +85,7 @@ void add_sum_dim_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(dim + 4 - in_dim), graph.create_params_buffer(dim_size), graph.create_params_buffer(int(ceil(channel / 4.0)))}, diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp index 4342be7229..ea27183ead 100644 --- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp @@ -44,9 +44,9 @@ void add_unary_op_node( vkapi::ParamsBindList ubos({}); if (graph.is_buffer_storage(out)) { - ubos.append({graph.ntexels_ubo(out)}); + ubos.append({graph.numel_ubo(out)}); } else { - ubos.append({graph.texture_limits_ubo(out)}); + ubos.append({graph.logical_limits_ubo(out)}); } ubos.append( {graph.create_params_buffer(min), graph.create_params_buffer(max)}); @@ -114,12 +114,6 @@ float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) { "hardshrink"); \ } -#define DEFINE_HARDSWISH_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, args[0], kDummyFloat, kDummyFloat, args[1], #op_name); \ - } - void gelu(ComputeGraph& graph, const std::vector& args) { // args[1] is the `approximate` string // https://fburl.com/code/9omngmyo @@ -140,7 +134,8 @@ DEFINE_CLAMP_FN(clamp); DEFINE_CLAMP_FN(hardtanh); DEFINE_RELU_FN(relu); DEFINE_HARDSHRINK_FN(hardshrink); -DEFINE_HARDSWISH_FN(hardswish); +DEFINE_ACTIVATION_FN(hardswish); +DEFINE_ACTIVATION_FN(hardsigmoid); REGISTER_OPERATORS { VK_REGISTER_OP(aten.abs.default, abs); @@ -157,6 +152,7 @@ REGISTER_OPERATORS { VK_REGISTER_OP(aten.tanh.default, tanh); VK_REGISTER_OP(aten.hardshrink.default, hardshrink); VK_REGISTER_OP(aten.hardswish.default, hardswish); + VK_REGISTER_OP(aten.hardsigmoid.default, hardsigmoid); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp index 29baff4bde..f7fe5282e0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp @@ -66,11 +66,11 @@ void add_upsample_nearest2d_node( ValueRef arg_in = prepack_if_tensor_ref(graph, in); vTensorPtr t_in = graph.get_tensor(in); - utils::uvec3 input_sizes = t_in->image_extents(); + utils::uvec3 input_sizes = t_in->logical_limits(); utils::ivec2 input_size = { - utils::safe_downcast(input_sizes.data[0]), - utils::safe_downcast(input_sizes.data[1])}; + utils::safe_downcast(input_sizes[0]), + utils::safe_downcast(input_sizes[1])}; utils::vec2 rev_scales = { utils::safe_downcast(1.0), utils::safe_downcast(1.0)}; @@ -79,9 +79,9 @@ void 
add_upsample_nearest2d_node( auto output_size_ref = graph.get_int_list(output_sizes); rev_scales = { utils::safe_downcast( - (float)input_size.data[0] / output_size_ref->at(1)), + (float)input_size[0] / output_size_ref->at(1)), utils::safe_downcast( - (float)input_size.data[1] / output_size_ref->at(0))}; + (float)input_size[1] / output_size_ref->at(0))}; } else { auto scales = graph.get_double_list(scale_factors); @@ -105,7 +105,7 @@ void add_upsample_nearest2d_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg_in, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(input_size), graph.create_params_buffer(rev_scales)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h index 45dfceb3f0..4bd8e9b900 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h @@ -32,7 +32,8 @@ constexpr DimIndex kChannel4D = DimIndex::DIM_3RD_LAST; constexpr DimIndex kBatch4D = DimIndex::DIM_4TH_LAST; inline DimIndex normalize_to_dim_index(const api::vTensor& v_in, int32_t dim) { - return static_cast(dim - v_in.dim()); + return dim < 0 ? static_cast(dim) + : static_cast(dim - v_in.dim()); } /* diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp index c5cef52f7a..2fb0f60b24 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp @@ -85,18 +85,18 @@ std::vector calc_out_sizes_hw( // Height out_sizes.at(0) = calc_out_size( in_sizes.at(ndim - 2), - kernel_size.data[1], - stride.data[1], - padding.data[1], - dilation.data[1], + kernel_size[1], + stride[1], + padding[1], + dilation[1], ceil_mode); // Width out_sizes.at(1) = calc_out_size( in_sizes.at(ndim - 1), - kernel_size.data[0], - stride.data[0], - padding.data[0], - dilation.data[0], + kernel_size[0], + stride[0], + padding[0], + dilation[0], ceil_mode); return out_sizes; @@ -128,19 +128,19 @@ std::vector calc_transpose_out_sizes_hw( // Height out_sizes.at(0) = calc_transpose_out_size( in_sizes.at(ndim - 2), - kernel_size.data[1], - stride.data[1], - padding.data[1], - dilation.data[1], - output_padding.data[1]); + kernel_size[1], + stride[1], + padding[1], + dilation[1], + output_padding[1]); // Width out_sizes.at(1) = calc_transpose_out_size( in_sizes.at(ndim - 1), - kernel_size.data[0], - stride.data[0], - padding.data[0], - dilation.data[0], - output_padding.data[0]); + kernel_size[0], + stride[0], + padding[0], + dilation[0], + output_padding[0]); return out_sizes; } diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp new file mode 100644 index 0000000000..4cf678a9dc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
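A quick illustration of the normalize_to_dim_index change a little further up (DimUtils.h), under the convention that DimIndex values are negative offsets from the innermost dimension; the free function below is a simplified stand-in, since the real helper takes a vTensor.

#include <cassert>

// Negative dims are already in DimIndex form; non-negative dims are converted
// by subtracting the tensor rank, as in the updated normalize_to_dim_index.
int normalize_to_dim_index(int dim, int ndim) {
  return dim < 0 ? dim : dim - ndim;
}

int main() {
  // For a 4-D tensor, dim 3 (innermost) and dim -1 both map to -1 (DIM_LAST).
  assert(normalize_to_dim_index(3, 4) == -1);
  assert(normalize_to_dim_index(-1, 4) == -1);
  // dim 0 (batch) maps to -4 (DIM_4TH_LAST), i.e. kBatch4D.
  assert(normalize_to_dim_index(0, 4) == -4);
  return 0;
}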
+ */ + +#include + +namespace vkcompute { + +void pack4(const uint8_t* w_ptr, uint8_t* b_ptr, uint32_t N, uint32_t K) { + for (int32_t n = 0; n < N; n++) { + for (int32_t k2 = 0; k2 < K / 2; k2++) { + uint8_t src_val0 = w_ptr[n * K + k2 * 2]; + uint8_t src_val1 = w_ptr[n * K + k2 * 2 + 1]; + b_ptr[n * (K / 2) + k2] = (uint8_t(src_val1) << 4) | uint8_t(src_val0); + } + } +} + +std::vector int4mm_pack_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr) { + const int32_t N = utils::val_at(-1, W_sizes); + const int32_t K = utils::val_at(-2, W_sizes); + + const auto numel = K * N; + std::vector w_ptr_T(numel); + std::vector b_ptr(utils::div_up(numel, 2)); + + // Transpose the weights + for (int32_t k = 0; k < K; k++) { + for (int32_t n = 0; n < N; n++) { + w_ptr_T[n * K + k] = w_ptr[k * N + n]; + } + } + + // Pack two int4s into each int8 + pack4(w_ptr_T.data(), b_ptr.data(), N, K); + + return b_ptr; +} + +std::vector int4mm_dequantize_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr, + const uint32_t group_size, + const float* scales_and_zeros) { + const int64_t N = utils::val_at(-1, W_sizes); + const int64_t K = utils::val_at(-2, W_sizes); + + std::vector w_ptr_deq(K * N); + const int k_groups = K / group_size; + const int zeros_stride = k_groups * N; + + for (int k = 0; k < K; k++) { + for (int n = 0; n < N; n++) { + const int kb = k / group_size; + const int scale_idx = k_groups * n + kb; + const float scale = scales_and_zeros[scale_idx]; + const float zero = + scales_and_zeros[scale_idx + zeros_stride] - scale * 8.0; + w_ptr_deq[k * N + n] = w_ptr[k * N + n] * scale + zero; + } + } + + return w_ptr_deq; +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h new file mode 100644 index 0000000000..4c4cf26d50 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
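A minimal check of the packing and dequantization conventions used by pack4 and int4mm_dequantize_weights above: two 4-bit values share a byte with the second value in the high nibble, and the stored zero entry is shifted by scale * 8 before use. The numeric values are illustrative only.

#include <cassert>
#include <cstdint>

int main() {
  // Two unsigned 4-bit weights packed into one byte, as in pack4: the second
  // value occupies the high nibble, the first the low nibble.
  const uint8_t w0 = 0x3, w1 = 0xA;
  const uint8_t packed = static_cast<uint8_t>((w1 << 4) | w0);
  assert(packed == 0xA3);
  assert((packed & 0x0F) == w0); // unpack low nibble
  assert((packed >> 4) == w1);   // unpack high nibble

  // Dequantization per group, as in int4mm_dequantize_weights:
  //   zero = stored_zero - scale * 8, deq = q * scale + zero,
  // which is equivalent to (q - 8) * scale + stored_zero.
  const float scale = 0.5f;
  const float stored_zero = 4.25f;
  const float zero = stored_zero - scale * 8.0f; // 0.25
  const float deq = w0 * scale + zero;           // 3 * 0.5 + 0.25 = 1.75
  assert(deq == 1.75f);
  return 0;
}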
+ */ + +#pragma once + +#include + +namespace vkcompute { + +std::vector int4mm_pack_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr); + +std::vector int4mm_dequantize_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr, + const uint32_t group_size, + const float* scales_and_zeros); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp index 8d86c8287f..2737a86a1a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp @@ -101,15 +101,15 @@ utils::ivec2 create_broadcast_params( utils::uvec3 adaptive_work_group_size(const utils::uvec3& global_work_group) { utils::uvec3 local_group_size = {4, 4, 4}; - if (global_work_group.data[2u] == 1) { - if (global_work_group.data[1u] < 8) { - local_group_size.data[0u] = 16; - local_group_size.data[1u] = 4; - local_group_size.data[2u] = 1; + if (global_work_group[2u] == 1) { + if (global_work_group[1u] < 8) { + local_group_size[0u] = 16; + local_group_size[1u] = 4; + local_group_size[2u] = 1; } else { - local_group_size.data[0u] = 8; - local_group_size.data[1u] = 8; - local_group_size.data[2u] = 1; + local_group_size[0u] = 8; + local_group_size[1u] = 8; + local_group_size[2u] = 1; } } return local_group_size; diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index b0964ace22..2cfb34a052 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -66,7 +66,7 @@ uint32_t bind_params_to_descriptor_set( } void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx) { descriptor_set.bind(idx, staging.buffer()); diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index 3a7ec029da..eed39a9797 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -40,7 +40,7 @@ uint32_t bind_params_to_descriptor_set( const uint32_t base_idx); void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index d681618d9d..8804bcf2ef 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -13,104 +13,50 @@ #include -#include - namespace vkcompute { -template -void memcpy_to_mapping_impl( - const void* src, - vkapi::MemoryMap& dst_mapping, - const size_t nbytes) { - T* data_ptr = dst_mapping.template data(); - memcpy(data_ptr, reinterpret_cast(src), nbytes); -} - -template -void memcpy_from_mapping_impl( - vkapi::MemoryMap& src_mapping, - void* dst, - const size_t nbytes) { - T* data_ptr = src_mapping.template data(); - memcpy(reinterpret_cast(dst), data_ptr, nbytes); -} - -void memcpy_to_mapping( - const void* src, - vkapi::MemoryMap& dst_mapping, - const size_t nbytes, - const vkapi::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case vkapi::ScalarType::name: \ - memcpy_to_mapping_impl(src, dst_mapping, nbytes); \ - break; +vkapi::ShaderInfo 
get_nchw_to_tensor_shader( + const api::vTensor& v_dst, + const bool int8_buffer_enabled) { + std::string kernel_name; + kernel_name.reserve(kShaderNameReserve); - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); + if (v_dst.dtype() == vkapi::kChar && + v_dst.storage_type() == utils::kTexture3D && !int8_buffer_enabled) { + return VK_KERNEL(nchw_to_int8_image_noint8); } -#undef DTYPE_CASE -} - -void memcpy_from_mapping( - vkapi::MemoryMap& src_mapping, - void* dst, - const size_t nbytes, - const vkapi::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case vkapi::ScalarType::name: \ - memcpy_from_mapping_impl(src_mapping, dst, nbytes); \ - break; - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); + if (v_dst.storage_type() == utils::kBuffer) { + kernel_name = "nchw_to_buffer"; + add_dtype_suffix(kernel_name, v_dst); + return VK_KERNEL_FROM_STR(kernel_name); } -#undef DTYPE_CASE -} - -void copy_ptr_to_staging( - const void* src, - api::StorageBuffer& staging, - const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); - mapping.invalidate(); - memcpy_to_mapping(src, mapping, nbytes, staging.dtype()); -} -void copy_staging_to_ptr( - api::StorageBuffer& staging, - void* dst, - const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); - mapping.invalidate(); - memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); -} - -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); - uint8_t* data_ptr = mapping.template data(); - memset(data_ptr, 0, staging.nbytes()); -} - -vkapi::ShaderInfo get_nchw_to_tensor_shader(const api::vTensor& v_dst) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - - kernel_name = "nchw_to_tensor"; + kernel_name = "nchw_to_image"; add_dtype_suffix(kernel_name, v_dst); add_storage_type_suffix(kernel_name, v_dst); return VK_KERNEL_FROM_STR(kernel_name); } -vkapi::ShaderInfo get_tensor_to_nchw_shader(const api::vTensor& v_src) { +vkapi::ShaderInfo get_tensor_to_nchw_shader( + const api::vTensor& v_src, + bool int8_buffer_enabled) { std::string kernel_name; kernel_name.reserve(kShaderNameReserve); - kernel_name = "tensor_to_nchw"; + if (v_src.dtype() == vkapi::kChar && + v_src.storage_type() == utils::kTexture3D && !int8_buffer_enabled) { + return VK_KERNEL(int8_image_to_nchw_noint8); + } + + if (v_src.storage_type() == utils::kBuffer) { + kernel_name = "buffer_to_nchw"; + add_dtype_suffix(kernel_name, v_src); + return VK_KERNEL_FROM_STR(kernel_name); + } + + kernel_name = "image_to_nchw"; add_dtype_suffix(kernel_name, v_src); add_storage_type_suffix(kernel_name, v_src); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index dfe86a9e26..8d63958a73 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -12,26 +12,11 @@ namespace vkcompute { -// -// Functions to copy data into and out of a staging buffer -// - -void copy_ptr_to_staging( - const void* src, - api::StorageBuffer& staging, - const size_t nbytes); -void copy_staging_to_ptr( - api::StorageBuffer& staging, - void* dst, - const size_t nbytes); - -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes); - -// -// Functions to get 
shaders -// - -vkapi::ShaderInfo get_nchw_to_tensor_shader(const api::vTensor& v_dst); -vkapi::ShaderInfo get_tensor_to_nchw_shader(const api::vTensor& v_src); +vkapi::ShaderInfo get_nchw_to_tensor_shader( + const api::vTensor& v_dst, + bool int8_buffer_enabled = true); +vkapi::ShaderInfo get_tensor_to_nchw_shader( + const api::vTensor& v_src, + bool int8_buffer_enabled = true); } // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/VecUtils.h b/backends/vulkan/runtime/utils/VecUtils.h index bc0179e4a4..ad4434cf5a 100644 --- a/backends/vulkan/runtime/utils/VecUtils.h +++ b/backends/vulkan/runtime/utils/VecUtils.h @@ -237,6 +237,38 @@ template struct vec final { // NOLINTNEXTLINE Type data[N]; + + vec() = default; + + // Standard constructor with initializer list + vec(std::initializer_list values) { + VK_CHECK_COND(values.size() == N); + std::copy(values.begin(), values.end(), data); + } + + // Conversion constructor from an _integral_ vec type. Note that this is only + // defined if `OtherType` is an integral type to disallow implicit narrowing. + template < + typename OtherType, + typename std::enable_if< + !std::is_same::value && + std::is_integral::value, + int>::type = 0> + /* implicit */ vec(const vec& other) { + for (int i = 0; i < N; ++i) { + data[i] = safe_downcast(other[i]); + } + } + + const Type& operator[](const uint32_t& i) const { + VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); + return data[i]; + } + + Type& operator[](const uint32_t& i) { + VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); + return data[i]; + } }; } // namespace detail @@ -261,24 +293,22 @@ using vec4 = vec<4u>; // uvec3 is the type representing tensor extents. Useful for debugging. inline std::ostream& operator<<(std::ostream& os, const uvec3& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ")"; return os; } inline std::ostream& operator<<(std::ostream& os, const ivec3& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ")"; return os; } inline std::ostream& operator<<(std::ostream& os, const uvec4& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ")"; + os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ", " << v[3u] << ")"; return os; } inline std::ostream& operator<<(std::ostream& os, const ivec4& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ")"; + os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ", " << v[3u] << ")"; return os; } @@ -288,7 +318,7 @@ inline detail::vec divup_vec( const detail::vec& b) { detail::vec result; for (uint32_t i = 0; i < N; ++i) { - result.data[i] = utils::div_up(a.data[i], b.data[i]); + result[i] = utils::div_up(a[i], b[i]); } return result; } @@ -369,7 +399,7 @@ inline ivec4 make_ivec4_prepadded1(const std::vector& ints) { ivec4 result = {1, 1, 1, 1}; size_t base = 4 - ints.size(); for (size_t i = 0; i < ints.size(); ++i) { - result.data[i + base] = safe_downcast(ints[i]); + result[i + base] = safe_downcast(ints[i]); } return result; @@ -377,16 +407,16 @@ inline ivec4 make_ivec4_prepadded1(const std::vector& ints) { inline ivec3 make_ivec3(uvec3 ints) { return { - safe_downcast(ints.data[0u]), - safe_downcast(ints.data[1u]), - safe_downcast(ints.data[2u])}; + safe_downcast(ints[0u]), + safe_downcast(ints[1u]), + safe_downcast(ints[2u])}; } inline 
uvec3 make_uvec3(ivec3 ints) { return { - safe_downcast(ints.data[0u]), - safe_downcast(ints.data[1u]), - safe_downcast(ints.data[2u])}; + safe_downcast(ints[0u]), + safe_downcast(ints[1u]), + safe_downcast(ints[2u])}; } /* diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp index 2803e3fc8d..713fd9917e 100644 --- a/backends/vulkan/runtime/vk_api/Command.cpp +++ b/backends/vulkan/runtime/vk_api/Command.cpp @@ -171,13 +171,10 @@ void CommandBuffer::dispatch(const utils::uvec3& global_workgroup_size) { vkCmdDispatch( handle_, + utils::div_up(global_workgroup_size[0u], bound_.local_workgroup_size[0u]), + utils::div_up(global_workgroup_size[1u], bound_.local_workgroup_size[1u]), utils::div_up( - global_workgroup_size.data[0u], bound_.local_workgroup_size.data[0u]), - utils::div_up( - global_workgroup_size.data[1u], bound_.local_workgroup_size.data[1u]), - utils::div_up( - global_workgroup_size.data[2u], - bound_.local_workgroup_size.data[2u])); + global_workgroup_size[2u], bound_.local_workgroup_size[2u])); state_ = CommandBuffer::State::RECORDING; } diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp index 3d64fbf292..03b01c3fa8 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp @@ -38,6 +38,10 @@ ParamsBindList::ParamsBindList( std::copy(init_list.begin(), init_list.end(), bind_infos.begin()); } +void ParamsBindList::append(const BufferBindInfo& bind_info) { + bind_infos.emplace_back(bind_info); +} + void ParamsBindList::append(const ParamsBindList& other) { bind_infos.insert( bind_infos.end(), other.bind_infos.begin(), other.bind_infos.end()); diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h index 28a89149d4..418d79a6b3 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.h +++ b/backends/vulkan/runtime/vk_api/Descriptor.h @@ -39,8 +39,10 @@ struct BufferBindInfo final { struct ParamsBindList final { std::vector bind_infos; + ParamsBindList() = default; ParamsBindList(std::initializer_list init_list); + void append(const BufferBindInfo& bind_info); void append(const ParamsBindList& other); }; diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h index 34c2d95c93..1e3b2a799f 100644 --- a/backends/vulkan/runtime/vk_api/Shader.h +++ b/backends/vulkan/runtime/vk_api/Shader.h @@ -53,8 +53,8 @@ class ShaderLayout final { struct ShaderInfo final { struct { - const uint32_t* bin; - uint32_t size; + const uint32_t* bin = nullptr; + uint32_t size = 0u; } src_code; std::string kernel_name{""}; @@ -71,6 +71,10 @@ struct ShaderInfo final { const uint32_t, std::vector, const utils::uvec3 tile_size); + + operator bool() const { + return src_code.bin != nullptr; + }; }; bool operator==(const ShaderInfo& _1, const ShaderInfo& _2); diff --git a/backends/vulkan/runtime/vk_api/VkUtils.h b/backends/vulkan/runtime/vk_api/VkUtils.h index 2b9b48d9ff..b765d417d4 100644 --- a/backends/vulkan/runtime/vk_api/VkUtils.h +++ b/backends/vulkan/runtime/vk_api/VkUtils.h @@ -14,7 +14,7 @@ namespace vkcompute { namespace vkapi { inline VkExtent3D create_extent3d(const utils::uvec3& extents) { - return VkExtent3D{extents.data[0u], extents.data[1u], extents.data[2u]}; + return VkExtent3D{extents[0u], extents[1u], extents[2u]}; } } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp 
index aef6785466..908feb0d3f 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp @@ -26,38 +26,35 @@ namespace vkcompute { namespace vkapi { Allocation::Allocation() - : memory_requirements{}, - create_info{}, - allocator(VK_NULL_HANDLE), - allocation(VK_NULL_HANDLE) {} + : allocator(VK_NULL_HANDLE), allocation(VK_NULL_HANDLE), is_copy_(false) {} Allocation::Allocation( VmaAllocator vma_allocator, const VkMemoryRequirements& mem_props, const VmaAllocationCreateInfo& create_info) - : memory_requirements(mem_props), - create_info(create_info), - allocator(vma_allocator), - allocation(VK_NULL_HANDLE) { + : allocator(vma_allocator), allocation(VK_NULL_HANDLE), is_copy_(false) { VK_CHECK(vmaAllocateMemory( - allocator, &memory_requirements, &create_info, &allocation, nullptr)); + allocator, &mem_props, &create_info, &allocation, nullptr)); } +Allocation::Allocation(const Allocation& other) noexcept + : allocator(other.allocator), + allocation(other.allocation), + is_copy_(true) {} + Allocation::Allocation(Allocation&& other) noexcept - : memory_requirements(other.memory_requirements), - create_info(other.create_info), - allocator(other.allocator), - allocation(other.allocation) { + : allocator(other.allocator), + allocation(other.allocation), + is_copy_(other.is_copy_) { other.allocation = VK_NULL_HANDLE; } Allocation& Allocation::operator=(Allocation&& other) noexcept { VmaAllocation tmp_allocation = allocation; - memory_requirements = other.memory_requirements; - create_info = other.create_info; allocator = other.allocator; allocation = other.allocation; + is_copy_ = other.is_copy_; other.allocation = tmp_allocation; @@ -65,7 +62,10 @@ Allocation& Allocation::operator=(Allocation&& other) noexcept { } Allocation::~Allocation() { - if (VK_NULL_HANDLE != allocation) { + // Do not destroy the VmaAllocation if this class instance is a copy of some + // other class instance, since this means that this class instance does not + // have ownership of the underlying resource. + if (VK_NULL_HANDLE != allocation && !is_copy_) { vmaFreeMemory(allocator, allocation); } } diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.h b/backends/vulkan/runtime/vk_api/memory/Allocation.h index 6a3fec9e73..e56605e14b 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocation.h @@ -31,7 +31,23 @@ struct Allocation final { const VkMemoryRequirements&, const VmaAllocationCreateInfo&); - Allocation(const Allocation&) = delete; + protected: + /* + * The Copy constructor allows for creation of a class instance that are + * "aliases" of another class instance. The resulting class instance will not + * have ownership of the underlying VmaAllocation. + * + * This behaviour is analogous to creating a copy of a pointer, thus it is + * unsafe, as the original class instance may be destroyed before the copy. + * These constructors are therefore marked protected so that they may be used + * only in situations where the lifetime of the original class instance is + * guaranteed to exceed, or at least be the same as, the lifetime of the + * copied class instance. + */ + Allocation(const Allocation&) noexcept; + + public: + // To discourage creating copies, the assignment operator is still deleted. 
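The ownership rule spelled out in the comment above can be summarized with a small standalone analogue: a handle whose copy constructor produces a non-owning alias, so only the original instance releases the resource. This is a simplified stand-in for illustration, not the Vulkan wrapper types.

#include <cstdio>
#include <cstdlib>

class Handle {
 public:
  explicit Handle(void* resource) : resource_(resource), is_copy_(false) {}

  // Copying yields an alias that does not own the underlying resource,
  // mirroring the protected Allocation copy constructor in this patch.
  Handle(const Handle& other) noexcept
      : resource_(other.resource_), is_copy_(true) {}

  Handle& operator=(const Handle&) = delete;

  ~Handle() {
    // Only the owning instance releases the resource; aliases do nothing.
    if (resource_ != nullptr && !is_copy_) {
      std::free(resource_);
    }
  }

  bool is_copy() const { return is_copy_; }

 private:
  void* resource_;
  bool is_copy_;
};

int main() {
  Handle owner(std::malloc(64));
  Handle alias(owner); // only safe while `owner` outlives `alias`
  std::printf("alias is_copy: %d\n", alias.is_copy() ? 1 : 0);
  return 0;
} // `alias` is destroyed first and frees nothing; `owner` then frees.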
Allocation& operator=(const Allocation&) = delete; Allocation(Allocation&&) noexcept; @@ -39,17 +55,27 @@ struct Allocation final { ~Allocation(); - VkMemoryRequirements memory_requirements; - // The properties this allocation was created with - VmaAllocationCreateInfo create_info; // The allocator object this was allocated from VmaAllocator allocator; // Handles to the allocated memory VmaAllocation allocation; + private: + // Indicates whether this class instance is a copy of another class instance, + // in which case it does not have ownership of the underlying VmaAllocation + bool is_copy_; + + public: operator bool() const { return (allocation != VK_NULL_HANDLE); } + + inline bool is_copy() const { + return is_copy_; + } + + friend class VulkanBuffer; + friend class VulkanImage; }; } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index 1dadca27a0..6533f06164 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -58,6 +58,13 @@ Allocator::~Allocator() { vmaDestroyAllocator(allocator_); } +VmaAllocationCreateInfo Allocator::gpuonly_resource_create_info() { + VmaAllocationCreateInfo alloc_create_info = {}; + alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + return alloc_create_info; +} + Allocation Allocator::create_allocation( const VkMemoryRequirements& memory_requirements, const VmaAllocationCreateInfo& create_info) { @@ -103,9 +110,7 @@ VulkanImage Allocator::create_image( (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); } - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); const VulkanImage::ImageProperties image_props{ image_type, @@ -132,45 +137,34 @@ VulkanImage Allocator::create_image( allocate_memory); } -VulkanBuffer Allocator::create_storage_buffer( - const VkDeviceSize size, - const bool gpu_only, - const bool allocate_memory) { +VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; VmaAllocationCreateInfo alloc_create_info = {}; alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - // The create storage buffer will be accessed by both the CPU and GPU, so set - // the appropriate flags to indicate that the host device will be accessing + // Staging buffers are accessed by both the CPU and GPU, so set the + // appropriate flags to indicate that the host device will be accessing // the data from this buffer. - if (!gpu_only) { - // Deferred memory allocation should only be used for GPU only buffers. 
- VK_CHECK_COND( - allocate_memory, - "Only GPU-only buffers should use deferred memory allocation"); - - alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - } + alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | + VMA_ALLOCATION_CREATE_MAPPED_BIT; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + alloc_create_info.preferredFlags = + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - return VulkanBuffer( - allocator_, size, alloc_create_info, buffer_usage, allocate_memory); + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); } -VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - - VkBufferUsageFlags buffer_usage = - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; +VulkanBuffer Allocator::create_storage_buffer( + const VkDeviceSize size, + const bool allocate_memory) { + const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); + VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); + return VulkanBuffer( + allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { @@ -181,9 +175,7 @@ VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - VulkanBuffer uniform_buffer( - allocator_, size, alloc_create_info, buffer_usage); - return uniform_buffer; + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); } } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 904163cefb..56385eb54d 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -48,6 +48,8 @@ class Allocator final { VmaAllocator allocator_; public: + VmaAllocationCreateInfo gpuonly_resource_create_info(); + Allocation create_allocation( const VkMemoryRequirements& memory_requirements, const VmaAllocationCreateInfo& create_info); @@ -62,13 +64,12 @@ class Allocator final { const bool allow_transfer = false, const bool allocate_memory = true); + VulkanBuffer create_staging_buffer(const VkDeviceSize); + VulkanBuffer create_storage_buffer( const VkDeviceSize, - const bool gpu_only = true, const bool allocate_memory = true); - VulkanBuffer create_staging_buffer(const VkDeviceSize); - /* * Create a uniform buffer with a specified size */ diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp index 29b0e9e3ac..2af3d9efe3 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp @@ -20,6 +20,7 @@ VulkanBuffer::VulkanBuffer() allocator_(VK_NULL_HANDLE), memory_{}, owns_memory_(false), + is_copy_(false), handle_(VK_NULL_HANDLE) {} 
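The create_staging_buffer / create_storage_buffer split above reduces to two allocation presets: staging buffers request host-visible, persistently mapped memory (preferably cached and coherent), while storage buffers keep the GPU-only settings from gpuonly_resource_create_info(). A sketch of the two presets using only VMA/Vulkan flags that appear in this hunk (assumes the VMA header is available; the real code also ORs in the project's DEFAULT_ALLOCATION_STRATEGY flags, which are omitted here):

#include <vk_mem_alloc.h>

// GPU-only resources: let VMA pick device-local memory.
inline VmaAllocationCreateInfo gpu_only_preset() {
  VmaAllocationCreateInfo info = {};
  info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
  return info;
}

// Staging buffers: the CPU reads and writes this memory, so request
// host-visible memory and keep it persistently mapped.
inline VmaAllocationCreateInfo staging_preset() {
  VmaAllocationCreateInfo info = {};
  info.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT |
      VMA_ALLOCATION_CREATE_MAPPED_BIT;
  info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
  info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
  info.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
      VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
  return info;
}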
VulkanBuffer::VulkanBuffer( @@ -37,6 +38,7 @@ VulkanBuffer::VulkanBuffer( allocator_(vma_allocator), memory_{}, owns_memory_(allocate_memory), + is_copy_(false), handle_(VK_NULL_HANDLE) { // If the buffer size is 0, allocate a buffer with a size of 1 byte. This is // to ensure that there will be some resource that can be bound to a shader. @@ -56,8 +58,6 @@ VulkanBuffer::VulkanBuffer( nullptr, // pQueueFamilyIndices }; - memory_.create_info = allocation_create_info; - if (allocate_memory) { VK_CHECK(vmaCreateBuffer( allocator_, @@ -74,11 +74,29 @@ VulkanBuffer::VulkanBuffer( } } +VulkanBuffer::VulkanBuffer( + const VulkanBuffer& other, + const VkDeviceSize offset, + const VkDeviceSize range) noexcept + : buffer_properties_(other.buffer_properties_), + allocator_(other.allocator_), + memory_(other.memory_), + owns_memory_(false), + is_copy_(true), + handle_(other.handle_) { + // TODO: set the offset and range appropriately + buffer_properties_.mem_offset = other.buffer_properties_.mem_offset + offset; + if (range != VK_WHOLE_SIZE) { + buffer_properties_.mem_range = range; + } +} + VulkanBuffer::VulkanBuffer(VulkanBuffer&& other) noexcept : buffer_properties_(other.buffer_properties_), allocator_(other.allocator_), memory_(std::move(other.memory_)), owns_memory_(other.owns_memory_), + is_copy_(other.is_copy_), handle_(other.handle_) { other.handle_ = VK_NULL_HANDLE; } @@ -91,6 +109,7 @@ VulkanBuffer& VulkanBuffer::operator=(VulkanBuffer&& other) noexcept { allocator_ = other.allocator_; memory_ = std::move(other.memory_); owns_memory_ = other.owns_memory_; + is_copy_ = other.is_copy_; handle_ = other.handle_; other.handle_ = tmp_buffer; @@ -100,7 +119,10 @@ VulkanBuffer& VulkanBuffer::operator=(VulkanBuffer&& other) noexcept { } VulkanBuffer::~VulkanBuffer() { - if (VK_NULL_HANDLE != handle_) { + // Do not destroy the VkBuffer if this class instance is a copy of another + // class instance, since this means that this class instance does not have + // ownership of the underlying resource. + if (VK_NULL_HANDLE != handle_ && !is_copy_) { if (owns_memory_) { vmaDestroyBuffer(allocator_, handle_, memory_.allocation); } else { @@ -113,6 +135,12 @@ VulkanBuffer::~VulkanBuffer() { } } +VmaAllocationInfo VulkanBuffer::allocation_info() const { + VmaAllocationInfo info; + vmaGetAllocationInfo(allocator_, memory_.allocation, &info); + return info; +} + VkMemoryRequirements VulkanBuffer::get_memory_requirements() const { VkMemoryRequirements memory_requirements; vkGetBufferMemoryRequirements(this->device(), handle_, &memory_requirements); diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index d868ed0ac7..6197a02d40 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -19,6 +19,12 @@ #include namespace vkcompute { + +// Forward declare vTensor classes such that they can be set as friend classes +namespace api { +class vTensorStorage; +} // namespace api + namespace vkapi { using MemoryAccessFlags = uint8_t; @@ -47,8 +53,27 @@ class VulkanBuffer final { const VkBufferUsageFlags, const bool allocate_memory = true); - VulkanBuffer(const VulkanBuffer&) = delete; - VulkanBuffer& operator=(const VulkanBuffer&) = delete; + protected: + /* + * The Copy constructor and allows for creation of a class instance that are + * "aliases" of another class instance. The resulting class instance will not + * have ownership of the underlying VkBuffer. 
+ * + * This behaviour is analogous to creating a copy of a pointer, thus it is + * unsafe, as the original class instance may be destroyed before the copy. + * These constructors are therefore marked protected so that they may be used + * only in situations where the lifetime of the original class instance is + * guaranteed to exceed, or at least be the same as, the lifetime of the + * copied class instance. + */ + VulkanBuffer( + const VulkanBuffer& other, + const VkDeviceSize offset = 0u, + const VkDeviceSize range = VK_WHOLE_SIZE) noexcept; + + public: + // To discourage creating copies, the assignment operator is still deleted. + VulkanBuffer& operator=(const VulkanBuffer& other) = delete; VulkanBuffer(VulkanBuffer&&) noexcept; VulkanBuffer& operator=(VulkanBuffer&&) noexcept; @@ -69,6 +94,9 @@ class VulkanBuffer final { Allocation memory_; // Indicates whether the underlying memory is owned by this resource bool owns_memory_; + // Indicates whether this VulkanBuffer was copied from another VulkanBuffer, + // thus it does not have ownership of the underlying VKBuffer + bool is_copy_; VkBuffer handle_; public: @@ -86,9 +114,7 @@ class VulkanBuffer final { return memory_.allocation; } - inline VmaAllocationCreateInfo allocation_create_info() const { - return VmaAllocationCreateInfo(memory_.create_info); - } + VmaAllocationInfo allocation_info() const; inline VkBuffer handle() const { return handle_; @@ -114,10 +140,18 @@ class VulkanBuffer final { return owns_memory_; } + inline bool is_copy() const { + return is_copy_; + } + operator bool() const { return (handle_ != VK_NULL_HANDLE); } + inline bool is_copy_of(const VulkanBuffer& other) const { + return (handle_ == other.handle_) && is_copy_; + } + inline void bind_allocation(const Allocation& memory) { VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_)); @@ -125,6 +159,8 @@ class VulkanBuffer final { } VkMemoryRequirements get_memory_requirements() const; + + friend class api::vTensorStorage; }; class MemoryMap final { diff --git a/backends/vulkan/runtime/vk_api/memory/Image.cpp b/backends/vulkan/runtime/vk_api/memory/Image.cpp index 42352cfb7e..5029d16616 100644 --- a/backends/vulkan/runtime/vk_api/memory/Image.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Image.cpp @@ -98,6 +98,7 @@ VulkanImage::VulkanImage() allocator_(VK_NULL_HANDLE), memory_{}, owns_memory_(false), + is_copy_(false), handles_{ VK_NULL_HANDLE, VK_NULL_HANDLE, @@ -120,6 +121,7 @@ VulkanImage::VulkanImage( allocator_(vma_allocator), memory_{}, owns_memory_{allocate_memory}, + is_copy_(false), handles_{ VK_NULL_HANDLE, VK_NULL_HANDLE, @@ -157,8 +159,6 @@ VulkanImage::VulkanImage( layout_, // initialLayout }; - memory_.create_info = allocation_create_info; - if (allocate_memory) { VK_CHECK(vmaCreateImage( allocator_, @@ -175,6 +175,17 @@ VulkanImage::VulkanImage( } } +VulkanImage::VulkanImage(const VulkanImage& other) noexcept + : image_properties_(other.image_properties_), + view_properties_(other.view_properties_), + sampler_properties_(other.sampler_properties_), + allocator_(other.allocator_), + memory_(other.memory_), + owns_memory_{false}, + is_copy_(true), + handles_(other.handles_), + layout_(other.layout_) {} + VulkanImage::VulkanImage(VulkanImage&& other) noexcept : image_properties_(other.image_properties_), view_properties_(other.view_properties_), @@ -182,6 +193,7 @@ VulkanImage::VulkanImage(VulkanImage&& other) noexcept allocator_(other.allocator_), 
memory_(std::move(other.memory_)), owns_memory_(other.owns_memory_), + is_copy_(other.is_copy_), handles_(other.handles_), layout_(other.layout_) { other.handles_.image = VK_NULL_HANDLE; @@ -201,6 +213,7 @@ VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept { allocator_ = other.allocator_; memory_ = std::move(other.memory_); owns_memory_ = other.owns_memory_; + is_copy_ = other.is_copy_; handles_ = other.handles_; layout_ = other.layout_; @@ -212,6 +225,13 @@ VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept { } VulkanImage::~VulkanImage() { + // Do not destroy any resources if this class instance is a copy of another + // class instance, since this means that this class instance does not have + // ownership of the underlying resource. + if (is_copy_) { + return; + } + if (VK_NULL_HANDLE != handles_.image_view) { vkDestroyImageView(this->device(), handles_.image_view, nullptr); } diff --git a/backends/vulkan/runtime/vk_api/memory/Image.h b/backends/vulkan/runtime/vk_api/memory/Image.h index 1e78f84a5c..447e980595 100644 --- a/backends/vulkan/runtime/vk_api/memory/Image.h +++ b/backends/vulkan/runtime/vk_api/memory/Image.h @@ -22,6 +22,12 @@ #include namespace vkcompute { + +// Forward declare vTensor classes such that they can be set as friend classes +namespace api { +class vTensorStorage; +} // namespace api + namespace vkapi { class ImageSampler final { @@ -96,7 +102,23 @@ class VulkanImage final { VkSampler, const bool allocate_memory = true); - VulkanImage(const VulkanImage&) = delete; + protected: + /* + * The Copy constructor allows for creation of class instances that are + * "aliases" of another class instance. The resulting class instance will not + * have ownership of the underlying VkImage. + * + * This behaviour is analogous to creating a copy of a pointer, thus it is + * unsafe, as the original class instance may be destroyed before the copy. + * These constructors are therefore marked protected so that they may be used + * only in situations where the lifetime of the original class instance is + * guaranteed to exceed, or at least be the same as, the lifetime of the + * copied class instance. + */ + VulkanImage(const VulkanImage& other) noexcept; + + public: + // To discourage creating copies, the assignment operator is still deleted.
VulkanImage& operator=(const VulkanImage&) = delete; VulkanImage(VulkanImage&&) noexcept; @@ -123,6 +145,9 @@ class VulkanImage final { Allocation memory_; // Indicates whether the underlying memory is owned by this resource bool owns_memory_; + // Indicates whether this VulkanImage was copied from another VulkanImage, + // thus it does not have ownership of the underlying VKBuffer + bool is_copy_; Handles handles_; // Layout VkImageLayout layout_; @@ -144,10 +169,6 @@ class VulkanImage final { return memory_.allocation; } - inline VmaAllocationCreateInfo allocation_create_info() const { - return VmaAllocationCreateInfo(memory_.create_info); - } - inline VkFormat format() const { return image_properties_.image_format; } @@ -193,10 +214,18 @@ class VulkanImage final { return owns_memory_; } + inline bool is_copy() const { + return is_copy_; + } + inline operator bool() const { return (handles_.image != VK_NULL_HANDLE); } + inline bool is_copy_of(const VulkanImage& other) const { + return (handles_.image == other.handles_.image) && is_copy_; + } + inline void bind_allocation(const Allocation& memory) { VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); VK_CHECK(vmaBindImageMemory(allocator_, memory.allocation, handles_.image)); @@ -207,6 +236,8 @@ class VulkanImage final { } VkMemoryRequirements get_memory_requirements() const; + + friend class api::vTensorStorage; }; struct ImageMemoryBarrier final { diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index da40f0a720..20d09f1df5 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -24,6 +24,9 @@ Node, NoneType, _ScalarType, TensorSpec, List[_ScalarType], List[Node], str ] +logger: logging.Logger = logging.getLogger("") +logger.setLevel(logging.INFO) + class VkGraphBuilder: def __init__( @@ -351,9 +354,9 @@ def build_graph(self) -> vk_graph_schema.VkGraph: self.process_node(node, call_node_debug_hdl) call_node_debug_hdl += 1 - logging.info("Operators included in this Vulkan partition: ") + logger.info("Operators included in this Vulkan partition: ") for op in self.seen_ops: - logging.info(f" {op.__name__}") + logger.info(f" {op.__name__}") return vk_graph_schema.VkGraph( version="0", diff --git a/backends/vulkan/test/glsl/all_shaders.yaml b/backends/vulkan/test/glsl/all_shaders.yaml index edba41b7ea..37403c97ac 100644 --- a/backends/vulkan/test/glsl/all_shaders.yaml +++ b/backends/vulkan/test/glsl/all_shaders.yaml @@ -47,21 +47,12 @@ idx_fill_buffer: idx_fill_texture: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - PACKING: CHANNELS_PACKED generate_variant_forall: - PACKING: - - VALUE: "CHANNELS_PACKED" - SUFFIX: "C_packed" - - VALUE: "WIDTH_PACKED" - SUFFIX: "W_packed" - - VALUE: "HEIGHT_PACKED" - SUFFIX: "H_packed" DTYPE: - - VALUE: "half" - SUFFIX: "half" - - VALUE: "float" - SUFFIX: "float" + - VALUE: half + - VALUE: float + - VALUE: int + - VALUE: int8 shader_variants: - NAME: idx_fill_texture diff --git a/backends/vulkan/test/glsl/idx_fill_buffer.glsl b/backends/vulkan/test/glsl/idx_fill_buffer.glsl index 98cf04e338..d32c52c205 100644 --- a/backends/vulkan/test/glsl/idx_fill_buffer.glsl +++ b/backends/vulkan/test/glsl/idx_fill_buffer.glsl @@ -10,39 +10,24 @@ #define PRECISION ${PRECISION} -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} +#define T ${buffer_scalar_type(DTYPE)} #include "indexing_utils.h" -$if DTYPE == "half": - #extension 
GL_EXT_shader_16bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require -$elif DTYPE == "int8": - #extension GL_EXT_shader_8bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -$elif DTYPE == "uint8": - #extension GL_EXT_shader_8bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_uint8 : require +${define_required_extensions(DTYPE)} layout(std430) buffer; -layout(set = 0, binding = 0) buffer PRECISION restrict writeonly Buffer { - VEC4_T data[]; -} -buffer_in; - -layout(set = 0, binding = 1) uniform PRECISION restrict Params { - int len; -} -params; +${layout_declare_buffer(0, "w", "out_buf", DTYPE, PRECISION, True)} +${layout_declare_ubo(1, "int", "numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { - const int i = ivec3(gl_GlobalInvocationID).x; - - const int base = 4 * i; - if (base < params.len) { - buffer_in.data[i] = VEC4_T(base, base + 1, base + 2, base + 3); + const int t_id = ivec3(gl_GlobalInvocationID).x; + if (t_id >= numel) { + return; } + + out_buf[t_id] = T(t_id); } diff --git a/backends/vulkan/test/glsl/idx_fill_texture.glsl b/backends/vulkan/test/glsl/idx_fill_texture.glsl index 1f75cadf49..8914d2b892 100644 --- a/backends/vulkan/test/glsl/idx_fill_texture.glsl +++ b/backends/vulkan/test/glsl/idx_fill_texture.glsl @@ -12,21 +12,17 @@ #define VEC4_T ${texel_type(DTYPE)} -#define POS ${get_pos[NDIM]("pos")} - #include "indexing_utils.h" layout(std430) buffer; -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; - -layout(set = 0, binding = 1) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; +${layout_declare_tensor(0, "w", "image_out", DTYPE, "texture3d")} +${layout_declare_ubo(1, "ivec4", "sizes")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; +layout(constant_id = 4) const int offset = 10; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); @@ -37,6 +33,6 @@ void main() { } const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); - VEC4_T texel = VEC4_T(buf_indices); - imageStore(image_out, POS, texel); + VEC4_T texel = VEC4_T(buf_indices) + offset; + imageStore(image_out, pos, texel); } diff --git a/backends/vulkan/test/glsl/reference_matmul.glsl b/backends/vulkan/test/glsl/reference_matmul.glsl new file mode 100644 index 0000000000..5278b3abc5 --- /dev/null +++ b/backends/vulkan/test/glsl/reference_matmul.glsl @@ -0,0 +1,48 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION highp + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", "float", "buffer")} +${layout_declare_tensor(1, "r", "t_mat1", "float", "buffer")} +${layout_declare_tensor(2, "r", "t_mat2", "float", "buffer")} +${layout_declare_ubo(3, "ivec4", "out_sizes")} +${layout_declare_ubo(4, "ivec4", "out_strides")} +${layout_declare_ubo(5, "ivec4", "mat1_sizes")} +${layout_declare_ubo(6, "ivec4", "mat1_strides")} +${layout_declare_ubo(7, "ivec4", "mat2_sizes")} +${layout_declare_ubo(8, "ivec4", "mat2_strides")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec2 out_idx = ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y); + if (any(greaterThanEqual(out_idx, out_sizes.xy))) { + return; + } + + // Initial idx for mat1 is (0, out_idx.y) + int mat1_id = out_idx.y * mat1_strides.y; + // Initial idx for mat2 is (out_idx.x, 0) + int mat2_id = out_idx.x * mat2_strides.x; + + float sum = 0.0; + for (int i = 0; i < mat1_sizes.x; ++i) { + sum += t_mat1[mat1_id] * t_mat2[mat2_id]; + + mat1_id += mat1_strides.x; + mat2_id += mat2_strides.y; + } + + const int out_id = out_idx.x * out_strides.x + out_idx.y * out_strides.y; + t_out[out_id] = sum; +} diff --git a/backends/vulkan/test/glsl/scalar_add_buffer.glsl b/backends/vulkan/test/glsl/scalar_add_buffer.glsl index 7f6cb2db47..cd3a85a165 100644 --- a/backends/vulkan/test/glsl/scalar_add_buffer.glsl +++ b/backends/vulkan/test/glsl/scalar_add_buffer.glsl @@ -10,22 +10,14 @@ #define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, "buffer")} - -$if DTYPE == "half": - #extension GL_EXT_shader_16bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require -$elif DTYPE == "int8": - #extension GL_EXT_shader_8bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -$elif DTYPE == "uint8": - #extension GL_EXT_shader_8bit_storage : require - #extension GL_EXT_shader_explicit_arithmetic_types_uint8 : require +${define_required_extensions(DTYPE)} + +#define T ${buffer_scalar_type(DTYPE)} layout(std430) buffer; ${layout_declare_tensor(0, "rw", "buffer_in", DTYPE, "buffer")} -${layout_declare_ubo(1, "int", "ntexels")} +${layout_declare_ubo(1, "int", "numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -33,9 +25,9 @@ layout(constant_id = 3) const float scalar = 2.0; void main() { const int t_id = ivec3(gl_GlobalInvocationID).x; - if (t_id >= ntexels) { + if (t_id >= numel) { return; } - buffer_in[t_id] = buffer_in[t_id] + VEC4_T(scalar);// buffer_in[t_id]; + buffer_in[t_id] = buffer_in[t_id] + T(scalar); } diff --git a/backends/vulkan/test/glsl/scalar_add_texture.glsl b/backends/vulkan/test/glsl/scalar_add_texture.glsl new file mode 100644 index 0000000000..aa2b22c81f --- /dev/null +++ b/backends/vulkan/test/glsl/scalar_add_texture.glsl @@ -0,0 +1,29 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
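The reference_matmul shader above addresses both inputs purely through their strides: mat1 is walked along its x-stride within a row and mat2 along its y-stride within a column. Because only strides are consulted, a transposed view with swapped strides (as exercised by the no-copy transpose test later in this patch) can be multiplied without copying any data. A CPU-side sketch of the same stride-walking loop (illustrative C++, not the shader or the test helpers; the output is stored densely for simplicity):

#include <cstdint>
#include <vector>

// out[M x N] = mat1[M x K] * mat2[K x N], with mat1/mat2 addressed via
// element strides {sx, sy}, mirroring the shader's mat1_strides/mat2_strides.
std::vector<float> strided_matmul(
    const std::vector<float>& mat1,
    const std::vector<float>& mat2,
    int M, int K, int N,
    int64_t mat1_sx, int64_t mat1_sy,
    int64_t mat2_sx, int64_t mat2_sy) {
  std::vector<float> out(static_cast<size_t>(M) * N, 0.0f);
  for (int y = 0; y < M; ++y) {
    for (int x = 0; x < N; ++x) {
      int64_t mat1_id = y * mat1_sy;  // start of row y of mat1
      int64_t mat2_id = x * mat2_sx;  // start of column x of mat2
      float sum = 0.0f;
      for (int k = 0; k < K; ++k) {
        sum += mat1[mat1_id] * mat2[mat2_id];
        mat1_id += mat1_sx;  // step along the row of mat1
        mat2_id += mat2_sy;  // step down the column of mat2
      }
      out[static_cast<size_t>(y) * N + x] = sum;
    }
  }
  return out;
}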
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +layout(std430) buffer; + +${layout_declare_tensor(0, "rw", "t_in", "float", "texture3d")} +${layout_declare_ubo(1, "uvec3", "extents")} +${layout_declare_ubo(2, "int", "scalar")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(pos, extents))) { + return; + } + + vec4 in_tex = imageLoad(t_in, pos); + imageStore(t_in, pos, imageLoad(t_in, pos) + float(scalar)); +} diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index ff5c7a60e0..f2276b0247 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -49,6 +49,7 @@ def get_binary_elementwise_inputs(): ((S, S1, S2), (S, S1, S2)), ((S, S1, S2), (S, S1, 1), 2.0), ((S, S1, S2), (S, 1, S2), 2.0), + ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), ] ) test_suite.layouts = [ @@ -70,6 +71,7 @@ def get_mm_inputs(): test_suite.prepacked_args = ["mat2"] # ATen matmul doesn't support half test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] test_suite.layouts = [ "utils::kWidthPacked", "utils::kChannelsPacked", @@ -877,6 +879,7 @@ def get_softmax_inputs(): "aten.neg.default", "aten.cos.default", "aten.hardswish.default", + "aten.hardsigmoid.default", ] ) def get_unary_ops_inputs(): diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index 778ce67787..e6ddf1cdb8 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -25,7 +25,7 @@ ctypes.CDLL("libvulkan.so.1") -from executorch.exir.program._program import _to_edge_transform_and_lower +from executorch.exir import to_edge_transform_and_lower from executorch.extension.pybindings.portable_lib import ( # @manual _load_for_executorch_from_buffer, ) @@ -120,7 +120,7 @@ def run_test(memory_layout): model, sample_inputs, dynamic_shapes=dynamic_shapes ) - edge_program = _to_edge_transform_and_lower( + edge_program = to_edge_transform_and_lower( program, transform_passes=[ I64toI32(self._edge_compile_config._skip_dim_order), @@ -204,6 +204,16 @@ def forward(self, x, y, w): self.lower_module_and_test_output(add_module, sample_inputs) + sample_inputs = ( + torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), + torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), + torch.rand( + size=(2, 3), dtype=torch.float32 + ), # test broadcasting on packed dim + ) + + self.lower_module_and_test_output(add_module, sample_inputs) + def test_vulkan_backend_add_int(self): class AddIntModule(torch.nn.Module): def __init__(self): @@ -1632,3 +1642,57 @@ def forward(self, x): (torch.tensor([[[0, 1], [0, 1]], [[4, 2], [3, 3]]]),), memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], ) + + def test_vulkan_backend_conv_with_clamp(self): + class ConvWithClampModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.weight = torch.randn(6, 8, 3, 3) + self.bias = torch.randn(8) + self.stride = (1, 2) + self.padding = (2, 3) + self.dilation = (1, 1) + self.transposed = True + self.output_padding = (0, 1) + self.groups = 1 + self.output_min = 0 + self.output_max = 10 + + def forward(self, x): + return torch.ops.et_vk.conv_with_clamp( + x, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.transposed, + self.output_padding, + self.groups, + self.output_min, + 
self.output_max, + ) + + self.lower_module_and_test_output( + ConvWithClampModule(), + (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),), + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + + def test_vulkan_backend_grid_priors(self): + class GridPriorsModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.ops.et_vk.grid_priors( + x, + stride=8, + offset=0.5, + ) + + self.lower_module_and_test_output( + GridPriorsModule(), + (torch.rand(size=[1, 5, 2, 3]),), + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index c55c286acc..1b91e1ff4e 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -8,11 +8,14 @@ #include -#include +#include #include #include +#include + +using namespace vkcompute; // // Operator Recording Functions @@ -23,15 +26,13 @@ void record_nchw_to_buffer_op( vkapi::VulkanBuffer& src_buffer, api::vTensor& v_dst) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = { - SV(v_dst.packed_dim_whcn_idx())}; context->submit_compute_job( get_nchw_to_tensor_shader(v_dst), pipeline_barrier, - {uint32_t(v_dst.texel_numel()), 1, 1}, + {uint32_t(v_dst.numel()), 1, 1}, {64, 1, 1}, - specialization_constants, + {}, VK_NULL_HANDLE, 0, v_dst.buffer( @@ -40,8 +41,8 @@ void record_nchw_to_buffer_op( vkapi::MemoryAccessType::WRITE), src_buffer, v_dst.sizes_ubo(), - v_dst.texel_strides_ubo(), - v_dst.ntexels_ubo()); + v_dst.strides_ubo(), + v_dst.numel_ubo()); } void record_buffer_to_nchw_op( @@ -49,22 +50,19 @@ void record_buffer_to_nchw_op( api::vTensor& v_src, vkapi::VulkanBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = { - SV(v_src.packed_dim_whcn_idx())}; - context->submit_compute_job( get_tensor_to_nchw_shader(v_src), pipeline_barrier, - {uint32_t(v_src.texel_numel()), 1, 1}, + {uint32_t(v_src.numel()), 1, 1}, {64, 1, 1}, - specialization_constants, + {}, VK_NULL_HANDLE, 0, - v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), dst_buffer, + v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), - v_src.texel_strides_ubo(), - v_src.ntexels_ubo()); + v_src.strides_ubo(), + v_src.numel_ubo()); } void record_nchw_to_image_op( @@ -76,10 +74,11 @@ void record_nchw_to_image_op( SV(v_dst.packed_dim_whcn_idx())}; context->submit_compute_job( - get_nchw_to_tensor_shader(v_dst), + get_nchw_to_tensor_shader( + v_dst, context->adapter_ptr()->has_full_int8_buffers_support()), pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -88,7 +87,8 @@ void record_nchw_to_image_op( vkapi::PipelineStage::COMPUTE, vkapi::MemoryAccessType::WRITE), src_buffer, - v_dst.sizes_ubo()); + v_dst.sizes_ubo(), + v_dst.axis_map_ubo()); } void record_image_to_nchw_op( @@ -102,14 +102,37 @@ void record_image_to_nchw_op( context->submit_compute_job( get_tensor_to_nchw_shader(v_src), pipeline_barrier, - v_src.image_extents(), - adaptive_work_group_size(v_src.image_extents()), + v_src.logical_limits(), + adaptive_work_group_size(v_src.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, - v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), 
dst_buffer, - v_src.sizes_ubo()); + v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + v_src.sizes_ubo(), + v_src.axis_map_ubo()); +} + +void record_int8_image_to_nchw_noint8_op( + api::Context* const context, + api::vTensor& v_src, + api::StagingBuffer& dst_buffer) { + vkapi::PipelineBarrier pipeline_barrier{}; + uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); + utils::uvec3 global_wg_size = {buffer_len, 1, 1}; + context->submit_compute_job( + VK_KERNEL(int8_image_to_nchw_noint8), + pipeline_barrier, + global_wg_size, + adaptive_work_group_size(global_wg_size), + {v_src.packed_dim_whcn_idx()}, + VK_NULL_HANDLE, + 0, + dst_buffer.buffer(), + v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + v_src.sizes_ubo(), + v_src.axis_map_ubo(), + v_src.numel_ubo()); } void record_conv2d_prepack_weights_op( @@ -137,8 +160,8 @@ void record_conv2d_prepack_weights_op( context->submit_compute_job( shader, pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -165,8 +188,8 @@ void record_binary_op( context->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -229,7 +252,7 @@ void record_index_fill_buffer(api::Context* context, api::vTensor& v_ten) { api::context()->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - {uint32_t(v_ten.texel_numel()), 1, 1}, + {uint32_t(v_ten.numel()), 1, 1}, {64, 1, 1}, specialization_constants, VK_NULL_HANDLE, @@ -253,7 +276,7 @@ void record_scalar_add_buffer( api::context()->submit_compute_job( VK_KERNEL_FROM_STR(kernel), pipeline_barrier, - {uint32_t(v_ten.texel_numel()), 1, 1}, + {uint32_t(v_ten.numel()), 1, 1}, {64, 1, 1}, specialization_constants, VK_NULL_HANDLE, @@ -262,7 +285,73 @@ void record_scalar_add_buffer( pipeline_barrier, vkapi::PipelineStage::COMPUTE, vkapi::MemoryAccessType::READ | vkapi::MemoryAccessType::WRITE), - v_ten.ntexels_ubo()); + v_ten.numel_ubo()); +} + +void record_reference_matmul( + api::Context* context, + api::vTensor& out, + api::vTensor& mat1, + api::vTensor& mat2) { + vkapi::PipelineBarrier pipeline_barrier{}; + api::context()->submit_compute_job( + VK_KERNEL(reference_matmul), + pipeline_barrier, + {uint32_t(out.size(1)), uint32_t(out.size(0)), 1}, + {64, 1, 1}, + {}, + VK_NULL_HANDLE, + 0, + out.buffer( + pipeline_barrier, + vkapi::PipelineStage::COMPUTE, + vkapi::MemoryAccessType::WRITE), + mat1.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + mat2.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + out.sizes_ubo(), + out.strides_ubo(), + mat1.sizes_ubo(), + mat1.strides_ubo(), + mat2.sizes_ubo(), + mat2.strides_ubo()); +} + +void record_matmul_texture3d( + api::Context* context, + api::vTensor& out, + api::vTensor& mat1, + api::vTensor& mat2) { + std::string kernel_name = "matmul_naive"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, out.storage_type()); + add_dtype_suffix(kernel_name, out.dtype()); + + utils::uvec3 global_wg_size = out.logical_limits(); + + vkapi::PipelineBarrier pipeline_barrier{}; + api::context()->submit_compute_job( + VK_KERNEL_FROM_STR(kernel_name), + pipeline_barrier, + global_wg_size, + {8, 8, 1}, + 
{out.packed_dim_whcn_idx(), + mat1.packed_dim_whcn_idx(), + mat2.packed_dim_whcn_idx()}, + VK_NULL_HANDLE, + 0, + out.image( + pipeline_barrier, + vkapi::PipelineStage::COMPUTE, + vkapi::MemoryAccessType::WRITE), + mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + out.sizes_ubo(), + out.logical_limits_ubo(), + out.axis_map_ubo(), + mat1.sizes_ubo(), + mat1.axis_map_ubo(), + mat2.sizes_ubo(), + mat2.axis_map_ubo()); } // @@ -273,22 +362,22 @@ void record_scalar_add_buffer( _(uint8_t, Byte) \ _(int8_t, Char) \ _(int32_t, Int) \ - _(torch::executor::Half, Half) \ + _(exec_aten::Half, Half) \ _(float, Float) \ _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer(api::context(), vten.dtype(), data.size()); - -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted; \ - data_converted.resize(data.size()); \ - for (int i = 0; i < data.size(); ++i) { \ - data_converted[i] = ctype(data[i]); \ - } \ - copy_ptr_to_staging( \ - data_converted.data(), staging_buffer, vten.gpu_nbytes()); \ + api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); + +#define CASE(ctype, name) \ + case vkapi::ScalarType::name: { \ + std::vector data_converted; \ + data_converted.resize(data.size()); \ + for (int i = 0; i < data.size(); ++i) { \ + data_converted[i] = ctype(data[i]); \ + } \ + staging_buffer.copy_from( \ + data_converted.data(), vten.staging_buffer_nbytes()); \ } break; switch (vten.dtype()) { @@ -307,7 +396,7 @@ void fill_vtensor(api::vTensor& vten, std::vector& data) { } void fill_vtensor(api::vTensor& vten, float val, bool iota) { - std::vector vten_data(vten.gpu_numel()); + std::vector vten_data(vten.staging_buffer_numel()); if (iota) { std::iota(vten_data.begin(), vten_data.end(), val); } else { @@ -317,12 +406,44 @@ void fill_vtensor(api::vTensor& vten, float val, bool iota) { fill_vtensor(vten, vten_data); } +std::vector create_random_float_buffer( + const size_t numel, + const float min, + const float max) { + std::vector data(numel); + std::default_random_engine rng; + std::uniform_real_distribution dist(min, max); + + for (size_t i = 0; i < data.size(); ++i) { + data[i] = dist(rng); + } + return data; +} + +std::vector create_random_uint8_buffer( + const size_t numel, + const uint8_t min, + const uint8_t max) { + std::vector data(numel); + std::default_random_engine rng; + std::uniform_real_distribution dist(min, max); + + for (size_t i = 0; i < data.size(); ++i) { + data[i] = (uint8_t)dist(rng); + } + return data; +} + void fill_vtensor( ComputeGraph& graph, const IOValueRef idx, float val, bool iota) { - std::vector data(graph.get_tensor(idx.value)->gpu_numel()); + vTensorPtr t = graph.get_tensor(idx.value); + std::vector data(t->numel()); + if (t->storage_type() != utils::kBuffer) { + data.resize(t->staging_buffer_numel()); + } if (iota) { std::iota(data.begin(), data.end(), val); } else { @@ -333,8 +454,8 @@ void fill_vtensor( } void extract_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer( - api::context(), vten.dtype(), vten.gpu_numel()); + api::StagingBuffer staging_buffer( + api::context(), vten.dtype(), vten.staging_buffer_numel()); if (vten.storage_type() == utils::StorageType::BUFFER) { record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer()); @@ -346,14 +467,14 @@ void extract_vtensor(api::vTensor& vten, std::vector& data) { 
api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); fence.wait(); -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted(data.size()); \ - copy_staging_to_ptr( \ - staging_buffer, data_converted.data(), vten.gpu_nbytes()); \ - for (int i = 0; i < data.size(); ++i) { \ - data[i] = float(data_converted[i]); \ - } \ +#define CASE(ctype, name) \ + case vkapi::ScalarType::name: { \ + std::vector data_converted(data.size()); \ + staging_buffer.copy_to( \ + data_converted.data(), vten.staging_buffer_nbytes()); \ + for (int i = 0; i < data.size(); ++i) { \ + data[i] = float(data_converted[i]); \ + } \ } break; switch (vten.dtype()) { @@ -376,8 +497,10 @@ void submit_to_gpu() { } vkapi::Allocation allocate_memory_for(const api::vTensor& vten) { + VmaAllocationCreateInfo alloc_create_info = + api::context()->adapter_ptr()->vma().gpuonly_resource_create_info(); return api::context()->adapter_ptr()->vma().create_allocation( - vten.get_memory_requirements(), vten.get_allocation_create_info()); + vten.get_memory_requirements(), alloc_create_info); } VmaTotalStatistics get_vma_stats() { @@ -409,7 +532,7 @@ void execute_graph_and_check_output( IOValueRef out_ioval = graph.outputs().at(i); vTensorPtr t_out = graph.get_tensor(out_ioval.value); - std::vector output_data(t_out->gpu_numel()); + std::vector output_data(t_out->staging_buffer_numel()); graph.copy_from_staging( out_ioval.staging, output_data.data(), output_data.size()); @@ -418,3 +541,9 @@ void execute_graph_and_check_output( } } } + +bool check_close(float a, float b, float atol, float rtol) { + float max = std::max(std::abs(a), std::abs(b)); + float diff = std::abs(a - b); + return diff <= (atol + rtol * max); +} diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index 89e16131c9..d9d83a9620 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -16,11 +16,9 @@ #include #include -using namespace vkcompute; - #define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \ - api::vTensor( \ - api::context(), \ + vkcompute::api::vTensor( \ + vkcompute::api::context(), \ sizes, \ vkapi::kFloat, \ utils::StorageType::TEXTURE_3D, \ @@ -28,8 +26,8 @@ using namespace vkcompute; allocate_memory); #define CREATE_FLOAT_BUFFER(sizes, allocate_memory) \ - api::vTensor( \ - api::context(), \ + vkcompute::api::vTensor( \ + vkcompute::api::context(), \ sizes, \ vkapi::kFloat, \ utils::StorageType::BUFFER, \ @@ -37,16 +35,20 @@ using namespace vkcompute; allocate_memory); #define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.gpu_numel()); \ + vkcompute::api::StagingBuffer staging_buffer_##tensor( \ + vkcompute::api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel()); \ record_nchw_to_image_op( \ - api::context(), staging_buffer_##tensor.buffer(), tensor); + vkcompute::api::context(), staging_buffer_##tensor.buffer(), tensor); #define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.gpu_numel()); \ + vkcompute::api::StagingBuffer staging_buffer_##tensor( \ + vkcompute::api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel()); \ record_image_to_nchw_op( \ - api::context(), tensor, staging_buffer_##tensor.buffer()); + vkcompute::api::context(), tensor, staging_buffer_##tensor.buffer()); #define CHECK_VALUE(data, idx, 
expected) \ do { \ @@ -63,92 +65,125 @@ using namespace vkcompute; // void record_nchw_to_buffer_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst); + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst); void record_buffer_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::vkapi::VulkanBuffer& dst_buffer); void record_nchw_to_image_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst); + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst); void record_image_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::vkapi::VulkanBuffer& dst_buffer); + +void record_int8_image_to_nchw_noint8_op( + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::api::StagingBuffer& dst_buffer); void record_conv2d_prepack_weights_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst, + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst, const std::vector& original_sizes, const bool transposed); void record_binary_op( - api::Context* const context, + vkcompute::api::Context* const context, const std::string& op_name, - api::vTensor& v_in1, - api::vTensor& v_in2, - api::vTensor& v_dst); + vkcompute::api::vTensor& v_in1, + vkcompute::api::vTensor& v_in2, + vkcompute::api::vTensor& v_dst); void execute_and_check_add( - api::vTensor& a, - api::vTensor& b, - api::vTensor& c, + vkcompute::api::vTensor& a, + vkcompute::api::vTensor& b, + vkcompute::api::vTensor& c, float a_val, float b_val); -void record_index_fill_buffer(api::Context* const context, api::vTensor& v_ten); +void record_index_fill_buffer( + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_ten); void record_scalar_add_buffer( - api::Context* context, - api::vTensor& v_ten, + vkcompute::api::Context* context, + vkcompute::api::vTensor& v_ten, float offset); +void record_reference_matmul( + vkcompute::api::Context* context, + vkcompute::api::vTensor& out, + vkcompute::api::vTensor& mat1, + vkcompute::api::vTensor& mat2); + +void record_matmul_texture3d( + vkcompute::api::Context* context, + vkcompute::api::vTensor& out, + vkcompute::api::vTensor& mat1, + vkcompute::api::vTensor& mat2); + // // Input & Output Utilities // -inline void -fill_staging(api::StorageBuffer& staging, float val, int numel = -1) { +inline void fill_staging( + vkcompute::api::StagingBuffer& staging, + float val, + int numel = -1) { if (numel < 0) { numel = staging.numel(); } std::vector data(numel); std::fill(data.begin(), data.end(), val); - copy_ptr_to_staging(data.data(), staging, sizeof(float) * numel); + staging.copy_from(data.data(), sizeof(float) * numel); } -void fill_vtensor(api::vTensor& vten, std::vector& data); +void fill_vtensor(vkcompute::api::vTensor& vten, std::vector& data); + +void fill_vtensor(vkcompute::api::vTensor& vten, float val, bool iota = false); -void fill_vtensor(api::vTensor& vten, float val, bool iota = false); +std::vector create_random_float_buffer( + const size_t numel, + const float min = 0, + const float max 
= 1); + +std::vector create_random_uint8_buffer( + const size_t numel, + const uint8_t min = 0, + const uint8_t max = 255); void fill_vtensor( - ComputeGraph& graph, - const IOValueRef idx, + vkcompute::ComputeGraph& graph, + const vkcompute::IOValueRef idx, float val, bool iota = false); -void extract_vtensor(api::vTensor& vten, std::vector& data); +void extract_vtensor(vkcompute::api::vTensor& vten, std::vector& data); -inline std::vector extract_vtensor(api::vTensor& vten) { - std::vector data_out(vten.gpu_numel()); +inline std::vector extract_vtensor(vkcompute::api::vTensor& vten) { + std::vector data_out(vten.staging_buffer_numel()); extract_vtensor(vten, data_out); return data_out; } -inline void -check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { +inline void check_staging_buffer( + vkcompute::api::StagingBuffer& staging, + float val, + int numel = -1) { if (numel < 0) { numel = staging.numel(); } std::vector data(numel); - copy_staging_to_ptr(staging, data.data(), sizeof(float) * numel); + staging.copy_to(data.data(), sizeof(float) * numel); for (size_t i = 0; i < data.size(); ++i) { CHECK_VALUE(data, i, val); @@ -156,21 +191,21 @@ check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { } inline int64_t get_buf_idx( - ComputeGraph& graph, - IOValueRef ref, + vkcompute::ComputeGraph& graph, + vkcompute::IOValueRef ref, const std::vector& tensor_coor) { - vTensorPtr vten_ptr = graph.get_tensor(ref.value); + vkcompute::vTensorPtr vten_ptr = graph.get_tensor(ref.value); const std::vector& sizes = vten_ptr->sizes(); - int64_t c = dim_at(sizes); - int64_t h = dim_at(sizes); - int64_t w = dim_at(sizes); + int64_t c = vkcompute::dim_at(sizes); + int64_t h = vkcompute::dim_at(sizes); + int64_t w = vkcompute::dim_at(sizes); - int64_t ni = dim_at(tensor_coor); - int64_t ci = dim_at(tensor_coor); - int64_t hi = dim_at(tensor_coor); - int64_t wi = dim_at(tensor_coor); + int64_t ni = vkcompute::dim_at(tensor_coor); + int64_t ci = vkcompute::dim_at(tensor_coor); + int64_t hi = vkcompute::dim_at(tensor_coor); + int64_t wi = vkcompute::dim_at(tensor_coor); return (ni * c * h * w + ci * h * w + hi * w + wi); } @@ -181,7 +216,8 @@ inline int64_t get_buf_idx( void submit_to_gpu(); -vkapi::Allocation allocate_memory_for(const api::vTensor& vten); +vkcompute::vkapi::Allocation allocate_memory_for( + const vkcompute::api::vTensor& vten); VmaTotalStatistics get_vma_stats(); @@ -192,7 +228,7 @@ size_t get_vma_allocation_count(); // void execute_graph_and_check_output( - ComputeGraph& graph, + vkcompute::ComputeGraph& graph, std::vector input_vals, std::vector expected_outputs); @@ -226,3 +262,9 @@ void print_vector( } std::cout << std::endl; } + +// +// Misc. 
Utilities +// + +bool check_close(float a, float b, float atol = 1e-4, float rtol = 1e-5); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 9260475ab6..32177c9c3d 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -11,7 +11,7 @@ #include #include -#include +#include #include @@ -21,10 +21,42 @@ #include +#include + #include +using namespace vkcompute; using namespace vkcompute::api; +std::vector +transpose_matrix(std::vector& mat, const int H, const int W) { + std::vector out(W * H); + for (int out_y = 0; out_y < H; ++out_y) { + for (int out_x = 0; out_x < W; ++out_x) { + out[out_x * H + out_y] = mat[out_y * W + out_x]; + } + } + return out; +} + +std::vector compute_reference_matmul( + std::vector& mat1, + std::vector& mat2, + const int M, + const int K, + const int N) { + std::vector out(M * N); + for (int out_y = 0; out_y < M; ++out_y) { + for (int out_x = 0; out_x < N; ++out_x) { + out[out_y * N + out_x] = 0; + for (int k = 0; k < K; ++k) { + out[out_y * N + out_x] += mat1[out_y * K + k] * mat2[k * N + out_x]; + } + } + } + return out; +} + std::vector> standard_sizes_to_test = { // 2D {7, 11}, @@ -69,22 +101,27 @@ TEST_F(VulkanComputeAPITest, print_adapter) { std::vector get_reference_strides( const std::vector& sizes, const utils::GPUMemoryLayout layout, - const bool texel_strides) { + const bool unsqueezed = false) { int64_t C = utils::val_at(-3, sizes); int64_t H = utils::val_at(-2, sizes); int64_t W = utils::val_at(-1, sizes); + int64_t numel = utils::multiply_integers(sizes); + switch (layout) { case utils::kWidthPacked: - if (texel_strides) { - W = utils::div_up(W, INT64_C(4)); - } switch (sizes.size()) { case 1: + if (unsqueezed) + return {numel, numel, numel, 1}; return {1}; case 2: + if (unsqueezed) + return {numel, numel, W, 1}; return {W, 1}; case 3: + if (unsqueezed) + return {numel, H * W, W, 1}; return {H * W, W, 1}; case 4: return {C * H * W, H * W, W, 1}; @@ -93,15 +130,18 @@ std::vector get_reference_strides( } break; case utils::kHeightPacked: - if (texel_strides) { - H = utils::div_up(H, INT64_C(4)); - } switch (sizes.size()) { case 1: + if (unsqueezed) + return {numel, numel, numel, 1}; return {1}; case 2: + if (unsqueezed) + return {numel, numel, 1, H}; return {1, H}; case 3: + if (unsqueezed) + return {numel, H * W, 1, H}; return {W * H, 1, H}; case 4: return {C * W * H, W * H, 1, H}; @@ -109,15 +149,18 @@ std::vector get_reference_strides( return {}; } case utils::kChannelsPacked: - if (texel_strides) { - C = utils::div_up(C, INT64_C(4)); - } switch (sizes.size()) { case 1: + if (unsqueezed) + return {numel, numel, numel, 1}; return {1}; case 2: + if (unsqueezed) + return {numel, numel, W, 1}; return {W, 1}; case 3: + if (unsqueezed) + return {numel, 1, W * C, C}; return {1, W * C, C}; case 4: return {H * W * C, 1, W * C, C}; @@ -128,33 +171,202 @@ std::vector get_reference_strides( return {}; } +TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { + vkapi::ShaderInfo empty_shader_info; + EXPECT_FALSE(empty_shader_info); + EXPECT_TRUE(empty_shader_info.src_code.bin == nullptr); + EXPECT_TRUE(empty_shader_info.src_code.size == 0u); +} + +TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { + // ndim, GPUMemoryLayout, expected dim order pairs + std::vector>> + test_cases = { + {1, utils::kWidthPacked, {0}}, + {1, utils::kHeightPacked, {0}}, + {1, utils::kChannelsPacked, {0}}, + {2, utils::kWidthPacked, {0, 1}}, + {2, 
utils::kHeightPacked, {1, 0}}, + {2, utils::kChannelsPacked, {0, 1}}, + {3, utils::kWidthPacked, {0, 1, 2}}, + {3, utils::kHeightPacked, {0, 2, 1}}, + {3, utils::kChannelsPacked, {1, 2, 0}}, + {4, utils::kWidthPacked, {0, 1, 2, 3}}, + {4, utils::kHeightPacked, {0, 1, 3, 2}}, + {4, utils::kChannelsPacked, {0, 2, 3, 1}}, + }; + + for (const auto& test_case : test_cases) { + const size_t& ndim = std::get<0>(test_case); + const utils::GPUMemoryLayout& layout = std::get<1>(test_case); + const auto& expected_dim_order = std::get<2>(test_case); + std::vector dim_order = calculate_dim_order(ndim, layout); + + ASSERT_TRUE(dim_order == expected_dim_order); + } +} + TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { + vTensor v_tensor_to_resize( + context(), + {25, 25, 25, 25}, + vkapi::kFloat, + utils::kBuffer, + utils::kWidthPacked, + /*allocate_memory = */ false); + for (const auto& sizes : standard_sizes_to_test) { if (sizes.size() < 3) { continue; } for (const auto& layout : {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { - // texel_strides = true { - std::vector strides = calculate_strides(sizes, layout); - std::vector ref_strides = + std::vector dim_order = + calculate_dim_order(sizes.size(), layout); + std::vector strides = calculate_strides(sizes, dim_order); + std::vector ref_strides = get_reference_strides(sizes, layout); + ASSERT_TRUE(strides == ref_strides); + + int64_t numel = utils::multiply_integers(sizes); + std::vector unsqueezed_strides = + unsqueeze_strides(strides, numel); + std::vector ref_unsqueezed_strides = get_reference_strides(sizes, layout, true); - ASSERT_TRUE(strides == ref_strides); + ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); + + // Create new vTensor and check that the strides are correct + vTensor new_v_tensor( + context(), + sizes, + vkapi::kFloat, + utils::kBuffer, + layout, + /*allocate_memory = */ false); + + ASSERT_TRUE(new_v_tensor.strides() == ref_strides); + ASSERT_TRUE( + new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides); + + // Resize vtensor and check that updated metadata is correct + v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); + ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); + ASSERT_TRUE( + v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides); } + } + } +} - // texel_strides = false - { - std::vector strides = calculate_strides(sizes, layout, false); - std::vector ref_strides = - get_reference_strides(sizes, layout, false); - ASSERT_TRUE(strides == ref_strides); - } +TEST_F(VulkanComputeAPITest, virtual_transpose_test) { + std::vector sizes = {7, 9, 11, 13}; + // (dim0, dim1), new_sizes, new_dim_order, new_axis_map, new_packed_dim_idx + std::vector>> test_cases = { + {{2, 3}, {7, 9, 13, 11}, {0, 1, 3, 2}, {1, 0, 2, 2}, {1}}, + {{2, 1}, {7, 11, 9, 13}, {0, 2, 1, 3}, {0, 2, 1, 2}, {0}}, + {{1, 3}, {7, 13, 11, 9}, {0, 3, 2, 1}, {2, 1, 0, 2}, {2}}, + }; + + for (const auto& test_case : test_cases) { + const int dim0 = test_case.at(0).at(0); + const int dim1 = test_case.at(0).at(1); + + const auto& expected_sizes = test_case.at(1); + const auto& expected_dim_order = test_case.at(2); + const auto& expected_axis_map = test_case.at(3); + const int expected_packed_dim = test_case.at(4).at(0); + + { + vTensor a_buffer = vTensor( + context(), sizes, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked); + + a_buffer.virtual_transpose(dim0, dim1); + EXPECT_TRUE(a_buffer.sizes() == expected_sizes); + EXPECT_TRUE(a_buffer.dim_order() == expected_dim_order); + } + + { + 
vTensor a_texture = vTensor( + context(), + sizes, + vkapi::kFloat, + utils::kTexture3D, + utils::kWidthPacked); + a_texture.virtual_transpose(dim0, dim1); + EXPECT_TRUE(a_texture.sizes() == expected_sizes); + EXPECT_TRUE(a_texture.axis_map() == expected_axis_map); + EXPECT_TRUE(a_texture.packed_dim_whcn_idx() == expected_packed_dim); } } } +utils::ivec3 make_temp_ivec3(int x, int y, int z) { + return utils::ivec3{x, y, z}; +} + +TEST_F(VulkanComputeAPITest, vec_test) { + { + utils::vec3 v3({1, 2, 3}); + ASSERT_TRUE(v3[0] == 1); + ASSERT_TRUE(v3[1] == 2); + ASSERT_TRUE(v3[2] == 3); + v3 = {4, 5, 6}; + ASSERT_TRUE(v3[0] == 4); + ASSERT_TRUE(v3[1] == 5); + ASSERT_TRUE(v3[2] == 6); + } + + { + utils::uvec4 uv4({4, 3, 2, 1}); + ASSERT_TRUE(uv4[0] == 4); + ASSERT_TRUE(uv4[1] == 3); + ASSERT_TRUE(uv4[2] == 2); + ASSERT_TRUE(uv4[3] == 1); + uv4 = {11, 13, 12, 88}; + ASSERT_TRUE(uv4[0] == 11); + ASSERT_TRUE(uv4[1] == 13); + ASSERT_TRUE(uv4[2] == 12); + ASSERT_TRUE(uv4[3] == 88); + } + + // Test copy from same type + { + utils::ivec3 v{5, 6, 8}; + utils::ivec3 v2 = v; + + ASSERT_TRUE(v2[0] == 5); + ASSERT_TRUE(v2[1] == 6); + ASSERT_TRUE(v2[2] == 8); + } + + // Test copy from different type + { + utils::uvec3 v{5, 6, 8}; + utils::ivec3 v2 = v; + + ASSERT_TRUE(v2[0] == 5); + ASSERT_TRUE(v2[1] == 6); + ASSERT_TRUE(v2[2] == 8); + } + + // Test construction from temporary vec + { + utils::uvec3 v{make_temp_ivec3(4, 5, 10)}; + ASSERT_TRUE(v[0] == 4); + ASSERT_TRUE(v[1] == 5); + ASSERT_TRUE(v[2] == 10); + } + + // Test initalization from temporary vec + { + utils::uvec3 v = make_temp_ivec3(4, 5, 10); + ASSERT_TRUE(v[0] == 4); + ASSERT_TRUE(v[1] == 5); + ASSERT_TRUE(v[2] == 10); + } +} + TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { // Try to get shader from custom shader library const vkapi::ShaderInfo& kernel = VK_KERNEL(test_shader); @@ -211,7 +423,7 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StorageBuffer buffer(context(), vkapi::kFloat, len); + StagingBuffer buffer(context(), vkapi::kFloat, len); float scale = 3.0f; float offset = 1.5f; @@ -235,7 +447,7 @@ TEST_F(VulkanComputeAPITest, spec_var_shader_test) { submit_to_gpu(); std::vector data(len); - copy_staging_to_ptr(buffer, data.data(), buffer.nbytes()); + buffer.copy_to(data.data(), buffer.nbytes()); for (size_t i = 0; i < len; ++i) { CHECK_VALUE(data, i, scale * i + offset); @@ -282,7 +494,8 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { params.buffer()); } - StorageBuffer staging_buffer(context(), vkapi::kFloat, a.gpu_numel()); + StagingBuffer staging_buffer( + context(), vkapi::kFloat, a.staging_buffer_numel()); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); submit_to_gpu(); @@ -302,7 +515,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { template void test_storage_buffer_type(const size_t len) { - StorageBuffer buffer(context(), dtype, len); + StagingBuffer buffer(context(), dtype, len); std::string kernel_name("idx_fill_buffer"); switch (dtype) { @@ -344,7 +557,7 @@ void test_storage_buffer_type(const size_t len) { submit_to_gpu(); std::vector data(len); - copy_staging_to_ptr(buffer, data.data(), buffer.nbytes()); + buffer.copy_to(data.data(), buffer.nbytes()); for (size_t i = 0; i < len; ++i) { CHECK_VALUE(data, i, T(i)); @@ -359,7 +572,7 @@ TEST_F(VulkanComputeAPITest, test_buffer_float16) { if (!context()->adapter_ptr()->has_full_float16_buffers_support()) { GTEST_SKIP(); } - 
test_storage_buffer_type(16); + test_storage_buffer_type(16); } TEST_F(VulkanComputeAPITest, test_buffer_int8) { @@ -441,7 +654,7 @@ TEST_F(VulkanComputeAPITest, buffer_tensor_sanity_check) { run_buffer_tensor_sanity_check(a); break; case vkapi::kHalf: - run_buffer_tensor_sanity_check(a); + run_buffer_tensor_sanity_check(a); break; case vkapi::kChar: run_buffer_tensor_sanity_check(a); @@ -478,6 +691,146 @@ TEST_F(VulkanComputeAPITest, texture_add_sanity_check) { } } +TEST_F(VulkanComputeAPITest, tensor_alias_test) { + for (utils::StorageType storage_type : {utils::kTexture3D, utils::kBuffer}) { + std::vector sizes = {9, 9}; + + const size_t alloc_count_before = get_vma_allocation_count(); + + vTensor original = vTensor(context(), sizes, vkapi::kFloat, storage_type); + + vTensor copy = vTensor(original); + + // Two tensors but only one additional allocation. + EXPECT_TRUE(get_vma_allocation_count() == alloc_count_before + 1); + EXPECT_TRUE(copy.is_view_of(original)); + + // Fill original tensor with some data + fill_vtensor(original, 2.5f, true); + + std::vector data_out(copy.staging_buffer_numel()); + // Extract the copy tensor; should contain the data of the original tensor + extract_vtensor(copy, data_out); + + for (size_t i = 0; i < original.numel(); ++i) { + CHECK_VALUE(data_out, i, 2.5f + i); + } + } +} + +TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { + constexpr int M = 11; + constexpr int K = 23; + constexpr int N = 17; + std::vector mat1_sizes = {M, K}; + std::vector mat2_sizes = {N, K}; + std::vector out_sizes = {M, N}; + + for (const auto storage_type : {utils::kTexture3D, utils::kBuffer}) { + vTensor mat1 = vTensor( + context(), + mat1_sizes, + vkapi::kFloat, + storage_type, + utils::kWidthPacked); + vTensor mat2 = vTensor( + context(), + mat2_sizes, + vkapi::kFloat, + storage_type, + utils::kWidthPacked); + vTensor out = vTensor( + context(), out_sizes, vkapi::kFloat, storage_type, utils::kWidthPacked); + + // Generate data + std::vector mat1_data = + create_random_float_buffer(mat1.staging_buffer_numel()); + std::vector mat2_data = + create_random_float_buffer(mat2.staging_buffer_numel()); + + // Create direct view and modify sizes and strides later + vTensor mat2_t = vTensor(mat2); + // Update sizes and strides of mat2_t to be that of a transposed tensor + mat2_t.virtual_transpose(0, 1); + + EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked); + + std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); + std::vector ref_out = + compute_reference_matmul(mat1_data, mat2_t_data, M, K, N); + + // Fill original tensor with some data + fill_vtensor(mat1, mat1_data); + fill_vtensor(mat2, mat2_data); + + if (storage_type == utils::kTexture3D) { + record_matmul_texture3d(context(), out, mat1, mat2_t); + } else { + record_reference_matmul(context(), out, mat1, mat2_t); + } + + std::vector data_out(out.staging_buffer_numel()); + // Extract the copy tensor; should contain the data of the original tensor + extract_vtensor(out, data_out); + + for (size_t i = 0; i < ref_out.size(); ++i) { + EXPECT_TRUE(check_close(data_out[i], ref_out[i])); + } + } +} + +TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) { + constexpr int L = 31; + + // S{N} refers to slice {N} + constexpr int L_S1 = 17; + constexpr int O_S1 = 5; + + constexpr int L_S2 = 7; + constexpr int O_S2 = 3; + + std::vector dim_order = {0}; + + std::vector t_sizes = {L}; + std::vector s1_sizes = {L_S1}; + std::vector s2_sizes = {L_S2}; + + vTensor orig = CREATE_FLOAT_BUFFER(t_sizes, 
/*allocate_memory=*/true); + + fill_vtensor(orig, 0); + + vTensor s1 = vTensor(orig, s1_sizes, dim_order, O_S1); + vTensor s2 = vTensor(s1, s2_sizes, dim_order, O_S2); + + record_scalar_add_buffer(api::context(), s1, 4.5f); + record_scalar_add_buffer(api::context(), s2, 7.5f); + + std::vector orig_data(orig.staging_buffer_numel()); + extract_vtensor(orig, orig_data); + + int id = 0; + while (id < O_S1) { + EXPECT_TRUE(orig_data[id] == 0); + ++id; + } + while (id < O_S1 + O_S2) { + EXPECT_TRUE(orig_data[id] == 4.5); + ++id; + } + while (id < O_S1 + O_S2 + L_S2) { + EXPECT_TRUE(orig_data[id] == 12); + ++id; + } + while (id < O_S1 + L_S1) { + EXPECT_TRUE(orig_data[id] == 4.5); + ++id; + } + while (id < L) { + EXPECT_TRUE(orig_data[id] == 0); + ++id; + } +} + TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) { // This test is the same as texture_add_sanity_check, except that the tensor // memory is allocated in a deferred fashion @@ -490,9 +843,9 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) { // No allocations made so far EXPECT_TRUE(get_vma_allocation_count() == 0); - std::vector data_a(a.gpu_numel()); + std::vector data_a(a.staging_buffer_numel()); std::fill(data_a.begin(), data_a.end(), 2.5f); - std::vector data_b(b.gpu_numel()); + std::vector data_b(b.staging_buffer_numel()); std::fill(data_b.begin(), data_b.end(), 1.5f); // Allocate memory at the last possible opportunity @@ -511,7 +864,7 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) { record_binary_op(context(), "add", a, b, c); - std::vector data_c(c.gpu_numel()); + std::vector data_c(c.staging_buffer_numel()); extract_vtensor(c, data_c); for (size_t i = 0; i < data_c.size(); ++i) { @@ -551,11 +904,11 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) { EXPECT_TRUE(get_vma_allocation_count() == 3); // Specify input data - std::vector data_a(a.gpu_numel()); + std::vector data_a(a.staging_buffer_numel()); std::fill(data_a.begin(), data_a.end(), 2.5f); - std::vector data_b(b.gpu_numel()); + std::vector data_b(b.staging_buffer_numel()); std::fill(data_b.begin(), data_b.end(), 1.5f); - std::vector data_d(b.gpu_numel()); + std::vector data_d(b.staging_buffer_numel()); std::fill(data_d.begin(), data_d.end(), 1.0f); // First, fill a and b with data @@ -572,7 +925,7 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) { record_binary_op(context(), "add", c, d, e); // Extract data from e - std::vector data_e(e.gpu_numel()); + std::vector data_e(e.staging_buffer_numel()); extract_vtensor(e, data_e); // Sanity check that the values are correct @@ -625,71 +978,13 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { // No allocations yet EXPECT_TRUE(get_vma_allocation_count() == 0); - std::vector data_a(a.gpu_numel()); + std::vector data_a(a.staging_buffer_numel()); std::fill(data_a.begin(), data_a.end(), 2.5f); // Encoding a command buffer with a vTensor without memory should throw EXPECT_THROW(fill_vtensor(a, data_a), vkapi::Error); } -TEST_F(VulkanComputeAPITest, tensor_reallocation_test) { - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - execute_and_check_add(a, b, c, 3.0f, 5.0f); - - // Redo with new sizes - std::vector new_sizes = {4, 6, 3}; - a.reallocate(new_sizes); - b.reallocate(new_sizes); - c.reallocate(new_sizes); - - // Flush everything - 
context()->flush(); - - execute_and_check_add(a, b, c, 12.0f, 10.0f); -} - -TEST_F( - VulkanComputeAPITest, - tensor_reallocation_with_deferred_allocation_test) { - std::vector sizes = {8, 8, 8}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - - vkapi::Allocation a_mem = allocate_memory_for(a); - a.image().bind_allocation(a_mem); - vkapi::Allocation b_mem = allocate_memory_for(b); - b.image().bind_allocation(b_mem); - vkapi::Allocation c_mem = allocate_memory_for(c); - c.image().bind_allocation(c_mem); - - execute_and_check_add(a, b, c, 4.0f, 8.0f); - - std::vector> new_sizes_list = { - {4, 3, 5}, {4, 1, 7}, {8, 3, 2}, {8, 7, 2}}; - - for (auto& new_sizes : new_sizes_list) { - // Redo with new sizes - a.reallocate(new_sizes); - b.reallocate(new_sizes); - c.reallocate(new_sizes); - - // Flush everything - context()->flush(); - - a.image().bind_allocation(a_mem); - b.image().bind_allocation(b_mem); - c.image().bind_allocation(c_mem); - - execute_and_check_add( - a, b, c, float(new_sizes[1] + 4.5f), float(new_sizes[2] + 13.0f)); - } -} - TEST_F(VulkanComputeAPITest, texture_virtual_resize) { context()->set_cmd(/*reusable = */ true); std::vector sizes = {8, 12, 12}; @@ -718,14 +1013,18 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { b.virtual_resize(new_sizes); c.virtual_resize(new_sizes); - fill_staging(staging_buffer_a, float(new_sizes[1] + 1.5f), a.gpu_numel()); - fill_staging(staging_buffer_b, float(new_sizes[2] + 55.0f), b.gpu_numel()); + fill_staging( + staging_buffer_a, float(new_sizes[1] + 1.5f), a.staging_buffer_numel()); + fill_staging( + staging_buffer_b, + float(new_sizes[2] + 55.0f), + b.staging_buffer_numel()); submit_to_gpu(); check_staging_buffer( staging_buffer_c, float(new_sizes[1] + new_sizes[2] + 56.5f), - c.gpu_numel()); + c.staging_buffer_numel()); } } @@ -733,10 +1032,39 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { // Compute Graph Tests // -#define EXTRACT_TENSOR(name) \ - std::vector data_##name(graph.get_tensor(name.value)->gpu_numel()); \ +#define EXTRACT_TENSOR(name) \ + std::vector data_##name( \ + graph.get_tensor(name.value)->staging_buffer_numel()); \ graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); +// The purpose of this test is simply to track the size of various classes over +// time, in the interest of making sure that they doesn't grow too large. +TEST_F(VulkanComputeAPITest, print_object_sizes) { +#define PRINT_SIZE(name) \ + std::cout << #name << " size: " << sizeof(name) << " B" << std::endl + PRINT_SIZE(vTensor); + PRINT_SIZE(Value); + PRINT_SIZE(StagingBuffer); + PRINT_SIZE(ComputeGraph); + PRINT_SIZE(ExecuteNode); +#undef PRINT_SIZE + + // The actual sizes of each object is dependent on the platform. However, we + // can alert ourselves to any significant changes in the sizes of these + // objects by checking the `sizeof()` the class against some loose thresholds. 
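Because the exact sizeof values depend on the platform and toolchain, the guard below uses loose runtime EXPECTs plus printouts rather than hard limits. If a compile-time check were ever wanted instead, the same idea could in principle be expressed as a static_assert; the threshold here is only an example and is not part of the patch:

// Hypothetical compile-time variant of the size guard used below.
static_assert(sizeof(vTensor) < 1200, "vTensor grew unexpectedly large");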
+ + // Current known size on 64 bit system: 1040 B + EXPECT_TRUE(sizeof(vTensor) < 1200); + // Current known size on 64 bit system: 1056 B + EXPECT_TRUE(sizeof(Value) < 1200); + // Current known size on 64 bit system: 120 B + EXPECT_TRUE(sizeof(StagingBuffer) < 500); + // Current known size on 64 bit system: 384 B + EXPECT_TRUE(sizeof(ComputeGraph) < 500); + // Current known size on 64 bit system: 248 B + EXPECT_TRUE(sizeof(ExecuteNode) < 500); +} + TEST(VulkanComputeGraphTest, test_values_scalars) { GraphConfig config; ComputeGraph graph(config); @@ -791,6 +1119,19 @@ TEST(VulkanComputeGraphTest, test_values_string) { EXPECT_TRUE(stored == "hello, world"); } +TEST(VulkanComputeGraphTest, empty_init_executenode_test) { + ExecuteNode node(nullptr, {}); + EXPECT_FALSE(node); + + GraphConfig config; + ComputeGraph graph(config); + + // Encode an empty ExecuteNode and check that command buffer encoding does not + // crash. + graph.execute_nodes().emplace_back(new ExecuteNode(nullptr, {})); + EXPECT_NO_FATAL_FAILURE(graph.encode_execute()); +} + TEST(VulkanComputeGraphTest, test_zero_dim_tensor) { GraphConfig config; ComputeGraph graph(config); @@ -877,12 +1218,68 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_buffer) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_view) { + constexpr int W = 7; + constexpr int H = 7; + // slice height + constexpr int S_H = 2; + // slice offset + constexpr int S_O = 3; + + GraphConfig config; + config.set_storage_type_override(utils::kBuffer); + ComputeGraph graph(config); + + std::vector dim_order = {0, 1}; + + std::vector orig_sizes = {H, W}; + std::vector slice_sizes = {S_H, W}; + const int offset = S_O * W; + + // Build graph + + IOValueRef orig = graph.add_input_tensor(orig_sizes, vkapi::kFloat); + ValueRef slice = + graph.add_tensor_view(orig.value, slice_sizes, dim_order, offset); + + EXPECT_TRUE(graph.val_is_view_of(slice, orig.value)); + + IOValueRef out = {}; + + out.value = graph.add_tensor(slice_sizes, vkapi::kFloat); + + auto opFn = VK_GET_OP_FN("aten.abs.default"); + opFn(graph, {slice, out.value, kDummyValueRef, kDummyValueRef}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + float start_val = -130 + i; + + fill_vtensor(graph, orig, start_val, true); + + graph.execute(); + + EXTRACT_TENSOR(out); + + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + const float expected_val = std::abs(start_val) - float(offset) - i; + CHECK_VALUE(data_out, i, expected_val); + } + } +} + TEST(VulkanComputeGraphTest, test_simple_graph) { GraphConfig config; ComputeGraph graph(config); - std::vector size_big = {8, 64, 124}; - std::vector size_small = {8, 1, 124}; + std::vector size_big = {1, 8, 8}; + std::vector size_small = {1, 1, 8}; // Build graph @@ -922,6 +1319,64 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) { + GraphConfig config; + config.set_storage_type_override(utils::kTexture3D); + ComputeGraph graph(config); + + std::vector sizes = {8, 64, 124}; + + // Build graph + + ValueRef scalar = graph.add_symint(1); + IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat); + + IOValueRef out = {}; + out.value = a.value; + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR("scalar_add_texture"), + graph.create_global_wg_size(a.value), + graph.create_local_wg_size(a.value), + // Inputs and Outputs + 
{{out.value, vkapi::MemoryAccessType::WRITE}}, + // Shader params buffers + {graph.logical_limits_ubo(a.value), + graph.get_or_create_int_param_buffer(scalar)}, + // Specialization Constants + {}, + // Resizing Logic + nullptr, + {})); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + int scalar_val = i - 3.0f; + graph.set_symint(scalar, scalar_val); + + float val_a = i + 2.0f; + float val_out = val_a + scalar_val; + + fill_vtensor(graph, a, val_a); + + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + CHECK_VALUE(data_out, i, val_out); + } + } +} + #define CREATE_WEIGHT_TENSOR(name, sizes, dtype, val) \ std::vector data_##name(utils::multiply_integers(sizes)); \ std::fill(data_##name.begin(), data_##name.end(), val); \ @@ -989,6 +1444,7 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { GraphConfig config; ComputeGraph graph(config); + size_t expected_vma_allocation_count = 0; std::vector size_big = {12, 64, 64}; std::vector size_small = {12, 64, 64}; @@ -1005,8 +1461,10 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 4); // +2: t.sizes_ubo() for each staging shader + // +2: t.axis_map_ubo() for each staging shader // +2: staging buffer for each input tensor - EXPECT_TRUE(get_vma_allocation_count() == 4); + expected_vma_allocation_count += 6; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef c = graph.add_tensor( size_big, @@ -1016,15 +1474,22 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto addFn = VK_GET_OP_FN("aten.add.Tensor"); addFn(graph, {a.value, b.value, kDummyValueRef, c}); + // +2: alpha UBO, broadcast UBO for arithmetic shader + // +1: t.sizes_ubo() for arithmetic shader output c + // +1: t.axis_map_ubo() for arithmetic shader output c + expected_vma_allocation_count += 4; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); + IOValueRef d = graph.add_input_tensor( size_small, vkapi::kFloat, /*shared_object_idx = */ 2); - // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() uniform buffer for staging shader + // +1: t.axis_map_ubo() uniform buffer for staging shader // +1: staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 9); + expected_vma_allocation_count += 3; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef e = graph.add_tensor( size_big, @@ -1034,20 +1499,26 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); mulFn(graph, {c, d.value, e}); + // +2: alpha UBO, broadcast UBO for arithmetic shader + // +1: t.sizes_ubo() for arithmetic shader output e + // +1: t.axis_map_ubo() for arithmetic shader output e + expected_vma_allocation_count += 4; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); + IOValueRef out = {}; out.value = e; out.staging = graph.set_output_tensor(out.value); - // +2: alpha UBO, broadcast UBO for arithmetic shader - // +1: t.sizes_ubo() for staging shader - // +1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 13); + // +1: staging buffer for the output tensor + expected_vma_allocation_count += 1; + 
EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); graph.prepare(); graph.encode_execute(); // +3: shared memory allocations for tensors - EXPECT_TRUE(get_vma_allocation_count() == 16); + expected_vma_allocation_count += 3; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); // Run graph @@ -1114,6 +1585,105 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_tmp_tensors) { + GraphConfig config; + ComputeGraph graph(config); + + std::vector size_big = {8, 64, 124}; + std::vector size_small = {8, 1, 124}; + + // Build graph + + IOValueRef a = graph.add_input_tensor( + size_big, vkapi::kFloat, /*shared_object_idx = */ 0); + IOValueRef b = graph.add_input_tensor( + size_small, vkapi::kFloat, /*shared_object_idx = */ 1); + + IOValueRef out = {}; + + out.value = + graph.add_tensor(size_big, vkapi::kFloat, /*shared_object_idx = */ 2); + + // Perform the following compute + // + // a, b, out; + // { + // inter; + // { + // tmp = a + b + // tmp2 = tmp + a + // inter = tmp2 + b + // } + // { + // tmp = inter + b; + // tmp2 = tmp + a + // out = tmp2 + b; + // } + // } + { + TmpTensor inter(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(inter.sobj_idx == 3); + { + TmpTensor tmp(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp.sobj_idx == 4); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {a, b, kDummyValueRef, tmp}); + + TmpTensor tmp2(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp2.sobj_idx == 5); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp, a, kDummyValueRef, tmp2}); + + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp2, b, kDummyValueRef, inter}); + } + { + TmpTensor tmp(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp.sobj_idx == 4); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {inter, b, kDummyValueRef, tmp}); + + TmpTensor tmp2(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp2.sobj_idx == 5); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp, a, kDummyValueRef, tmp2}); + + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp2, b, kDummyValueRef, out}); + } + } + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + float val_a = i + 2.0f; + float val_b = i + 1.5f; + float val_tmp = val_a + val_b; + float val_tmp2 = val_tmp + val_a; + float val_inter = val_tmp2 + val_b; + float val_tmp_2 = val_inter + val_b; + float val_tmp2_2 = val_tmp_2 + val_a; + float val_out = val_tmp2_2 + val_b; + + fill_vtensor(graph, a, val_a); + fill_vtensor(graph, b, val_b); + + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + CHECK_VALUE(data_out, i, val_out); + } + } +} + TEST(VulkanComputeGraphTest, test_large_graph) { auto build_start_time = std::chrono::system_clock::now(); GraphConfig config; @@ -1692,25 +2262,21 @@ void run_from_gpu_test( if (dtype == vkapi::kHalf && !context()->adapter_ptr()->has_16bit_storage()) { return; } - if ((dtype == vkapi::kChar || dtype == vkapi::kQInt8) && - !context()->adapter_ptr()->has_full_int8_buffers_support()) { - return; - } vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); std::string kernel_name("idx_fill_texture"); - add_memory_layout_suffix(kernel_name, vten); add_dtype_suffix(kernel_name, vten); + int32_t offset = -50; + { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList 
specialization_constants = {vten.packed_dim_whcn_idx()}; context()->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - vten.image_extents(), + vten.logical_limits(), {4, 4, 4}, - specialization_constants, + {vten.packed_dim_whcn_idx(), offset}, VK_NULL_HANDLE, 0, vten.image( @@ -1720,22 +2286,27 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StorageBuffer staging_buffer(context(), dtype, vten.gpu_numel()); + StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); - record_image_to_nchw_op(context(), vten, staging_buffer.buffer()); + if (dtype == vkapi::kChar && + !context()->adapter_ptr()->has_full_int8_buffers_support()) { + record_int8_image_to_nchw_noint8_op(context(), vten, staging_buffer); + } else { + record_image_to_nchw_op(context(), vten, staging_buffer.buffer()); + } submit_to_gpu(); std::vector data_out(staging_buffer.numel()); - copy_staging_to_ptr(staging_buffer, data_out.data(), staging_buffer.nbytes()); + staging_buffer.copy_to(data_out.data(), staging_buffer.nbytes()); for (int i = 0; i < vten.numel(); i++) { - CHECK_VALUE(data_out, i, i); + CHECK_VALUE(data_out, i, i + offset); } } template -void run_to_gpu_test( +void round_trip_test( std::vector& sizes, utils::GPUMemoryLayout memory_layout = utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, @@ -1744,44 +2315,87 @@ void run_to_gpu_test( if (dtype == vkapi::kHalf && !context()->adapter_ptr()->has_16bit_storage()) { return; } - if ((dtype == vkapi::kChar || dtype == vkapi::kQInt8) && - !context()->adapter_ptr()->has_full_int8_buffers_support()) { - return; - } vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); // Create and fill input staging buffer - StorageBuffer staging_buffer_in(context(), dtype, vten.gpu_numel()); + StagingBuffer staging_buffer_in( + context(), dtype, vten.staging_buffer_numel()); std::vector data_in(staging_buffer_in.numel()); for (int i = 0; i < staging_buffer_in.numel(); i++) { - data_in[i] = i; + data_in[i] = T(i * -1); } - copy_ptr_to_staging(data_in.data(), staging_buffer_in, vten.gpu_nbytes()); + staging_buffer_in.copy_from(data_in.data(), vten.staging_buffer_nbytes()); // Output staging buffer - StorageBuffer staging_buffer_out(context(), dtype, vten.gpu_numel()); + StagingBuffer staging_buffer_out( + context(), dtype, vten.staging_buffer_numel()); - // Copy data in and out of the tensor record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); - record_image_to_nchw_op(context(), vten, staging_buffer_out.buffer()); + + // Copy data in and out of the tensor + if (dtype == vkapi::kChar && + !context()->adapter_ptr()->has_full_int8_buffers_support()) { + record_int8_image_to_nchw_noint8_op(context(), vten, staging_buffer_out); + } else { + record_image_to_nchw_op(context(), vten, staging_buffer_out.buffer()); + } // Execute command buffer submit_to_gpu(); // Extract data from output staging buffer std::vector data_out(staging_buffer_out.numel()); - copy_staging_to_ptr( - staging_buffer_out, data_out.data(), staging_buffer_out.nbytes()); + staging_buffer_out.copy_to(data_out.data(), staging_buffer_out.nbytes()); // All indices should be equal to the input data for (int i = 0; i < vten.numel(); i++) { - CHECK_VALUE(data_out, i, i); + CHECK_VALUE(data_out, i, data_in[i]); } } -TEST(VulkanToFromGPUShaderTest, to_gpu_and_from_gpu_test_texture) { +template +void compute_graph_round_trip_test( + std::vector& sizes, + utils::GPUMemoryLayout memory_layout = + utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, + 
vkapi::ScalarType dtype = vkapi::kFloat, + utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) { + if (dtype == vkapi::kHalf && !context()->adapter_ptr()->has_16bit_storage()) { + return; + } + + GraphConfig config; + ComputeGraph graph(config); + + ValueRef r_tensor = + graph.add_tensor(sizes, dtype, storage_type, memory_layout); + ValueRef r_staging_in = graph.set_input_tensor(r_tensor); + ValueRef r_staging_out = graph.set_output_tensor(r_tensor); + + graph.prepare(); + graph.encode_execute(); + + vTensorPtr tensor = graph.get_tensor(r_tensor); + + std::vector data_in(tensor->numel()); + for (int i = 0; i < data_in.size(); i++) { + data_in[i] = T(i * -1); + } + graph.copy_into_staging(r_staging_in, data_in.data(), data_in.size()); + + graph.execute(); + + std::vector data_out(tensor->staging_buffer_numel()); + graph.copy_from_staging(r_staging_out, data_out.data(), data_out.size()); + + for (int i = 0; i < data_in.size(); i++) { + CHECK_VALUE(data_out, i, data_in[i]); + } +} + +TEST(VulkanToFromGPUShaderTest, round_trip_tests) { // The below tests will fill each texel element with the value of the linear // buffer index that corresponds to it. The texel at position (0, 0, 0) will // be filled with the values [0, 1, 2, 3], the texel at position (1, 0, 0) @@ -1824,16 +2438,22 @@ TEST(VulkanToFromGPUShaderTest, to_gpu_and_from_gpu_test_texture) { }; #define RUN_TESTS(ctype, dtype) \ - run_to_gpu_test( \ + round_trip_test( \ + sizes, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, dtype); \ + round_trip_test( \ + sizes, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, dtype); \ + round_trip_test( \ + sizes, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, dtype); \ + compute_graph_round_trip_test( \ sizes, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, dtype); \ - run_to_gpu_test( \ + compute_graph_round_trip_test( \ sizes, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, dtype); \ - run_to_gpu_test( \ + compute_graph_round_trip_test( \ sizes, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, dtype); for (auto& sizes : to_test) { RUN_TESTS(float, vkapi::kFloat) - RUN_TESTS(torch::executor::Half, vkapi::kHalf) + RUN_TESTS(exec_aten::Half, vkapi::kHalf) } for (auto& sizes : to_test_int8) { @@ -1911,24 +2531,28 @@ void test_binary_op( } } -#define CALL_TEST_FN_FORALL_CONDITIONS(_) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, true) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, true) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, true) - -#define CALL_TEST_FN_FOR_W_PACKED(_) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, true) - -#define CALL_TEST_FN_FOR_C_PACKED(_) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, true) +#define CALL_TEST_FN_FORALL_CONDITIONS(_) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked, true) \ + _(vkapi::kFloat, 
utils::kTexture3D, utils::kChannelsPacked, true) + +#define CALL_TEST_FN_FOR_W_PACKED(_) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true) \ + _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, false) \ + _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, true) + +#define CALL_TEST_FN_FOR_C_PACKED(_) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true) \ + _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, false) \ + _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, true) TEST(VulkanComputeGraphOpsTest, add_smoke_test) { -#define RUN_TESTS(dtype, layout, prepack) \ +#define RUN_TESTS(dtype, storage, layout, prepack) \ test_binary_op("add", {17, 21}, {17, 21}, dtype, layout, prepack); \ test_binary_op("add", {17, 21}, {1, 1}, dtype, layout, prepack); \ test_binary_op("sub", {11, 22}, {11, 22}, dtype, layout, prepack); \ @@ -1949,9 +2573,11 @@ void test_mm( int K, int N, vkapi::ScalarType dtype, + utils::StorageType storage_type, utils::GPUMemoryLayout memory_layout, bool prepack = true) { GraphConfig config; + config.set_storage_type_override(storage_type); ComputeGraph graph(config); std::vector mat1_size = {M, K}; @@ -2008,38 +2634,42 @@ void test_mm( } TEST(VulkanComputeGraphOpsTest, mm_smoke_test) { -#define RUN_TESTS(dtype, layout, prepack) \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 5, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 7, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - layout, \ +#define RUN_TESTS(dtype, storage_type, layout, prepack) \ + test_mm( \ + /*B = */ 1, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 5, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 7, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 1, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + storage_type, \ + layout, \ prepack); CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS); @@ -2097,18 +2727,18 @@ void test_max_pool2d( fill_vtensor(graph, graph.inputs().at(0), base_val, /*iota = */ true); vTensorPtr t_in = graph.get_tensor(in_ioval.value); - std::vector input_data(t_in->gpu_numel()); + std::vector input_data(t_in->staging_buffer_numel()); graph.copy_from_staging( in_ioval.staging, input_data.data(), input_data.size()); graph.execute(); vTensorPtr t_out = graph.get_tensor(out_ioval.value); - std::vector output_data(t_out->gpu_numel()); + std::vector output_data(t_out->staging_buffer_numel()); graph.copy_from_staging( out_ioval.staging, output_data.data(), output_data.size()); vTensorPtr t_idx = graph.get_tensor(idx_ioval.value); - std::vector index_data(t_idx->gpu_numel()); + std::vector index_data(t_idx->staging_buffer_numel()); graph.copy_from_staging( idx_ioval.staging, index_data.data(), index_data.size()); @@ -2152,19 +2782,18 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - 
StorageBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); + StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { data_in[i] = i + 1; } - copy_ptr_to_staging( - data_in.data(), staging_buffer_in, sizeof(float) * in_numel); + staging_buffer_in.copy_from(data_in.data(), sizeof(float) * in_numel); // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StorageBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); + StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( @@ -2176,8 +2805,7 @@ void test_conv2d( // Extract data from output staging buffer std::vector data_out(out_numel); - copy_staging_to_ptr( - staging_buffer_out, data_out.data(), sizeof(float) * out_numel); + staging_buffer_out.copy_to(data_out.data(), sizeof(float) * out_numel); // Check data matches results copied from ATen-VK for (int i = 0; i < vten.numel(); i++) { @@ -2203,3 +2831,201 @@ TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) { 0, 3, 9, 0, 0, 6, 12, 0, 0, 5, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); } + +void test_grid_priors( + std::vector input_sizes, + std::vector output_sizes, + int stride, + double offset, + const std::vector& data_out_expected) { + GraphConfig config; + ComputeGraph graph(config); + + // Build graph + IOValueRef in = graph.add_input_tensor( + input_sizes, + vkapi::kFloat, + utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); + IOValueRef out; + out.value = graph.add_tensor( + output_sizes, + vkapi::kFloat, + utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); + + VK_GET_OP_FN("et_vk.grid_priors.default") + (graph, + {in.value, + graph.add_scalar(stride), + graph.add_scalar(offset), + out.value}); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + + vTensorPtr t_in = graph.get_tensor(in.value); + vTensorPtr t_out = graph.get_tensor(out.value); + // Resize input + graph.propagate_resize(); + + // run graph + graph.execute(); + + std::vector output_data(t_out->staging_buffer_numel()); + graph.copy_from_staging(out.staging, output_data.data(), output_data.size()); + + // check results + int h_out = utils::val_at(-2, t_out->sizes()); + int w_out = utils::val_at(-1, t_out->sizes()); + for (size_t i = 0; i < h_out; ++i) { + for (size_t j = 0; j < w_out; ++j) { + size_t idx_out = i * w_out + j; + CHECK_VALUE(output_data, idx_out, data_out_expected[idx_out]); + } + } +} + +TEST(VulkanComputeGraphOpsTest, grid_priors_test) { + test_grid_priors( + /*input size = */ {1, 5, 2, 3}, + /*output size = */ {6, 2}, + /*stride = */ 1, + /*offset = */ 0.0, + /*data_out_expected = */ {0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1}); + + test_grid_priors( + /*input size = */ {1, 5, 2, 3}, + /*output size = */ {6, 2}, + /*stride = */ 8, + /*offset = */ 0.5, + /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12}); +} + +void test_int4pack_mm( + std::vector MKN, + uint32_t group_size, + utils::StorageType storage_type) { + GraphConfig config; + ComputeGraph graph(config); + + const uint32_t M = MKN[0]; + const uint32_t K = MKN[1]; + const uint32_t N = MKN[2]; + + const std::vector mat1_size = {M, K}; + const std::vector mat2_size = {K, N}; + const std::vector mat2_q_size = {N, K / 2}; // Transposed and packed + const std::vector out_size = {M, N}; + + std::vector 
A_data = create_random_float_buffer(M * K); + IOValueRef A = graph.add_input_tensor(mat1_size, vkapi::kFloat, storage_type); + graph.copy_into_staging(A.staging, A_data.data(), A_data.size()); + + // Quantized but un-packed weights + std::vector B_quant_data = create_random_uint8_buffer(K * N, 0, 16); + + // Pack and transpose weights to correspond to int4 weight format + std::vector B_int4_data = + int4mm_pack_weights(mat2_size, B_quant_data.data()); + + IOValueRef B_int4 = + graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, storage_type); + graph.copy_into_staging( + B_int4.staging, B_int4_data.data(), B_int4_data.size()); + + const int k_groups = K / group_size; + + // Random scales and zeroes. Keep scales small to avoid overflow and zeroes in + // int4 range + IOValueRef scales_and_zeros = + graph.add_input_tensor({2, N, k_groups}, vkapi::kFloat, storage_type); + std::vector s_data(graph.numel_of(scales_and_zeros.value)); + const int zeros_stride = s_data.size() / 2; + for (size_t i = 0; i < zeros_stride; i++) { + s_data[i] = rand() % 100; + s_data[i + zeros_stride] = rand() % 16; + } + + graph.copy_into_staging( + scales_and_zeros.staging, s_data.data(), s_data.size()); + + IOValueRef out_int4; + + if (storage_type == utils::kBuffer) { + out_int4.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kBuffer); + } else { + out_int4.value = + graph.add_tensor(out_size, vkapi::kFloat, utils::kChannelsPacked); + } + + VK_GET_OP_FN("aten._weight_int4pack_mm.default") + (graph, + {A.value, + B_int4.value, + graph.add_scalar(group_size), + scales_and_zeros.value, + out_int4.value}); + + out_int4.staging = graph.set_output_tensor(out_int4.value); + + // Dequantized matmul for comparison + IOValueRef B_deq = + graph.add_input_tensor(mat2_size, vkapi::kFloat, storage_type); + std::vector B_deq_data = int4mm_dequantize_weights( + mat2_size, B_quant_data.data(), group_size, s_data.data()); + graph.copy_into_staging(B_deq.staging, B_deq_data.data(), B_deq_data.size()); + + IOValueRef out_deq; + out_deq.value = graph.add_tensor(out_size, vkapi::kFloat, storage_type); + + VK_GET_OP_FN("aten.mm.default") + (graph, {A.value, B_deq.value, out_deq.value}); + + out_deq.staging = graph.set_output_tensor(out_deq.value); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + graph.propagate_resize(); + graph.execute(); + + // Compare outputs + std::vector out_int4_data(graph.numel_of(out_int4.value)); + graph.copy_from_staging( + out_int4.staging, out_int4_data.data(), out_int4_data.size()); + + std::vector out_deq_data(graph.numel_of(out_deq.value)); + graph.copy_from_staging( + out_deq.staging, out_deq_data.data(), out_deq_data.size()); + + for (int i = 0; i < out_int4_data.size(); i++) { + CHECK_VALUE(out_int4_data, i, out_deq_data[i]); + } +} + +TEST(VulkanComputeGraphOpsTest, int4pack_mm_test) { + if (!context()->adapter_ptr()->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + + for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) { + // Vector multiplication, single group per row + test_int4pack_mm({1, 32, 1}, 32, storage_type); + + // Vector multiplication, multiple groups per row + test_int4pack_mm({1, 256, 1}, 64, storage_type); + + // Square matrices, single group per row + test_int4pack_mm({32, 32, 32}, 32, storage_type); + + // Irregular matrices, single group per row + test_int4pack_mm({37, 32, 19}, 32, storage_type); + + // Irregular matrices, multiple groups per row + test_int4pack_mm({37, 256, 19}, 64, storage_type); + } +} diff 
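In the int4pack_mm test above, the quantized weight tensor has shape {N, K / 2} because two 4-bit values are stored per byte, and scales_and_zeros carries one (scale, zero) pair per group of group_size elements along K (hence the {2, N, k_groups} shape). A rough sketch of the two-values-per-byte layout; the exact nibble order and transposition used by int4mm_pack_weights and the shaders are implementation details not shown here:

#include <cstdint>
#include <vector>

// Pack pairs of 4-bit values (0..15) into bytes, low nibble first
// (assumed order for illustration; the real packing routine may differ).
// Expects q.size() to be even, matching K / 2 packed columns per row.
std::vector<uint8_t> pack_int4(const std::vector<uint8_t>& q) {
  std::vector<uint8_t> out(q.size() / 2);
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] = (q[2 * i] & 0x0F) | ((q[2 * i + 1] & 0x0F) << 4);
  }
  return out;
}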
--git a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl index c16ad5d14b..38c9befec6 100644 --- a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl +++ b/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl @@ -26,6 +26,11 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int niter = 1; layout(constant_id = 4) const int nvec = 1; layout(constant_id = 5) const int local_group_size = 1; +// The address mask works as a modulo because x % 2^n == x & (2^n - 1). +// This will help us limit address accessing to a specific set of unique +// addresses depending on the access size we want to measure. +layout(constant_id = 6) const int addr_mask = 1; +layout(constant_id = 7) const int workgroup_width = 1; $if MEMTYPE == "shared": shared vec4 A[nvec]; @@ -36,15 +41,7 @@ void main() { A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0]; memoryBarrierShared(); - // The address mask works as a modulo because x % 2^n == x & (2^n - 1). - // This will help us limit address accessing to a specific set of unique - // addresses depending on the access size we want to measure. - const int addr_mask = nvec - 1; vec4 sum = vec4(0); - - // This is to distribute the accesses to unique addresses across the workgroups, once the - // size of the access excedes the workgroup width. - const uint workgroup_width = local_group_size * niter * ${NUNROLL}; uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; int i = 0; diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl index d848fc0475..7ab67bd2d0 100644 --- a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl @@ -21,17 +21,14 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int niter = 1; layout(constant_id = 4) const int nvec = 1; layout(constant_id = 5) const int local_group_size = 1; +// The address mask works as a modulo because x % 2^n == x & (2^n - 1). +// This will help us limit address accessing to a specific set of unique +// addresses depending on the access size we want to measure. +layout(constant_id = 6) const int addr_mask = 1; +layout(constant_id = 7) const int workgroup_width = 1; void main() { - // The address mask works as a modulo because x % 2^n == x & (2^n - 1). - // This will help us limit address accessing to a specific set of unique - // addresses depending on the access size we want to measure. - const int addr_mask = nvec - 1; vec4 sum = vec4(0); - - // This is to distribute the accesses to unique addresses across the workgroups, once the - // size of the access excedes the workgroup width. 
- const uint workgroup_width = local_group_size * niter * ${NUNROLL}; uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; int i = 0; diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 0d312ee87c..9af908eb17 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -40,7 +40,7 @@ void reg_count(const App& app) { uint32_t NITER; auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); + StagingBuffer buffer(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "reg_count_" + std::to_string(nreg); @@ -164,7 +164,7 @@ void warp_size(const App& app, const bool verbose = false) { uint32_t NITER; auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_physical"; @@ -224,7 +224,7 @@ void warp_size(const App& app, const bool verbose = false) { // doesn't depend on kernel timing, so the extra wait time doesn't lead to // inaccuracy. auto bench_sm = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_scheduler"; @@ -242,7 +242,7 @@ void warp_size(const App& app, const bool verbose = false) { }); std::vector data(app.nthread_logic); - copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); + out_buf.copy_to(data.data(), out_buf.nbytes()); if (verbose) { std::stringstream ss; diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h index 8cb0da49ca..31137b11ee 100644 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -35,8 +35,8 @@ void buf_cacheline_size(const App& app) { uint32_t NITER; auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), vkapi::kFloat, 1); + StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StagingBuffer out_buf(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "buf_cacheline_size"; @@ -123,8 +123,17 @@ void _bandwidth( // Number of vectors that fit in this iteration const uint32_t nvec_access = access_size / VEC_SIZE; - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( + // The address mask works as a modulo because x % 2^n == x & (2^n - 1). + // This will help us limit address accessing to a specific set of unique + // addresses depending on the access size we want to measure. + const uint32_t addr_mask = nvec_access - 1; + + // This is to distribute the accesses to unique addresses across the + // workgroups, once the size of the access excedes the workgroup width. 
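The addr_mask trick referenced in the comments above only works when the number of accessed elements is a power of two; masking is then equivalent to a modulo but avoids an integer division in the hot loop. A small standalone check of the identity and of the offset wrap, illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t nvec_access = 64;             // must be a power of two
  const uint32_t addr_mask = nvec_access - 1;  // 63 == 0b111111

  // x % 2^n == x & (2^n - 1) for any non-negative x.
  for (uint32_t x = 0; x < 1000; ++x) {
    assert((x % nvec_access) == (x & addr_mask));
  }

  // The same wrap-around the shaders use to pick the next address to touch.
  const uint32_t workgroup_width = 256;  // e.g. local_x * NITER * NUNROLL
  uint32_t offset =
      (3 /*workgroup id*/ * workgroup_width + 17 /*local id*/) & addr_mask;
  assert(offset == (3 * 256 + 17) % 64);  // 785 % 64 == 17
  return 0;
}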
+ const uint32_t workgroup_width = local_x * NITER * NUNROLL; + + StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; @@ -136,7 +145,11 @@ void _bandwidth( pipeline_barrier, {global_x, 1, 1}, {local_x, 1, 1}, - {SV(NITER), SV(nvec_access), SV(local_x)}, + {SV(NITER), + SV(nvec_access), + SV(local_x), + SV(addr_mask), + SV(workgroup_width)}, VK_NULL_HANDLE, 0, in_buf.buffer(), diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h index bb8a3371a9..c9ff133f1e 100644 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -61,7 +61,7 @@ void tex_cacheline_concurr(const App& app) { vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); vkapi::PipelineBarrier pipeline_barrier{}; @@ -164,7 +164,16 @@ void tex_bandwidth(const App& app) { // Number of texels that fit in this iteration const uint32_t ntexel_access = access_size / VEC_SIZE; - StorageBuffer out_buf( + // The address mask works as a modulo because x % 2^n == x & (2^n - 1). + // This will help us limit address accessing to a specific set of unique + // addresses depending on the access size we want to measure. + const uint32_t addr_mask = ntexel_access - 1; + + // This is to distribute the accesses to unique addresses across the + // workgroups, once the size of the access excedes the workgroup width. + const uint32_t workgroup_width = local_x * NITER * NUNROLL; + + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; @@ -174,7 +183,11 @@ void tex_bandwidth(const App& app) { pipeline_barrier, {global_x, 1, 1}, {local_x, 1, 1}, - {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, + {SV(NITER), + SV(ntexel_access), + SV(local_x), + SV(addr_mask), + SV(workgroup_width)}, VK_NULL_HANDLE, 0, in_tensor.image(), diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 1865c32acd..7e85c25fae 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -57,7 +57,7 @@ def preprocess( # noqa: C901 MeanToSumDiv(), SpecPropPass(), ConstraintBasedSymShapeEvalPass(), - MemoryPlanningPass("greedy"), + MemoryPlanningPass(), ] new_gm = program.graph_module diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 1ac7867f3c..98870bf0e1 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -32,6 +32,15 @@ if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() +# NB: Enabling this will serialize execution of delegate instances Keeping this +# OFF by default to maintain existing behavior, to be revisited. +option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE + "Enable workspace sharing across different delegate instances" OFF +) +if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE) + add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE) +endif() + set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
set(_common_compile_options -Wno-deprecated-declarations -fPIC) diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index 33a0bfaf30..0c3d7e1442 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -105,9 +105,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/backends/xnnpack/cmake/Dependencies.cmake b/backends/xnnpack/cmake/Dependencies.cmake index 40e4e72c38..b76c54bee6 100644 --- a/backends/xnnpack/cmake/Dependencies.cmake +++ b/backends/xnnpack/cmake/Dependencies.cmake @@ -36,6 +36,10 @@ set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "" ) +set(XNNPACK_ENABLE_KLEIDIAI + OFF + CACHE BOOL "" +) add_subdirectory("${XNNPACK_SOURCE_DIR}") include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR}) list(APPEND xnnpack_third_party XNNPACK) diff --git a/backends/xnnpack/operators/__init__.py b/backends/xnnpack/operators/__init__.py index d25cc58d5a..b2653a5fdc 100644 --- a/backends/xnnpack/operators/__init__.py +++ b/backends/xnnpack/operators/__init__.py @@ -10,6 +10,7 @@ op_add, op_addmm, op_avg_pooling2d, + op_bmm, op_cat, op_ceiling, op_clamp, diff --git a/backends/xnnpack/operators/op_bmm.py b/backends/xnnpack/operators/op_bmm.py new file mode 100644 index 0000000000..8c008a5554 --- /dev/null +++ b/backends/xnnpack/operators/op_bmm.py @@ -0,0 +1,54 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
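The new op_bmm.py visitor that follows lowers aten.bmm.default onto XNNPACK's batch matrix multiply node. For reference, the expected semantics are: inputs of shape (B, M, K) and (B, K, N) produce an output of shape (B, M, N), computed independently per batch. A plain sketch of those semantics only; the delegate itself dispatches to the XNNPACK kernel:

// Reference bmm semantics, not the XNNPACK implementation.
void bmm_reference(const float* a, const float* b, float* out,
                   int B, int M, int K, int N) {
  for (int batch = 0; batch < B; ++batch) {
    const float* A = a + batch * M * K;
    const float* Bm = b + batch * K * N;
    float* O = out + batch * M * N;
    for (int m = 0; m < M; ++m) {
      for (int n = 0; n < N; ++n) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) {
          acc += A[m * K + k] * Bm[k * N + n];
        }
        O[m * N + n] = acc;
      }
    }
  }
}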
+ +from typing import Dict + +import torch +from executorch.backends.xnnpack.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( + XNNBatchMatrixMultiply, + XNNGraph, + XNode, +) +from executorch.backends.xnnpack.utils.utils import get_input_node + + +@register_node_visitor +class BMMVisitor(NodeVisitor): + target = "aten.bmm.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + + self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids) + + # input1 + input1_id = vals_to_ids[get_input_node(node, 0)] + + # input2 + input2_id = vals_to_ids[get_input_node(node, 1)] + + # output + output_id = vals_to_ids[node] + + ser_node = XNode( + xnode_union=XNNBatchMatrixMultiply( + input1_id=input1_id, input2_id=input2_id, output_id=output_id, flags=0 + ), + debug_handle=debug_handle, + ) + xnn_graph.xnodes.append(ser_node) diff --git a/backends/xnnpack/operators/op_conv2d.py b/backends/xnnpack/operators/op_conv2d.py index 5661a9a4d3..28da480574 100644 --- a/backends/xnnpack/operators/op_conv2d.py +++ b/backends/xnnpack/operators/op_conv2d.py @@ -52,6 +52,9 @@ def define_node( ) # NHWC input kwargs["input1_id"] = vals_to_ids[get_input_node(node, 0)] + # filter shape for pytorch convolution is (oc, inc/groups, height, width) + # shape for xnnpack convolution is (oc, height, width, inc/groups), to convert + # to the proper shape, this is essentially a NCHW to NHWC conversion kernel_node = get_input_node(node, 1) kernel_shape = get_shape(kernel_node) groups = cast(int, node.args[8]) @@ -65,19 +68,13 @@ def define_node( is_depthwise_conv = (group_input_channels == 1) and ( group_output_channels % group_input_channels == 0 ) - # filter - # filter shape for pytorch convolution is (oc, inc/groups, height, width) - # shape for xnnpack convolution is (oc, height, width, inc/groups), to convert - # to the proper shape, this is essentially a NCHW to NHWC conversion - weight_node = get_input_node(node, 1) weight_quant_params = QuantParams.from_weights( - weight_node, self._exported_program + kernel_node, self._exported_program ) - - fp32_static_weights = weight_node.meta["val"].dtype == torch.float16 + fp32_static_weights = kernel_node.meta["val"].dtype == torch.float16 self.define_tensor( - weight_node, + kernel_node, xnn_graph, vals_to_ids, convert_to_nhwc=True, diff --git a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py index d47f9f479e..f8f0c54ee6 100644 --- a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py @@ -12,7 +12,15 @@ register_node_visitor, ) from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import XNNGraph -from executorch.backends.xnnpack.utils.utils import get_input_node +from executorch.backends.xnnpack.utils.quant_utils import ( + is_per_channel_group, + is_per_token, +) +from executorch.backends.xnnpack.utils.utils import ( + check_or_raise, + get_input_node, + is_param_node, +) @register_node_visitor @@ -65,3 +73,40 @@ def define_node( dq_input = get_input_node(node, 0) if dq_input in vals_to_ids: vals_to_ids[node] = vals_to_ids[dq_input] + + +@register_node_visitor +class OpDequantizeAffine(NodeVisitor): + target = "quant.dequantize_affine.default" + + def 
__init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + """ + We always define dequantize affine nodes because they are always explicit + """ + if is_per_channel_group(node): + check_or_raise( + is_param_node(self._exported_program, node.all_input_nodes[0]), + f"Expected quantize affine node with per-token semantics to be used " + f"in front of a weight node, but found node {node.all_input_nodes[0]}", + ) + # Affine dequantize was recognized as per channel group which means that it should + # be skipped as this means it is used in front of a weight node + return + + check_or_raise( + is_per_token(node), + "Expecting Affine Dequantized Op to have per-token semantics", + ) + # This must be a per-token affine dequantized node, so let us serialize as such + dq_input = get_input_node(node, 0) + if dq_input in vals_to_ids: + vals_to_ids[node] = vals_to_ids[dq_input] diff --git a/backends/xnnpack/operators/op_dynamic_quantize_ops.py b/backends/xnnpack/operators/op_dynamic_quantize_ops.py index bf5f3b7b09..23047e731f 100644 --- a/backends/xnnpack/operators/op_dynamic_quantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_quantize_ops.py @@ -17,6 +17,10 @@ XNNGraph, XNode, ) +from executorch.backends.xnnpack.utils.quant_utils import ( + is_per_channel_group, + is_per_token, +) from executorch.backends.xnnpack.utils.utils import check_or_raise, get_input_node @@ -118,3 +122,56 @@ def define_node( debug_handle=debug_handle, ) xnn_graph.xnodes.append(ser_node) + + +@register_node_visitor +class OpQuantizeAffine(NodeVisitor): + target = "quant.quantize_affine.default" + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + """ + We always define quantize affine nodes because they are always explicit + """ + if is_per_channel_group(node): + # Affine quantized was recognized as per channel group which means that it should + # be skipped as this means it is used in front of a weight node + return + + check_or_raise( + is_per_token(node), + "Encountered affine quantized op which does not have per-token semantics", + ) + # Treat this node as dynamic per-token quantization + q_input = get_input_node(node, 0) + + # fp32 input + self.define_tensor(q_input, xnn_graph, vals_to_ids) + input_id = vals_to_ids[q_input] + + # dynamic quantized output + input_quant_params = QuantParams.from_q_dq_node(node) + # qinput isn't needed for dynamically quantized nodes since it will always be + # the output of a convert node. 
Instead we set q_input to the node itself so + # we can extract the shape from the dq output + input_quant_params.q_input = node + input_quant_params.is_input = False + check_or_raise( + input_quant_params.is_dynamic, + "Internal Error, dynamically quantized node expected dynamic quantized params", + ) + self.define_tensor( + node, xnn_graph, vals_to_ids, quant_params=input_quant_params + ) + output_id = vals_to_ids[node] + + ser_node = XNode( + xnode_union=XNNConvert(input_id=input_id, output_id=output_id, flags=0), + debug_handle=debug_handle, + ) + xnn_graph.xnodes.append(ser_node) diff --git a/backends/xnnpack/operators/op_prelu.py b/backends/xnnpack/operators/op_prelu.py index 213ffdb9c5..2d3fdc2258 100644 --- a/backends/xnnpack/operators/op_prelu.py +++ b/backends/xnnpack/operators/op_prelu.py @@ -23,7 +23,7 @@ @register_node_visitor class PReLUVisitor(NodeVisitor): - target = "aten._prelu_kernel.default" + target = "aten.prelu.default" def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/xnnpack/operators/op_sdpa.py b/backends/xnnpack/operators/op_sdpa.py index 3f4149aca6..e0ec7b37b3 100644 --- a/backends/xnnpack/operators/op_sdpa.py +++ b/backends/xnnpack/operators/op_sdpa.py @@ -66,9 +66,12 @@ def define_node( # Hack to broadcast the scale q_shape = get_shape(get_input_node(node, 0)) - scale = cast(float, node.kwargs["scale"]) + embedding_dim = q_shape[-1] + scale = 1 / (embedding_dim**0.5) + if "scale" in node.kwargs and node.kwargs["scale"]: + scale = cast(float, node.kwargs["scale"]) - t = torch.full((q_shape[-1],), scale, dtype=mask_dtype) + t = torch.full((embedding_dim,), scale, dtype=mask_dtype) scale_node = self.get_fake_attr("scale", t) self.define_tensor( scale_node, diff --git a/backends/xnnpack/operators/op_skip_ops.py b/backends/xnnpack/operators/op_skip_ops.py index d6a54c901e..6597c0568e 100644 --- a/backends/xnnpack/operators/op_skip_ops.py +++ b/backends/xnnpack/operators/op_skip_ops.py @@ -97,6 +97,15 @@ class OpSymSizeInt(OpSkipOps): target = "sym_size.int" +@register_node_visitor +class OpChooseQparamsAffine(OpSkipOps): + """ + do nothing if node is choose_qparams_affine.default + """ + + target = "quant.choose_qparams_affine.default" + + @register_node_visitor class OpChooseQparamsToken(OpSkipOps): """ diff --git a/backends/xnnpack/operators/op_static_resize_bilinear_2d.py b/backends/xnnpack/operators/op_static_resize_bilinear_2d.py index 96e6875078..83f3f0ea05 100644 --- a/backends/xnnpack/operators/op_static_resize_bilinear_2d.py +++ b/backends/xnnpack/operators/op_static_resize_bilinear_2d.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
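The quantize_affine/dequantize_affine visitors above only serialize ops with per-token semantics, i.e. dynamic quantization where every token (row) of the activation gets its own scale and zero point derived from that row's observed range. A rough sketch of one common convention for choosing those parameters; the exact rounding and clamping rules of the decomposed quant ops may differ:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

struct QParams {
  float scale;
  int32_t zero_point;
};

// One token = one row of the activation (assumed non-empty).
// Asymmetric int8 affine parameters from the row's min/max, always
// including zero in the representable range.
QParams choose_qparams_per_token(const std::vector<float>& row,
                                 int32_t qmin = -128, int32_t qmax = 127) {
  float lo = std::min(0.f, *std::min_element(row.begin(), row.end()));
  float hi = std::max(0.f, *std::max_element(row.begin(), row.end()));
  float scale = (hi - lo) / static_cast<float>(qmax - qmin);
  if (scale == 0.f) scale = 1.f;  // degenerate all-zero row
  int32_t zp = qmin - static_cast<int32_t>(std::round(lo / scale));
  return {scale, std::clamp(zp, qmin, qmax)};
}
// Quantization then applies q = clamp(round(x / scale) + zero_point, qmin, qmax).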
-from typing import cast, Dict, List +from typing import cast, Dict import torch from executorch.backends.xnnpack.operators.node_visitor import ( @@ -23,7 +23,7 @@ @register_node_visitor class StaticResizeBilinear2DVisitor(NodeVisitor): - target = "aten.upsample_bilinear2d.default" + target = "aten.upsample_bilinear2d.vec" def __init__(self, *args) -> None: super().__init__(*args) @@ -44,7 +44,7 @@ def define_node( # output output_id = vals_to_ids[node] - new_size = cast(List[int], node.args[1]) + new_size = node.meta["val"].shape[-2:] flags = XNN_FLAG_ALIGN_CORNERS if cast(bool, node.args[2]) else 0 diff --git a/backends/xnnpack/operators/quant_params.py b/backends/xnnpack/operators/quant_params.py index d60c300276..44908ac7fc 100644 --- a/backends/xnnpack/operators/quant_params.py +++ b/backends/xnnpack/operators/quant_params.py @@ -10,7 +10,15 @@ import torch from executorch.backends.xnnpack.passes.tag_implicit_q_dq_pass import TagImplicitQDqPass -from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant +from executorch.backends.xnnpack.utils.quant_utils import ( + extract_qdq_affine_op_args_for_decomposed_ops, + is_affine_qdq, + is_dequant, + is_dynamic_qdq, + is_per_channel, + is_per_channel_group, + is_quant, +) from executorch.backends.xnnpack.utils.utils import ( check_or_raise, get_param_tensor, @@ -154,30 +162,18 @@ def from_q_dq_node( q_input = quant_node.all_input_nodes[0] # TODO: Use presence of choose_qparam node to determine if this is a dynamic quantization - if quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.quantize_per_token.default, - exir_ops.edge.quantized_decomposed.dequantize_per_token.default, - ]: + if is_dynamic_qdq(quant_node): return cls._from_dynamic_input_node(quant_node) - per_channel = quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_channel.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, - ] - - _groupwise = False - if quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default, - ]: - # This is a sub-category of per channel quantization - per_channel = True - _groupwise = True - - scale = quant_node.args[1] - zp = quant_node.args[2] + per_channel = is_per_channel(quant_node) + + _groupwise = is_per_channel_group(quant_node) + quant_node_args = quant_node.args + if _groupwise and is_affine_qdq(quant_node): + quant_node_args = extract_qdq_affine_op_args_for_decomposed_ops(quant_node) + + scale = quant_node_args[1] + zp = quant_node_args[2] axis = 0 if per_channel: assert isinstance(scale, torch.fx.Node) and isinstance(scale.target, str) @@ -193,10 +189,15 @@ def _get_tensor(node): scale = _get_tensor(scale) zp = _get_tensor(zp) - axis = cast(int, quant_node.args[3]) + axis = cast(int, quant_node_args[3]) if _groupwise: scale_tensor = cast(torch.Tensor, scale) + if scale_tensor.ndim == 1: + scale_tensor = scale_tensor.reshape(-1, 1) + zp = zp.reshape(-1, 1) + scale = scale_tensor + assert ( scale_tensor.ndim == 2 ), "Weight scale must be 2D for per_channel_group [de]quant node, got {scale.ndim}D" @@ -204,23 +205,23 @@ def _get_tensor(node): check_or_raise( bool( - quant_node.args[-1] != torch.uint8 - or quant_node.args[-1] != torch.quint8 + quant_node_args[-1] != torch.uint8 + or quant_node_args[-1] != torch.quint8 ), "XNNPACK does 
not support unsigned quantization", ) if _groupwise: - _ = quant_node.args[-1] # output dtype - not used - group_size = cast(int, quant_node.args[-2]) - dtype = cast(torch.dtype, quant_node.args[-3]) - qmax = cast(int, quant_node.args[-4]) - qmin = cast(int, quant_node.args[-5]) + _ = quant_node_args[-1] # output dtype - not used + group_size = cast(int, quant_node_args[-2]) + dtype = cast(torch.dtype, quant_node_args[-3]) + qmax = cast(int, quant_node_args[-4]) + qmin = cast(int, quant_node_args[-5]) else: group_size = 0 - dtype = cast(torch.dtype, quant_node.args[-1]) - qmax = cast(int, quant_node.args[-2]) - qmin = cast(int, quant_node.args[-3]) + dtype = cast(torch.dtype, quant_node_args[-1]) + qmax = cast(int, quant_node_args[-2]) + qmin = cast(int, quant_node_args[-3]) is_output = any( user_node.op == "output" for user_node in quant_node.users.keys() @@ -244,26 +245,14 @@ def _get_tensor(node): def from_weights( cls, tensor_node: torch.fx.Node, ep: Optional[ExportedProgram] = None ) -> Optional[QuantParams]: - # Ignore transpose for weights - # TODO:T148540997 remove the t_copy/permute_copy check when convert addmm to linear - dq = ( - tensor_node.all_input_nodes[0] - if tensor_node.target - in ( - exir_ops.edge.aten.permute_copy.default, - exir_ops.edge.aten.t_copy.default, - ) - else tensor_node - ) - # check input of t_copy/permute_copy is dequant - if not is_dequant(dq): + if not is_dequant(tensor_node): return None # source node for quant params - src = dq + src = tensor_node # is input of dq is q? - dq_input = dq.all_input_nodes[0] + dq_input = src.all_input_nodes[0] if is_quant(dq_input): src = dq_input diff --git a/backends/xnnpack/partition/TARGETS b/backends/xnnpack/partition/TARGETS index f11695460f..bed4aa3ea4 100644 --- a/backends/xnnpack/partition/TARGETS +++ b/backends/xnnpack/partition/TARGETS @@ -15,6 +15,7 @@ runtime.python_library( ":configs", ":partitioner_graphs", "//executorch/backends/xnnpack:xnnpack_preprocess", + "//executorch/backends/xnnpack/partition/config:xnnpack_partitioner_configs", "//executorch/exir:delegate", "//executorch/exir:lib", "//executorch/exir/backend:partitioner", diff --git a/backends/xnnpack/partition/config/TARGETS b/backends/xnnpack/partition/config/TARGETS new file mode 100644 index 0000000000..adfbf95a72 --- /dev/null +++ b/backends/xnnpack/partition/config/TARGETS @@ -0,0 +1,20 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.python_library( + name = "xnnpack_partitioner_configs", + srcs = glob([ + "*.py", + ]), + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/exir:lib", + "//executorch/exir/backend:partitioner", + "//executorch/exir/backend:utils", + "//executorch/exir/backend/canonical_partitioners:config_partitioner_lib", + ], +) diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py new file mode 100644 index 0000000000..ed105dc1f5 --- /dev/null +++ b/backends/xnnpack/partition/config/__init__.py @@ -0,0 +1,110 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
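# A small sketch of the groupwise scale/zero-point handling added to
# QuantParams.from_q_dq_node above: per_channel_group [de]quant expects a 2-D
# weight scale, but affine q/dq ops may hand back a 1-D tensor, which is reshaped
# into a column vector before the 2-D assertion. The helper name and tensor values
# below are made up for illustration.
import torch

def normalize_group_scales(scale: torch.Tensor, zp: torch.Tensor):
    if scale.ndim == 1:
        scale = scale.reshape(-1, 1)
        zp = zp.reshape(-1, 1)
    assert scale.ndim == 2, "Weight scale must be 2D for per_channel_group [de]quant"
    return scale, zp

scale, zp = normalize_group_scales(torch.rand(8), torch.zeros(8, dtype=torch.int32))
print(scale.shape, zp.shape)  # torch.Size([8, 1]) torch.Size([8, 1])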
+ + +from typing import List, Type + +from executorch.backends.xnnpack.partition.config.gemm_configs import ( + AddmmConfig, + ConvolutionConfig, + LinearConfig, + MMConfig, +) + +from executorch.backends.xnnpack.partition.config.generic_node_configs import ( + AbsConfig, + AddConfig, + AvgPoolingConfig, + BMMConfig, + CatConfig, + CeilConfig, + ClampConfig, + ConstantPadConfig, + DeQuantizedPerTensorConfig, + DivConfig, + FloorConfig, + HardswishConfig, + # EluConfig, + HardtanhConfig, + LeakyReLUConfig, + MaximumConfig, + MaxPool2dConfig, + MeanDimConfig, + MinimumConfig, + MulConfig, + NegConfig, + PermuteConfig, + PowConfig, + QuantizedPerTensorConfig, + ReLUConfig, + # SDPAConfig, TODO: D60553559: preserving SDPA for fairseq fails + SigmoidConfig, + SliceCopyConfig, + SoftmaxConfig, + SquareRootConfig, + SubConfig, + UpsampleBilinear2dConfig, +) +from executorch.backends.xnnpack.partition.config.node_configs import ( + BatchNormConfig, + MaxDimConfig, + PreluConfig, +) +from executorch.backends.xnnpack.partition.config.quant_affine_configs import ( + ChooseQParamsAffineConfig, + DeQuantizeAffineConfig, + QuantizeAffineConfig, +) +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + XNNPartitionerConfig, +) + +ALL_PARTITIONER_CONFIGS: List[Type[XNNPartitionerConfig]] = [ + AbsConfig, + AddConfig, + AddmmConfig, + AvgPoolingConfig, + BatchNormConfig, + BMMConfig, + CatConfig, + CeilConfig, + ConstantPadConfig, + ConvolutionConfig, + ClampConfig, + DivConfig, + # EluConfig, # Waiting for PyTorch Pin Update + FloorConfig, + HardtanhConfig, + HardswishConfig, + LeakyReLUConfig, + LinearConfig, + MaxDimConfig, + MaximumConfig, + MaxPool2dConfig, + MeanDimConfig, + MinimumConfig, + MMConfig, + MulConfig, + NegConfig, + PermuteConfig, + PowConfig, + PreluConfig, + ReLUConfig, + # SDPAConfig, TODO: D60553559: preserving SDPA for fairseq fails + SigmoidConfig, + SliceCopyConfig, + SoftmaxConfig, + SquareRootConfig, + SubConfig, + UpsampleBilinear2dConfig, + # Quant/Dequant Op Configs + QuantizedPerTensorConfig, + DeQuantizedPerTensorConfig, + # Quant Affine Configs to preserve decomp + QuantizeAffineConfig, + DeQuantizeAffineConfig, + ChooseQParamsAffineConfig, +] diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py new file mode 100644 index 0000000000..cbcb14899d --- /dev/null +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -0,0 +1,444 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
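# The registry above is a flat list of config classes; the partitioner shown later
# in this diff instantiates every entry it is given. A hedged sketch of how a
# caller might narrow that list, assuming only the public ALL_PARTITIONER_CONFIGS
# symbol and the class names listed above:
from executorch.backends.xnnpack.partition.config import ALL_PARTITIONER_CONFIGS

# keep only the GEMM-style configs, selected by class name
gemm_only = [
    cfg
    for cfg in ALL_PARTITIONER_CONFIGS
    if cfg.__name__ in {"AddmmConfig", "ConvolutionConfig", "LinearConfig", "MMConfig"}
]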
+ +import logging +from itertools import chain +from typing import cast, List, Optional, Tuple + +import torch +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, + XNNPartitionerConfig, +) +from executorch.backends.xnnpack.utils.quant_utils import ( + extract_qdq_affine_op_args_for_decomposed_ops, + is_affine_qdq, + is_dequant, + is_dynamic_qdq, + is_per_channel, + is_per_channel_group, + is_qparam, + is_quant, +) +from executorch.backends.xnnpack.utils.utils import ( + get_input_node, + is_getitem, + is_node, + is_param_node, +) +from executorch.exir.backend.canonical_partitioners.config_partitioner import ( + format_target_name, +) +from executorch.exir.backend.utils import WhyNoPartition +from torch.export import ExportedProgram +from torch.fx.passes.utils.source_matcher_utils import ( + get_source_partitions, + SourcePartition, +) + +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + + +class GEMMConfig(XNNPartitionerConfig): + """ + GEMM-like ops like Convolution, Addmm, Linear, mostly behave in the same way, in which we + have some weight, bias, and activation node. The only difference between these types + of ops are that the weight, bias, and activations are in different indicies of the + nodes arguments, this class helps to generalize the logic needed to partition these + different ops + """ + + def __init__(self, weight_idx, bias_idx, act_idx, fused_acts, **kwargs): + super().__init__(**kwargs) + self.weight_idx = weight_idx + self.bias_idx = bias_idx + self.act_idx = act_idx + self.fused_acts = fused_acts + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + if not self.check_common_constraints(node, ep): + # short circuit if we don't pass common constraints + return False + + is_valid, _ = self.get_deps(node, ep) + if not is_valid: + why(node, "Failed to get valid dependent nodes.") + return is_valid + + def get_node_and_deps( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + partition = [node] + _, deps = self.get_deps(node, ep) + partition.extend(deps) + + return partition + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + return None + + def _detect_precision(self, node: torch.fx.Node) -> ConfigPrecisionType: + weight = get_input_node(node, self.weight_idx) + + if not is_dequant(weight): + return ConfigPrecisionType.FP32 + + activation = get_input_node(node, self.act_idx) + if is_dynamic_qdq(activation): + return ConfigPrecisionType.DYNAMIC_QUANT + + return ConfigPrecisionType.STATIC_QUANT + + def get_deps( + self, + node: torch.fx.Node, + ep: ExportedProgram, + ) -> Tuple[bool, List[torch.fx.Node]]: + """ + Gets all dependencies for this gemm partition. 
Returns a tuple of + a bool indicating if the deps are valid and a list of all the + dep nodes + """ + precision = self._detect_precision(node) + if precision not in self.supported_precision_types(): + # detected precision but it is either disabled or not supported + return (False, []) + + valid_bias, bias_deps = self._get_bias_deps(node, ep, precision) + valid_weight, weight_deps = self._get_weight_deps(node, ep, precision) + valid_act, act_deps = self._get_act_deps(node, ep, precision) + valid_output, output_deps = self._get_output_deps(node, ep, precision) + + valid_deps = valid_bias and valid_weight and valid_act and valid_output + deps = list(chain(bias_deps, weight_deps, act_deps, output_deps)) + + return valid_deps, deps + + def _get_weight_deps( + self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType + ) -> Tuple[bool, List[torch.fx.Node]]: + gemm_deps = [] + if precision == ConfigPrecisionType.FP32: + # First find the weight + weight_node = get_input_node(node, self.weight_idx) + if not is_param_node(ep, weight_node): + return (False, []) # weight must be a static param + gemm_deps.append(weight_node) + + return (True, gemm_deps) + else: + # Quantized Weight deps + dequant_node = get_input_node(node, self.weight_idx) + if not is_dequant(dequant_node): + return False, [] + gemm_deps.append(dequant_node) + weight = get_input_node(dequant_node, 0) + if not is_param_node(ep, weight): + return False, [] + gemm_deps.append(weight) + + if is_per_channel(dequant_node) or is_per_channel_group(dequant_node): + if len(dequant_node.all_input_nodes) < 2: + # Expected channel quantized to have scale/zp nodes + return False, [] + + gemm_deps.extend(dequant_node.all_input_nodes[1:3]) + return (True, gemm_deps) + + def _get_output_deps( + self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType + ) -> Tuple[bool, List[torch.fx.Node]]: + gemm_deps = [] + if precision == ConfigPrecisionType.STATIC_QUANT: + # Look for fused activations and tail end quant node + node_users = list(node.users.keys()) + if len(node_users) != 1: + # Expect quantized node to have a single output (fused act or dequant) + return False, [] + + # Check if the quantized pattern has a fused activation + n_output = node_users[0] + if ( + n_output.op == "call_function" + and format_target_name(n_output.target.__name__) in self.fused_acts + ): + gemm_deps.append(n_output) + fused_out_users = list(n_output.users.keys()) + if len(fused_out_users) == 1: + n_output = fused_out_users[0] + + if not is_quant(n_output): + # Expected gemm_node --> fused_act (optional) --> dequant + return (False, []) + gemm_deps.append(n_output) + elif precision == ConfigPrecisionType.FP32: + # Look for fused activations only, and partition with fp32 op + node_users = list(node.users.keys()) + if len(node_users) == 1: + n_output = node_users[0] + if ( + n_output.op == "call_function" + and format_target_name(n_output.target.__name__) in self.fused_acts + ): + gemm_deps.append(n_output) + + # FP32 and Dynamic Quant have no output dependencies + return (True, gemm_deps) + + def _get_bias_deps( + self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType + ) -> Tuple[bool, List[torch.fx.Node]]: + gemm_deps = [] + if len(node.all_input_nodes) > 2 and self.bias_idx: + bias_node = get_input_node(node, self.bias_idx) + if bias_node: + if not is_param_node(ep, bias_node): + return (False, []) # bias node must be a static param + gemm_deps.append(bias_node) + + return (True, gemm_deps) + + def 
_get_act_deps( + self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType + ) -> Tuple[bool, List[torch.fx.Node]]: + gemm_deps = [] + if precision == ConfigPrecisionType.FP32: + return (True, []) + else: + dq_input = get_input_node(node, self.act_idx) + if not is_dequant(dq_input): + # Expected static quant input to be dequant node + return False, [] + gemm_deps.append(dq_input) + if precision == ConfigPrecisionType.STATIC_QUANT: + # if static quant we are done after finding first dq_input + return (True, gemm_deps) + + # q input node + q_input = get_input_node(dq_input, 0) + if not is_quant(q_input): + return (False, []) + + gemm_deps.append(q_input) + q_input_args = q_input.args + if is_affine_qdq(q_input): + q_input_args = extract_qdq_affine_op_args_for_decomposed_ops(q_input) + if not (is_node(q_input_args[1]) and is_node(q_input_args[2])): + # expected to find getitem node from choose qparam + return (False, []) + + getitem1 = q_input_args[1] + getitem2 = q_input_args[2] + + if not (is_getitem(getitem1) and is_getitem(getitem2)): + # expected getitem node from choose qparam + return (False, []) + + gemm_deps.extend([getitem1, getitem2]) + choose_qparam = get_input_node(getitem1, 0) + if not is_qparam(choose_qparam): + # expected to find choose_qparam node + return (False, []) + gemm_deps.append(choose_qparam) + return (True, gemm_deps) + + +class LinearConfig(GEMMConfig): + target_name = "linear.default" + + def __init__(self, **kwargs): + super().__init__( + weight_idx=1, + bias_idx=2, + act_idx=0, + fused_acts=["relu.default", "hardtanh.default"], + **kwargs, + ) + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + return torch.ops.aten.linear.default + + def _get_weight_deps( + self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType + ) -> Tuple[bool, List[torch.fx.Node]]: + if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: + # if force fp32_dynamic_linear is on and we detected this as fp32, then we + # do not partition the weight node + return (True, []) + + return super()._get_weight_deps(node, ep, precision) + + def supported_precision_types(self): + return [ + ConfigPrecisionType.DYNAMIC_QUANT, + ConfigPrecisionType.FP32, + ConfigPrecisionType.STATIC_QUANT, + ] + + +class ConvolutionConfig(GEMMConfig): + target_name = "convolution.default" + + def __init__(self, **kwargs): + super().__init__( + weight_idx=1, + bias_idx=2, + act_idx=0, + fused_acts=["relu.default", "hardtanh.default"], + **kwargs, + ) + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + """ + Currently we have no support for convolution 3d and transposed convolution + """ + if not super().check_constraints(node, ep): + return False + + conv_stride = cast(List[int], node.args[3]) + if len(conv_stride) > 2: + why(node, "Only support 1D + 2D Conv") + return False # Only support 1D + 2D Conv + + transposed = cast(bool, node.args[6]) + if transposed: + why(node, "Transposed Conv is not supported") + return False # Currently don't support transposed conv + + return True + + def supported_precision_types(self): + return [ + ConfigPrecisionType.FP32, + ConfigPrecisionType.STATIC_QUANT, + ] + + +class AddmmConfig(GEMMConfig): + """ + We will handle the legacy form of addmm partitioning which will include + partitioning using source partitions. 
+ """ + + target_name = "addmm.default" + + def __init__(self, **kwargs): + super().__init__( + weight_idx=2, + bias_idx=0, + act_idx=1, + fused_acts=["relu.default", "hardtanh.default"], + **kwargs, + ) + self.src_partitions = None + self.linear_modules = [torch.nn.functional.linear, torch.nn.Linear] + + def get_deps( + self, + node: torch.fx.Node, + ep: ExportedProgram, + ) -> Tuple[bool, List[torch.fx.Node]]: + """ + Gets all dependencies for this gemm partition. Returns a tuple of + a bool indicating if the deps are valid and a list of all the + dep nodes. This handles the src partition for + """ + if self.src_partitions is None: + # Cache src partitions so we don't have to recompute them every time + self.src_partitions = get_source_partitions(ep.graph, self.linear_modules) + + # src_partition is None if node is not in source partition, + # otherwise gives us the linear source partition it belongs to + src_partition = None + for partition_list in self.src_partitions.values(): + for partition in partition_list: + if node in partition.nodes: + src_partition = partition + + if src_partition: + # if addmm belongs to linear src partition, then partition the + # src partition and get its deps + return self.get_deps_from_src_partition(node, ep, src_partition) + + return super().get_deps(node, ep) + + def get_deps_from_src_partition( + self, node: torch.fx.Node, ep: ExportedProgram, src_partition: SourcePartition + ): + """ + Gets all the dependencies for the src partition. This is done by simulating the + linear node from the src partition. We find the associated weights, act, bias + from the linear src partition, and plug those in as the addmm node's args. We also + take the users of the src partitions output node as the addmm node's users. Finally + we just run the GEMMConfig's get_deps method no this faked linear node. After + getting the deps, we return the addmm nodes users and args back. 
+ """ + + def find_partition_args(input_node): + while ( + len(input_node.all_input_nodes) != 0 + and input_node not in src_partition.input_nodes + ): + input_node = input_node.all_input_nodes[0] + return input_node + + old_args, old_users = node.args, node.users + + fake_args = [] + for arg in node.args: + # map addmm's args to the source partition's inputs + # basically simulating what the args of the linear node would be + fake_args.append(find_partition_args(arg)) + + # validate source partition + if ( + # bias must be in source partition + (self.bias_idx and fake_args[self.bias_idx] not in src_partition.nodes) + # activation input must be an input node to partition + or fake_args[self.act_idx] not in src_partition.input_nodes + # weight can either be in the nodes or input_nodes + or fake_args[self.weight_idx] + not in (src_partition.nodes + src_partition.input_nodes) + # there can only be a single output node in partition + or len(src_partition.output_nodes) != 1 + ): + return (False, []) + + # map addmm's args to the source partition linear's inputs and users + node.args = tuple(fake_args) + node.users = src_partition.output_nodes[0].users + valid_deps, deps = super().get_deps(node, ep) + + # Reset addmm node back to old args and users + node.args = old_args + node.users = old_users + + return valid_deps, list(set(deps) | set(src_partition.nodes)) + + def supported_precision_types(self): + return [ + ConfigPrecisionType.FP32, + ConfigPrecisionType.STATIC_QUANT, + ConfigPrecisionType.DYNAMIC_QUANT, + ] + + +class MMConfig(AddmmConfig): + target_name = "mm.default" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.bias_idx = None + self.weight_idx = 1 + self.act_idx = 0 + + def supported_precision_types(self): + return [ + ConfigPrecisionType.FP32, + ConfigPrecisionType.STATIC_QUANT, + ConfigPrecisionType.DYNAMIC_QUANT, + ] diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py new file mode 100644 index 0000000000..b95d7c5b89 --- /dev/null +++ b/backends/xnnpack/partition/config/generic_node_configs.py @@ -0,0 +1,512 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
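# The GEMM configs above differ mainly in which argument positions hold the
# activation, weight, and bias. A quick-reference sketch of the index mapping
# encoded in the constructors above (purely illustrative; it mirrors the
# weight_idx/bias_idx/act_idx kwargs and is not used by the partitioner itself):
GEMM_ARG_INDICES = {
    # target name           (act_idx, weight_idx, bias_idx)
    "linear.default":       (0, 1, 2),
    "convolution.default":  (0, 1, 2),
    "addmm.default":        (1, 2, 0),
    "mm.default":           (0, 1, None),  # mm has no bias input
}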
+ +import logging +from typing import cast, List, Optional + +import torch +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, + XNNPartitionerConfig, +) +from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant +from executorch.backends.xnnpack.utils.utils import get_input_node +from executorch.exir.backend.canonical_partitioners.config_partitioner import ( + format_target_name, +) +from executorch.exir.backend.utils import WhyNoPartition +from torch.export import ExportedProgram + +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + + +class GenericNodePartitionerConfig(XNNPartitionerConfig): + def __init__(self, fused_act: Optional[List[str]] = None, **kwargs): + """ + fused_act is a list of node target names that can be fused with this + node under quantization + """ + self.fused_acts = fused_act or [] + super().__init__(**kwargs) + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + return self.check_common_constraints(node, ep) + + def get_node_and_deps( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + deps = [node] + quantized_deps = [] + if ConfigPrecisionType.STATIC_QUANT in self.enabled_precision_types: + # try to partition dequant inputs and quant outputs if static quant is enabled + if [(is_dequant(dq_input)) for dq_input in node.all_input_nodes].count( + False + ): + # if not all inputs are dequant nodes then it isn't quantized + return deps + + quantized_deps.extend(node.all_input_nodes) + + # check if quantized pattern has fused activation + if len(node.users) != 1: + return deps + + node_output = list(node.users)[0] + if ( + node_output.op == "call_function" + and format_target_name(node_output.target.__name__) in self.fused_acts + ): + quantized_deps.append(node_output) + fused_out_users = list(node_output.users.keys()) + if len(fused_out_users) == 1: + node_output = fused_out_users[0] + + if not is_quant(node_output): + # Expected node --> fused_act (optional) --> dequant + return deps + + quantized_deps.append(node_output) + + return deps + quantized_deps + + +class QuantizedPerTensorConfig(GenericNodePartitionerConfig): + target_name = "quantize_per_tensor.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.STATIC_QUANT] + + +class DeQuantizedPerTensorConfig(GenericNodePartitionerConfig): + target_name = "dequantize_per_tensor.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.STATIC_QUANT] + + +class HardtanhConfig(GenericNodePartitionerConfig): + target_name = "hardtanh.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class AddConfig(GenericNodePartitionerConfig): + target_name = "add.Tensor" + + def __init__(self, **kwargs): + super().__init__(fused_act=["relu.default"], **kwargs) + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class ReLUConfig(GenericNodePartitionerConfig): + target_name = "relu.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class AbsConfig(GenericNodePartitionerConfig): + target_name = "abs.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return 
[ConfigPrecisionType.FP32] + + +class AvgPoolingConfig(GenericNodePartitionerConfig): + target_name = "avg_pool2d.default" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + """ + XNNPACK does not support ceil_mode = True and count_include_pad = True + Additionally, we only support divisor_override if divisor_override = pooling region + """ + if not self.check_common_constraints(node, ep): + return False + + args = node.args + + ceil_mode = False # default is False + if len(args) >= 5: + ceil_mode = cast(bool, args[4]) + + count_include_pad = True # default is True + if len(args) >= 6: + count_include_pad = cast(bool, args[5]) + + kernel_size = cast(List[int], args[1]) + pooling_region = kernel_size[0] * kernel_size[1] + divisor_override = pooling_region # Default divisor is pooling_region + if len(args) >= 7: + divisor_override = cast(int, args[6]) + + if ceil_mode: + why(node, reason="ceil mode is not supported") + return False + + if count_include_pad: + why( + node, + reason="zero-padding in the averaging calculation is not supported", + ) + return False + + if divisor_override != pooling_region: + why(node, reason="divisor override is not supported") + return False + + return True + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class CatConfig(GenericNodePartitionerConfig): + target_name = "cat.default" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + """ + Only support concatenation of 2 - 4 tensors + """ + if not self.check_common_constraints(node, ep): + return False + + num_tensors = len(node.all_input_nodes) + + if not (num_tensors >= 2 and num_tensors <= 4): + why( + node, + reason=f"only support concatenation of 2 - 4 tensors, got {num_tensors} tensors", + ) + return False + + return True + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class CeilConfig(GenericNodePartitionerConfig): + target_name = "ceil.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class ClampConfig(GenericNodePartitionerConfig): + target_name = "clamp.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class DivConfig(GenericNodePartitionerConfig): + target_name = "div.Tensor" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class EluConfig(GenericNodePartitionerConfig): + target_name = "elu.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + return torch.ops.aten.elu.default + + +class SoftmaxConfig(GenericNodePartitionerConfig): + target_name = "_softmax.default" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + """ + Check that dim is always the last dim + """ + if not self.check_common_constraints(node, ep): + return False + + dim = cast(int, node.args[1]) + node_input = node.all_input_nodes[0] + tensor_dims = node_input.meta["val"].dim() + + if not (dim == -1 or dim == tensor_dims - 1): + why( + node, + reason=f"dim must be the last dim, got dim = {dim} for tensor of rank {tensor_dims}", + ) + return False + return True + + def 
supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class PermuteConfig(GenericNodePartitionerConfig): + target_name = "permute_copy.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class SigmoidConfig(GenericNodePartitionerConfig): + target_name = "sigmoid.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class MulConfig(GenericNodePartitionerConfig): + target_name = "mul.Tensor" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class MaximumConfig(GenericNodePartitionerConfig): + target_name = "maximum.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class MaxPool2dConfig(GenericNodePartitionerConfig): + target_name = "max_pool2d.default" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + """ + XNNPACK's maxpool2d does not support ceil mode + """ + if not self.check_common_constraints(node, ep): + return False + + is_ceil_mode = len(node.args) >= 6 and cast(bool, node.args[5]) + if is_ceil_mode: + why(node, reason="ceil mode is not supported") + return False + return True + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + return torch.ops.aten.max_pool2d.default + + +class UpsampleBilinear2dConfig(GenericNodePartitionerConfig): + target_name = "upsample_bilinear2d.vec" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + return torch.ops.aten.upsample_bilinear2d.vec + + +class FloorConfig(GenericNodePartitionerConfig): + target_name = "floor.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class HardswishConfig(GenericNodePartitionerConfig): + target_name = "hardswish.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class LeakyReLUConfig(GenericNodePartitionerConfig): + target_name = "leaky_relu.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class MeanDimConfig(GenericNodePartitionerConfig): + target_name = "mean.dim" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + """ + Mean Dim currently only supports averaging 4D tensors across the innermost + dimensions + """ + if not self.check_common_constraints(node, ep): + return False + + dims = node.args[1] + output_dims = node.meta["val"].dim() + + if dims not in ([-2, -1], [-1, -2]): + why( + node, + reason="mean.dim only supports averaging 4D tensors across the innermost dimensions", + ) + return False + + if output_dims != 4: + why( + node, + reason=f"mean.dim only supports averaging 4D tensors, got tensor of rank {output_dims}", + ) + return False + return True + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class MinimumConfig(GenericNodePartitionerConfig): + target_name = "minimum.default" + + def 
supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class NegConfig(GenericNodePartitionerConfig): + target_name = "neg.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class PowConfig(GenericNodePartitionerConfig): + target_name = "pow.Tensor_Scalar" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + """ + Only support powers of two + """ + if not self.check_common_constraints(node, ep): + return False + + power = node.args[1] + + if not isinstance(power, int): + why(node, reason=f"only support int powers, got {power}") + return False + + if power != 2: + why(node, reason=f"only support power == 2, got {power}") + return False + return True + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class SliceCopyConfig(GenericNodePartitionerConfig): + target_name = "slice_copy.Tensor" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + """ + Support slicing with stride = 1, no zero-dim tensors, Slice isn't supported + if the input or output is dynamic + """ + if not self.check_common_constraints(node, ep): + return False + + stride = 1 + if len(node.args) > 4: + stride = cast(int, node.args[4]) + + if stride != 1: + return False + + input_node = get_input_node(node, 0) + output_node = node + + input_shape = list(input_node.meta["val"].shape) + output_shape = list(output_node.meta["val"].shape) + + for dim in input_shape: + if not isinstance(dim, int) or dim == 0: + why( + node, + reason=f"input tensor has invalid shape, dim: {dim} of type {type(dim)}. Expecting non-zero, int values.", + ) + return False + + for dim in output_shape: + if not isinstance(dim, int) or dim == 0: + why( + node, + reason=f"output tensor has invalid shape, dim: {dim} of type {type(dim)}. 
Expecting non-zero, int values.", + ) + return False + + return True + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class SquareRootConfig(GenericNodePartitionerConfig): + target_name = "sqrt.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class ConstantPadConfig(GenericNodePartitionerConfig): + target_name = "constant_pad_nd.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class SubConfig(GenericNodePartitionerConfig): + target_name = "sub.Tensor" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] + + +class BMMConfig(GenericNodePartitionerConfig): + """ + Despite being a GEMM Kernel, BMM Can be partitioned like a single node partitioner + because it does not perform any packing on the inputs being matrix multiplied + """ + + target_name = "bmm.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class SDPAConfig(GenericNodePartitionerConfig): + target_name = "scaled_dot_product_attention.default" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + """ + Requires Mask to have Rank 2 + """ + if not self.check_common_constraints(node, ep): + return False + + if len(node.all_input_nodes) < 4: + return False + mask_node = node.all_input_nodes[3] + mask_rank = mask_node.meta["val"].dim() + if mask_rank != 2: + why( + node, + reason=f"mask must have rank 2, got mask of rank {mask_rank}", + ) + return False + + return True + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + return torch.ops.aten.scaled_dot_product_attention.default + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] diff --git a/backends/xnnpack/partition/config/node_configs.py b/backends/xnnpack/partition/config/node_configs.py new file mode 100644 index 0000000000..2449d9d644 --- /dev/null +++ b/backends/xnnpack/partition/config/node_configs.py @@ -0,0 +1,150 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
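# The generic configs above all follow the same shape: a target_name, an optional
# check_constraints override that logs a reason via why(...) on rejection, and a
# list of supported precision types. A hedged sketch of a hypothetical config
# written in that style (TanhConfig is not part of this patch; it only illustrates
# the pattern, using the module paths introduced in this diff):
from typing import List

import torch
from executorch.backends.xnnpack.partition.config.generic_node_configs import (
    GenericNodePartitionerConfig,
)
from executorch.backends.xnnpack.partition.config.xnnpack_config import (
    ConfigPrecisionType,
)
from torch.export import ExportedProgram


class TanhConfig(GenericNodePartitionerConfig):
    target_name = "tanh.default"

    def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
        # only the common dtype / enabled-precision checks apply here
        return self.check_common_constraints(node, ep)

    def supported_precision_types(self) -> List[ConfigPrecisionType]:
        return [ConfigPrecisionType.FP32]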
+ +import logging +import operator +from typing import List, Optional + +import torch +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, + XNNPartitionerConfig, +) +from executorch.backends.xnnpack.passes.fuse_batch_norm_with_conv import ( + FuseBatchNormWithConvPass, +) +from executorch.backends.xnnpack.utils.utils import is_param_node +from executorch.exir.backend.canonical_partitioners.config_partitioner import ( + format_target_name, +) +from executorch.exir.backend.utils import WhyNoPartition +from torch.export import ExportedProgram + +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + + +class BatchNormConfig(XNNPartitionerConfig): + target_name = "_native_batch_norm_legit_no_training.default" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + if not self.check_common_constraints(node, ep): + return False + + bn = node + conv = node.all_input_nodes[0] + + if conv.op != "call_function": + return False + + conv_name = format_target_name(conv.target.__name__) # pyre-ignore + + if conv_name not in ["convolution.default"]: + why(node, f"Invalid conv target {conv_name}") + return False + + can_fuse = FuseBatchNormWithConvPass.can_fuse(conv, bn, ep) + if not can_fuse: + why(node, "BatchNorm cannot be fused with Convolution") + return False + + return True + + def get_node_and_deps( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + deps = [node] + + # weight, bias, running_mean, running_var + deps.extend(node.all_input_nodes[1:5]) + + # All the users of batchnorm node must be getitem ops. batchnorm + # returns a 3-element tuple. Each user must only access the first + # element of the tuple. + if [ + (user.target == operator.getitem and user.args[1] == 0) + for user in node.users + ].count(False): + return [] + + deps.extend(list(node.users.keys())) + return deps + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class MaxDimConfig(XNNPartitionerConfig): + target_name = "max.dim" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + # We support max_dim as long as we don't return indices + supported_dtypes = {torch.float32, torch.float16, torch.int8, torch.qint8} + node_val = node.meta.get("val") + output_0 = node_val[0] + + input_node = node.all_input_nodes[0] + if len(input_node.meta.get("val").shape) != 4: + why(node, f"Unsupported input rank {input_node.meta.get('val').shape}") + return False + # Don't check indicies dtype + if output_0.dtype not in supported_dtypes: + why(node, f"Unsupported output dtype {output_0.dtype}") + return False + + max_input = node.all_input_nodes[0] + if max_input.meta.get("val").dtype not in supported_dtypes: + why(node, f"Unsupported input dtype {max_input.meta.get('val').dtype}") + return False + + # Make sure that all users are getitems of the first output + for user in node.users: + if not (user.target == operator.getitem and user.args[1] == 0): + why(node, "Unsupported user of max.dim") + return False + + return True + + def get_node_and_deps( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + getitems = list(node.users) + + return [node] + getitems + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + return None + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + +class PreluConfig(XNNPartitionerConfig): + target_name = 
"prelu.default" + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + if not self.check_common_constraints(node, ep): + return False + + weight = node.all_input_nodes[1] + is_param = is_param_node(ep, weight) + if not is_param: + why(node, "Prelu weight must be a parameter") + return False + return True + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + return torch.ops.aten.prelu.default + + def get_node_and_deps( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + weight = node.all_input_nodes[1] + + return [node, weight] + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] diff --git a/backends/xnnpack/partition/config/quant_affine_configs.py b/backends/xnnpack/partition/config/quant_affine_configs.py new file mode 100644 index 0000000000..d9e789104b --- /dev/null +++ b/backends/xnnpack/partition/config/quant_affine_configs.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional + +import torch +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, + XNNPartitionerConfig, +) +from torch.export import ExportedProgram + + +class QDQAffineConfigs(XNNPartitionerConfig): + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + return True + + def get_node_and_deps( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + # Do not return anything from this because we only use this to + # preserve the decomposition + return [] + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.DYNAMIC_QUANT] + + +class QuantizeAffineConfig(QDQAffineConfigs): + target_name = "quantize_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.quantize_affine.default + except: + return None + + +class DeQuantizeAffineConfig(QDQAffineConfigs): + target_name = "dequantize_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.dequantize_affine.default + except: + return None + + +class ChooseQParamsAffineConfig(QDQAffineConfigs): + target_name = "choose_qparams_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.choose_qparams_affine.default + except: + return None diff --git a/backends/xnnpack/partition/config/xnnpack_config.py b/backends/xnnpack/partition/config/xnnpack_config.py new file mode 100644 index 0000000000..d261416a76 --- /dev/null +++ b/backends/xnnpack/partition/config/xnnpack_config.py @@ -0,0 +1,211 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +from abc import abstractmethod +from enum import Enum +from typing import List, Optional + +import torch +from executorch.exir.backend.canonical_partitioners.config_partitioner import ( + format_target_name, + PartitionerConfig, +) +from executorch.exir.backend.utils import WhyNoPartition +from torch.export import ExportedProgram + +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + + +class ConfigPrecisionType(Enum): + FP32 = 1 + STATIC_QUANT = 2 + DYNAMIC_QUANT = 3 + + +class XNNPartitionerConfig(PartitionerConfig): + """ + Base partitioner config for XNNPACK Partitioner Configs. Base wrapper class + for all XNNPACK Partitioner Configs allows us to apply control over + all PartitionerConfigs. XNNPACK Partitioner config also sets a property + for supported precision types. This allows partitioner configs to set + the precision types they support, and let users toggle which precision + types they want to enable + """ + + def __init__(self, **kwargs): + super().__init__() + self.enabled_precision_types = self.supported_precision_types() + # Flag used in GEMMConfig() + self.force_fp32_dynamic_linear = kwargs.get("force_fp32_dynamic_linear", False) + + def get_partition( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + """ + Overriding abstract method get_partition. + + Returns the partitioned nodes from get_node_and_deps, but also labels them + with the name of the XNNPartitionerConfig class which return this set of nodes. + This enforces that all partitions returned by XNNPartitioner configs are labeled + with the partitioner config which returned them + """ + partitioned_nodes = self.get_node_and_deps(node, ep) + # label partitioned nodes with the name of the partitioner config + for node in partitioned_nodes: + if "xnn_partitioner_config" in node.meta: + node.meta["xnn_partitioner_config"].append(self.__class__.__name__) + else: + node.meta["xnn_partitioner_config"] = [self.__class__.__name__] + + return partitioned_nodes + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + # By default if not specified, we do not halt decomposition for those configs + return None + + @abstractmethod + def supported_precision_types(self) -> List[ConfigPrecisionType]: + """ + Returns the supported PrecisionType of this partitioner config + """ + pass + + @abstractmethod + def get_node_and_deps( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + """ + Takes in a node and its exported program and returns a list of nodes + and its dependencies that need to be partitioned together + + Args: + node: Node to be partitioned + ep: Exported program of the graph module + Returns: + List of nodes that can be partitioned + """ + pass + + def set_enabled_precision_types( + self, precision_types: Optional[List[ConfigPrecisionType]] + ): + """ + Set the enabled precisions. + + We take the intersection of the precision_types we wish to enable with + the precision types that this config supports. If enabled_precisions is empty, i.e. 
+ the config does not support any of the precision types we want to enable, + then we will not partition nothing and return false at the common constraints + """ + + if precision_types: + enabled_precisions = [] + for precision in precision_types: + if precision in self.supported_precision_types(): + enabled_precisions.append(precision) + + self.enabled_precision_types = enabled_precisions + + def check_common_constraints( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> bool: + """ + Checks common xnnpack constraints + + Args: + node (torch.fx.Node): Node to check common constraints against + ep (ExportedProgram): Exported Program to check constraints against + + Returns: + True or False whether this node is partitionable + """ + assert ( + node.op == "call_function" + and format_target_name(node.target.__name__) # pyre-ignore + == self.target_name + ) + + if len(self.enabled_precision_types) == 0: + why(node, reason="not enabled precision types") + return False + + has_valid_dtypes = self._check_node_has_valid_dtype(node) + if not has_valid_dtypes: + why(node, reason="invalid dtype") + return False + + return True + + def _check_inputs_are_valid_dtypes(self, node, valid_dtypes): + # Check inputs are valid dtypes + # Gather all args which are nodes + args_to_check = [] + for arg in node.args: + if isinstance(arg, list) or isinstance(arg, tuple): + for item in arg: + if isinstance(item, torch.fx.Node): + args_to_check.append(item) + + if isinstance(arg, torch.fx.Node): + args_to_check.append(arg) + + for arg in args_to_check: + arg_val = arg.meta.get("val", None) + + if arg_val is None or isinstance(arg_val, tuple): + continue + + # Being conservative for now, UX >> Perf + # TODO: We need a pass to scrub these out. + if not isinstance(arg_val, torch.Tensor): + return False + + # XNNPACK does not support empty tensors + if arg_val.numel() == 0: + return False + + if arg_val.dtype not in valid_dtypes: + return False + + return True + + def _check_outputs_are_valid_dtypes(self, node, valid_dtypes): + # Check outputs are valid dtype + node_val = node.meta.get("val", None) + if node_val is None: + return True + + if not isinstance(node_val, tuple): + node_val = (node_val,) + + for val in node_val: + if not isinstance(val, torch.Tensor): + return False + + if val.dtype not in valid_dtypes: + return False + + return True + + def _check_node_has_valid_dtype(self, node): + valid_dtypes = { + torch.float32, + torch.float16, + torch.int8, + torch.qint8, + } + if ( + node.op != "placeholder" + and node.op != "call_function" + and node.op != "get_attr" + ): + return False + + return self._check_inputs_are_valid_dtypes( + node, valid_dtypes + ) and self._check_outputs_are_valid_dtypes(node, valid_dtypes) diff --git a/backends/xnnpack/partition/configs.py b/backends/xnnpack/partition/configs.py index 46e0e3b4f0..2629695518 100644 --- a/backends/xnnpack/partition/configs.py +++ b/backends/xnnpack/partition/configs.py @@ -101,6 +101,8 @@ exir_ops.edge.aten.addmm.default, # TODO(T163877189) add constraint for addmm ] +# This set is used to determine if an op is a supported Quantized Op. This is +# used to determine whether a quantization op is implicit or explicit. 
SUPPORTED_IMPLICIT_Q_DQ_OP_NAMES_SET = { op.name() for op in ( @@ -108,6 +110,7 @@ + [ exir_ops.edge.aten._to_copy.default, exir_ops.edge.aten.linear.default, + exir_ops.edge.aten.convolution.default, ] ) } diff --git a/backends/xnnpack/partition/graphs/bilinear_2d.py b/backends/xnnpack/partition/graphs/bilinear_2d.py index a971cb9244..0040439f84 100644 --- a/backends/xnnpack/partition/graphs/bilinear_2d.py +++ b/backends/xnnpack/partition/graphs/bilinear_2d.py @@ -37,12 +37,15 @@ def forward(self, x): ] for align_corners in [True, False]: for config in capture_configs: - edge = exir.capture( - bilinear2d(align_corners), sample_inputs, config - ).to_edge( - config=get_xnnpack_edge_compile_config(), - ) - _bilinear2d_graphs[edge.exported_program.graph_module] = align_corners + for skip_dim_order_flag in [True, False]: + edge = exir.capture( + bilinear2d(align_corners), sample_inputs, config + ).to_edge( + config=get_xnnpack_edge_compile_config( + skip_dim_order=skip_dim_order_flag + ) + ) + _bilinear2d_graphs[edge.exported_program.graph_module] = align_corners return _bilinear2d_graphs diff --git a/backends/xnnpack/partition/xnnpack_partitioner.py b/backends/xnnpack/partition/xnnpack_partitioner.py index 6cee53b86e..700c7d1b75 100644 --- a/backends/xnnpack/partition/xnnpack_partitioner.py +++ b/backends/xnnpack/partition/xnnpack_partitioner.py @@ -5,1201 +5,115 @@ # LICENSE file in the root directory of this source tree. import itertools -import logging -import operator -from typing import Any, Callable, cast, Dict, List, Optional, Set, Union - -import torch -from executorch.backends.xnnpack.partition.configs import ( - STATIC_MODULES, - STATIC_OPS, - SUPPORTED_DYN_QUANT_LINEAR_MODULES, - SUPPORTED_DYN_QUANT_MODULES, - SUPPORTED_MODULES, - SUPPORTED_OPS, - SUPPORTED_QUANT_MODULES, - SUPPORTED_QUANT_OPS, - UNSUPPORTED_QUANT_MODULES, -) -from executorch.backends.xnnpack.partition.graphs import bilinear_2d, sdpa -from executorch.backends.xnnpack.passes.fuse_batch_norm_with_conv import ( - FuseBatchNormWithConvPass, -) -from executorch.backends.xnnpack.utils.quant_utils import is_dequant -from executorch.backends.xnnpack.utils.utils import ( - get_input_node, - get_source_fn, - is_param_node, -) -from executorch.backends.xnnpack.xnnpack_preprocess import XnnpackBackend +import logging +from typing import List, Optional, Type, Union -from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( - generate_partitions_from_list_of_nodes, - generate_pattern_op_partitions, +from executorch.backends.xnnpack.partition.config import ALL_PARTITIONER_CONFIGS +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, + XNNPartitionerConfig, ) -from executorch.exir.backend.compile_spec_schema import CompileSpec -from executorch.exir.backend.partitioner import ( - DelegationSpec, - Partitioner, - PartitionResult, +from executorch.backends.xnnpack.xnnpack_preprocess import XnnpackBackend +from executorch.exir.backend.backend_details import ExportedProgram +from executorch.exir.backend.canonical_partitioners.config_partitioner import ( + ConfigerationBasedPartitioner, ) -from executorch.exir.dialects._ops import ops as exir_ops -from torch.export import ExportedProgram +from executorch.exir.backend.partitioner import DelegationSpec from torch.fx.passes.infra.partitioner import Partition -from torch.fx.passes.operator_support import OperatorSupportBase - -from torch.fx.passes.utils.source_matcher_utils import ( - get_source_partitions, - 
SourcePartition, -) - -logging.basicConfig(level=logging.INFO) -log = logging.getLogger(__name__) - -# TODO - Remove asserts - partitioner shouldn't assert, just not partition that part of the graph - -""" -Per op Constraints ------------------- - -These constraints are used to filter out nodes from a partition when specific -conditions are not met. Indirectly, they specify constrains under which a node -should be lowerable to XNNPACK. If a constraint is not specified here, we will -always lower it. Nodes inside a decomposed subgraph i.e. linear subgraph will -also get test. -Interface: Callable[[torch.fx.Node], bool] +logging.basicConfig(level=logging.WARNING) +logger = logging.getLogger(__name__) -Note: Constraint fns are shared for both module based, op support based and -graph based (for now) partitioner implementations. Given that these stem from -XNNPACK limitations it should be ok to share the same constraint across both. -For module based partitioner - if any node fails to qualify, we discard that -instance of the module. - -Don't update this global dict directly. It is updated through decorator -`XnnpackOperatorSupport._constraint` -""" -_OP_SUPPORT_CONSTRAINTS = {} - - -class XnnpackOperatorSupport(OperatorSupportBase): +class XnnpackPartitioner(ConfigerationBasedPartitioner): def __init__( self, - ep: ExportedProgram, - constraints_dict: Dict[ - Any, Callable[[torch.fx.Node], bool] - ] = _OP_SUPPORT_CONSTRAINTS, - supported_ops: Optional[List] = None, - unsupported_modules: Optional[List] = None, + configs: Optional[List[Type[XNNPartitionerConfig]]] = None, + config_precisions: Optional[ + Union[ConfigPrecisionType, List[ConfigPrecisionType]] + ] = None, + per_op_mode=False, + verbose: bool = False, + **kwargs, ): """ - @Arg constraints_dict: Dict mapping each node to a lambda function that - returns True if backend constraints are met for that instance of the - node. - @Arg supported_ops: List of supported operators for partitioning - """ - self.unsupported_modules = unsupported_modules - self.supported_ops = supported_ops - self.constraints = constraints_dict - self.ep = ep - self.nodes_with_packed_weights = { - exir_ops.edge.aten.convolution.default, - exir_ops.edge.aten.addmm.default, - exir_ops.edge.aten.mm.default, - exir_ops.edge.aten.bmm.default, - } - assert len(self.constraints) - - def _check_inputs_are_valid_dtypes(self, node, valid_dtypes): - # Check inputs are valid dtypes - - # Gather all args which are nodes - args_to_check = [] - for arg in node.args: - if isinstance(arg, list) or isinstance(arg, tuple): - for item in arg: - if isinstance(item, torch.fx.Node): - args_to_check.append(item) - - if isinstance(arg, torch.fx.Node): - args_to_check.append(arg) - - for arg in args_to_check: - arg_val = arg.meta.get("val", None) - - if arg_val is None or isinstance(arg_val, tuple): - continue - - # Being conservative for now, UX >> Perf - # TODO: We need a pass to scrub these out. 
- if not isinstance(arg_val, torch.Tensor): - return False - - # XNNPACK does not support empty tensors - if arg_val.numel() == 0: - return False - - if arg_val.dtype not in valid_dtypes: - return False - - return True - - def _check_outputs_are_valid_dtypes(self, node, valid_dtypes): - # Check outputs are valid dtype - node_val = node.meta.get("val", None) - if node_val is None: - return True - - if not isinstance(node_val, tuple): - node_val = (node_val,) - - for val in node_val: - if not isinstance(val, torch.Tensor): - return False - - if val.dtype not in valid_dtypes: - return False - - return True - - def check_node_has_valid_dtype(self, node): - # max_pool2d_with_indicies returns indicies which is int64 - # this is supportable within XNNPACK - if node.target in {exir_ops.edge.aten.max_pool2d_with_indices.default}: - return True - - valid_dtypes = { - torch.float32, - torch.float16, - torch.int8, - torch.qint8, - } - if ( - node.op != "placeholder" - and node.op != "call_function" - and node.op != "get_attr" - ): - return False - - return self._check_inputs_are_valid_dtypes( - node, valid_dtypes - ) and self._check_outputs_are_valid_dtypes(node, valid_dtypes) - - def check_common_constraints(self, node) -> bool: - has_valid_dtypes = self.check_node_has_valid_dtype(node) - - return has_valid_dtypes - - @staticmethod - def check_constraint(node, ep) -> bool: - """ - This node is from a partitioned subgraph by one of the partitioners so - should be a valid node otherwise, let's make sure the constraint is met - if specified - """ - return _OP_SUPPORT_CONSTRAINTS.get(node.target, lambda node, ep: True)(node, ep) - - def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: - # Parameters are supported if any of their users are supported - if is_param_node(self.ep, node): - for user in node.users.keys(): - user_of_param = user - if is_dequant(user): - user_of_param = list(user.users.keys())[0] - if ( - self.is_node_supported(submodules, user_of_param) - and user_of_param.target in self.nodes_with_packed_weights - ): - return True - - return False - - # TODO - other ops? - if node.op != "call_function": - return False - - # Specifying supported ops is optional - if self.supported_ops and node.target not in self.supported_ops: - return False - - return self.check_constraint(node, self.ep) and self.check_common_constraints( - node - ) - - @staticmethod - def _constraint(target): # noqa - """ - Decorator to register a constraint fn for a node + @verbose: if True, print out more information about the partitioner. + Default level is WARNING. If verbose is True, level is set to DEBUG. """ + if verbose: + logger.setLevel(logging.DEBUG) + logger.debug("Verbose logging enabled for XNNPACK partitioner.") - def register(func: Callable[[torch.fx.Node, ExportedProgram], bool]): - """ - Pass through registration for the constraint fn - """ - _OP_SUPPORT_CONSTRAINTS[target] = func - return staticmethod(func) - - return register - - """ - Define per op constraints functions below + delegation_spec = DelegationSpec(XnnpackBackend.__name__, []) + configs_to_use = configs or ALL_PARTITIONER_CONFIGS + # Can do logic and have extra args to filter/delete/select + # Certain configs based on user specification + initialized_configs = [] + if isinstance(config_precisions, ConfigPrecisionType): + config_precisions = [config_precisions] - These constraint functions are staticmethods, which are registered through - the decorator in a global dict. And called through `check_constraint()` - method. 
These are not directly related to the class or the class instance - but they are logically connected. - - Marked as `noqa` because Flake doesn't understand the staticmethod tag and - complains about self not being the first arg. - """ - - @_constraint(exir_ops.edge.aten.mean.dim) - def mean_dim(node: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - """ - Only select 2d cases are supported by XNNPACK - """ - dims = node.args[1] - return dims in ([-2, -1], [-1, -2]) + for config in configs_to_use: + # Config Classes given to XnnpackPartitioner should no longer be abstract + initialized = config(**kwargs) # pyre-ignore + initialized.set_enabled_precision_types(config_precisions) + initialized_configs.append(initialized) - @_constraint(exir_ops.edge.aten.max_pool2d_with_indices.default) - def maxpool2d_with_indices( - node: torch.fx.Node, ep: ExportedProgram # noqa - ) -> bool: - """ - Only if the first output value is consumed in the graph - and it is not in ceil mode - """ - users = list(node.users.keys()) - is_ceil_mode = len(node.args) >= 6 and node.args[5] - return ( - True - if len(users) == 1 - and users[0].target == operator.getitem - and users[0].args[1] == 0 - and not is_ceil_mode - else False - ) + # per_op_mode takes the first match from a partitioner config, any + # subsequent matches that overlap with the first match are not partitioned + self.per_op_mode = per_op_mode + super().__init__(delegation_spec, initialized_configs) - @_constraint(exir_ops.edge.quantized_decomposed.quantize_per_tensor.default) - def quant_per_tensor_default(q: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa + def generate_partitions(self, ep: ExportedProgram) -> List[Partition]: """ - Decide if we want to pull this q node or not in the partition. - Given, op1 -> q -> dq -> op2 - For node q, if op1 or op2 is good, q should be good - TODO: q -> op -> dq, real q not handled right now + generate_partitions is different if partitioner is set to per_op_mode + for per_op_mode we only need to generate unmerged partitions instead + of using the default generate_partitions method. """ - if XnnpackOperatorSupport.check_constraint(q.args[0], ep): - return True + if self.per_op_mode: + return self.generate_per_op_partitions(ep) else: - dq = list(q.users.keys())[0] - op2 = list(dq.users.keys())[0] - return XnnpackOperatorSupport.check_constraint(op2, ep) - - @_constraint(exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default) - def dequant_per_tensor_default( - dq: torch.fx.Node, ep: ExportedProgram # noqa - ) -> bool: - """ - Decide if we want to pull this dq node or not. 
- """ - return XnnpackOperatorSupport.check_constraint(dq.args[0], ep) - - @_constraint(exir_ops.edge.quantized_decomposed.quantize_per_channel.default) - def quant_per_channel_default( - q: torch.fx.Node, ep: ExportedProgram # noqa - ) -> bool: - return XnnpackOperatorSupport.quant_per_tensor_default(q, ep) - - @_constraint(exir_ops.edge.quantized_decomposed.dequantize_per_channel.default) - def dequant_per_channel_default( - dq: torch.fx.Node, ep: ExportedProgram # noqa - ) -> bool: - return XnnpackOperatorSupport.dequant_per_tensor_default(dq, ep) - - @_constraint(exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor) - def quant_per_tensor_tensor(q: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - return XnnpackOperatorSupport.quant_per_tensor_default(q, ep) + return super().generate_partitions(ep) - @_constraint(exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor) - def dequant_per_tensor_tensor( - dq: torch.fx.Node, ep: ExportedProgram # noqa - ) -> bool: - return XnnpackOperatorSupport.dequant_per_tensor_default(dq, ep) - - @_constraint(exir_ops.edge.quantized_decomposed.choose_qparams.tensor) - def choose_qparams_tensor(cqp: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa + def generate_per_op_partitions(self, ep: ExportedProgram) -> List[Partition]: """ - Given, cqp -> getitem -> q -> dq -> op2 - Just check q, because it will check op2 + Uses configs to generate per_op_partitions. That is no partitions are + merged together. All partitions (node + deps) returned by PartitionerConfigs + are put into their own partition. """ - getitem0 = list(cqp.users.keys())[0] - q = list(getitem0.users.keys())[0] - return XnnpackOperatorSupport.check_constraint(q, ep) - - @_constraint(exir_ops.edge.quantized_decomposed.dequantize_per_token.default) - def dequant_per_token(dq: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - node = list(dq.users.keys())[0] - assert isinstance(node, torch.fx.Node) - return ( - node.target - in [ - exir_ops.edge.aten.mm.default, - exir_ops.edge.aten.addmm.default, - ] - or get_source_fn(node) in SUPPORTED_DYN_QUANT_LINEAR_MODULES - ) - - @_constraint(exir_ops.edge.quantized_decomposed.quantize_per_token.default) - def quant_per_token(q: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - dq = list(q.users.keys())[0] - return ( - dq.target == exir_ops.edge.quantized_decomposed.dequantize_per_token.default - and XnnpackOperatorSupport.dequant_per_token(dq, ep) - ) - - @_constraint( - exir_ops.edge.quantized_decomposed.choose_qparams_per_token_asymmetric.default - ) - def choose_qparams_per_token_asymmetric( - cqp: torch.fx.Node, ep: ExportedProgram # noqa - ) -> bool: - """ - Given, cqp -> getitem -> q -> dq -> {mm, addmm} - Just check q, because it will check dq - """ - getitem0 = list(cqp.users.keys())[0] - q = list(getitem0.users.keys())[0] - return ( - q.target == exir_ops.edge.quantized_decomposed.quantize_per_token.default - and XnnpackOperatorSupport.check_constraint(q, ep) - ) - - @_constraint( - exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default - ) - def dequant_per_channel_group_default( - dq: torch.fx.Node, ep: ExportedProgram # noqa - ) -> bool: - # Currently only supported by dqlinear weights - permute_node = list(dq.users.keys())[0] - assert isinstance(permute_node, torch.fx.Node) - # We must have a transpose on [add]mm weights - if permute_node.target != exir_ops.edge.aten.permute_copy.default: - return False - mm_node = list(permute_node.users.keys())[0] - assert isinstance(mm_node, 
torch.fx.Node) - return mm_node.target in [ - exir_ops.edge.aten.mm.default, - exir_ops.edge.aten.addmm.default, - ] - - @_constraint(exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default) - def quant_per_channel_group_default( - q: torch.fx.Node, ep: ExportedProgram # noqa - ) -> bool: - # we shouldn't have this with folded quant weights but doesn't hurt to lower it - dq = list(q.users.keys())[0] - assert isinstance(dq, torch.fx.Node) - return ( - dq.target - == exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default - and XnnpackOperatorSupport.dequant_per_channel_default(dq, ep) - ) - - @_constraint(exir_ops.edge.aten.pow.Tensor_Scalar) - def pow_tensor_scalar(node: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - """ - Only supports square, when args_2 = 2 - """ - power = node.args[1] - return isinstance(power, int) and power == 2 - - @_constraint(exir_ops.edge.aten.avg_pool2d.default) - def avg_pool_2d(node: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - """ - Arguments to avg_pool2d.default node are as follows: - - input, - - kernel_size, - - stride, - - padding, - - ceil_mode, - - count_include_pad, - - divisor_override, - - XNNPACK does not support ceil_mode = True and count_include_pad = True - Additionally, we only support divisor_override if divisor_override = pooling region - """ - args = node.args - - ceil_mode = False # default is False - if len(args) >= 5: - ceil_mode = cast(bool, args[4]) - - count_include_pad = True # default is True - if len(args) >= 6: - count_include_pad = cast(bool, args[5]) - - kernel_size = cast(List[int], args[1]) - pooling_region = kernel_size[0] * kernel_size[1] - divisor_override = pooling_region # Default divisor is pooling_region - if len(args) >= 7: - divisor_override = cast(int, args[6]) - - return ( - not (ceil_mode or count_include_pad) and divisor_override == pooling_region - ) - - @_constraint(exir_ops.edge.aten._prelu_kernel.default) - def prelu(node: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - """ - Input and Weight must be 4-dimensional - """ - input_dim = cast(torch.fx.Node, node.args[0]).meta["val"].dim() - weight_dim = cast(torch.fx.Node, node.args[1]).meta["val"].dim() - return input_dim == 4 and weight_dim == 4 - - @_constraint(exir_ops.edge.aten.cat.default) - def cat(node: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - """ - Only support concatenation of 2 - 4 tensors - """ - num_tensors = len(cast(List[torch.fx.Node], node.args[0])) - return num_tensors >= 2 and num_tensors <= 4 - - @_constraint(exir_ops.edge.aten.slice_copy.Tensor) - def slice_copy(node: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - """ - Support slicing with stride = 1, no zero-dim tensors - """ - stride = 1 - if len(node.args) > 4: - stride = cast(int, node.args[4]) - - if stride != 1: - return False - - input_node = get_input_node(node, 0) - output_node = node - - input_shape = list(input_node.meta["val"].shape) - output_shape = list(output_node.meta["val"].shape) - - for dim in input_shape: - if dim == 0: - return False - - for dim in output_shape: - if dim == 0: - return False - - return True - - @_constraint(exir_ops.edge.aten.amax.default) - def amax(node: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - """ - A: Only with keep_dim == True - B: Only support with dim == 2 or dim == 3 - valid iff, A && B - """ - is_keep_dim = (len(node.args) == 3) and (cast(bool, node.args[3]) is True) - dim_arg_val = cast(int, node.args[1]) - return is_keep_dim and (dim_arg_val == 2 or dim_arg_val 
== 3) - - @_constraint(exir_ops.edge.aten._native_batch_norm_legit_no_training.default) - def batch_norm(node: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa - """ - Only support batch norms that can be fused with convolutions. - This will be removed once standalone batch norm is supported. - """ - - # TODO(gjcomer) Remove after standalone batch norm (T171796544). - - conv_node = node.args[0] - assert isinstance(conv_node, torch.fx.Node) - - if conv_node.target != exir_ops.edge.aten.convolution.default: - return False - - return FuseBatchNormWithConvPass.can_fuse(conv_node, node, ep) - - -class XnnpackFloatingPointPartitioner(Partitioner): - """ - Module and Opname based partitioner for FP32 modules/ops listed in - SUPPORTED_MODULES and SUPPORTED_OPS. - """ - - def __init__( - self, - supported_modules: List[Callable] = SUPPORTED_MODULES, - supported_ops: Optional[List[Callable]] = SUPPORTED_OPS, - unsupported_modules: Optional[List[Callable]] = None, - ): - super().__init__() - self.supported_modules = set(supported_modules) - self.unsupported_modules = unsupported_modules - self.supported_ops = set(supported_ops or []) - - self.delegation_spec = DelegationSpec(XnnpackBackend.__name__, []) - - @staticmethod - def check_partitions(partitions: Union[dict, list]) -> bool: - """ - Warn users if there aren't any matches - - TODO: convert this into a stronger validation, may need a flag in - `to_backend()` or partitioner __init__() - """ - pl = len(partitions) - if pl == 0: - log.warning("Nothing can be partitioned!") - else: - log.info(f"Found {pl} subgraphs to be partitioned.") - return pl != 0 - - def get_input_deps( # noqa - self, input_nodes: List[torch.fx.Node], ep: ExportedProgram - ) -> List[torch.fx.Node]: - """ - For each input node, walk up and pull necessary param/attr nodes in the partition - """ - nodes = set() - - def is_param(ep: ExportedProgram, node) -> bool: - return isinstance(node, torch.fx.Node) and is_param_node(ep, node) - - for inp in input_nodes: - if is_param(ep, inp): - nodes.add(inp) - - return list(nodes) - - def get_nodes( - self, src_partition: SourcePartition, ep: ExportedProgram - ) -> List[torch.fx.Node]: - """ - Return nodes from the source partition. - - This is a wrapper to allow derived classes to add their own custom - logic to extend the src_partition nodes list. - """ - return src_partition.nodes + self.get_input_deps(src_partition.input_nodes, ep) - - def qualify_nodes( - self, input_nodes: List[torch.fx.Node], ep: ExportedProgram - ) -> bool: - """ - Each node in the module (post decomposition) must satisfy the - constraints specified for XNNPACK. - - Disqualify the whole module if one of the nodes fails to satisfy. - """ - return all( - XnnpackOperatorSupport.check_constraint(node, ep) for node in input_nodes - ) - - def get_module_partitions(self, ep: ExportedProgram) -> List[List[torch.fx.Node]]: - """ - Get all partitions in the torch.fx.GraphModule for the supported - modules. 
- """ - graph_module = ep.graph_module - src_partition_dict = get_source_partitions( - graph_module.graph, self.supported_modules - ) - all_partitions = src_partition_dict.values() - - module_partitions = [] - for src_partitions in all_partitions: - for src_partition in src_partitions: - partition_nodes = self.get_nodes(src_partition, ep) - if self.qualify_nodes(partition_nodes, ep): - module_partitions.append(partition_nodes) - - return module_partitions - - def generate_partitions(self, ep: ExportedProgram) -> List[Any]: - """ - Generate a list of partitions for an torch.fx.GraphModule. - Also pass the supported ops to match. - """ - graph_module = ep.graph_module - matched_module_nodes = self.get_module_partitions(ep) - return generate_partitions_from_list_of_nodes( - graph_module, - matched_module_nodes, - XnnpackOperatorSupport( - ep=ep, - supported_ops=self.supported_ops, - unsupported_modules=self.unsupported_modules, - ), - ) - - def tag_nodes(self, partitions: List[Partition]) -> Dict[str, DelegationSpec]: - """ - Tag each partition in the list with its delegation tag. - """ - partition_tags: Dict[str, DelegationSpec] = {} - for partition in partitions: - # Add delegation tags - for node in partition.nodes: - delegation_tag = f"tag{partition.id}" - node.meta["delegation_tag"] = delegation_tag - partition_tags[delegation_tag] = self.delegation_spec - return partition_tags - - # override - def partition(self, exported_program: ExportedProgram) -> PartitionResult: - """ - Run the partitioner on the given graph module, then tag each partition - with its delegation tag (and partition id) - """ - partitions = self.generate_partitions(exported_program) - partition_tags: Dict[str, DelegationSpec] = {} - if self.check_partitions(partitions): - partition_tags = self.tag_nodes(partitions) - return PartitionResult( - tagged_exported_program=exported_program, partition_tags=partition_tags - ) - - -# TODO: Merge XnnpackQuantizedPartitioner and XnnpackFloatingPointPartitioner -class XnnpackQuantizedPartitioner(XnnpackFloatingPointPartitioner): - """ - Module and Opname based partitioner for statically quantized modules/ops listed in SUPPORTED_QUANT_MODULES and SUPPORTED_QUANT_OPS. 
- """ - - _Q_OPS = [ - exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, - exir_ops.edge.quantized_decomposed.quantize_per_channel.default, - exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default, - exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.quantize_per_token.default, - ] - - _DQ_OPS = [ - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default, - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.dequantize_per_token.default, - ] - - _QPARAM_OPS = [ - exir_ops.edge.quantized_decomposed.choose_qparams.tensor, - exir_ops.edge.quantized_decomposed.choose_qparams_per_token_asymmetric.default, - ] - - _QUANT_OPS = _Q_OPS + _DQ_OPS + _QPARAM_OPS - - def __init__( - self, - supported_modules=SUPPORTED_QUANT_MODULES, - supported_ops=SUPPORTED_QUANT_OPS, - unsupported_modules=UNSUPPORTED_QUANT_MODULES, - ): - supported_ops = supported_ops or [] + partitions = [] + matched_nodes = self.get_matched_nodes_from_configs(ep) + partition_id = itertools.count() + nodes_seen = set() + for match in matched_nodes: + match_set = set(match) + # We only create partitions from the first PartitionerConfig match + # if a subsequent partitioner match contains the same node, we do + # not create a partition for it + if match_set.isdisjoint(nodes_seen): + partitions.append( + Partition( + id=next(partition_id), + nodes=match_set, + ) + ) + nodes_seen.update(match_set) + return partitions + + +class XnnpackDynamicallyQuantizedPartitioner(XnnpackPartitioner): + def __init__(self): super().__init__( - supported_modules, supported_ops + self._QUANT_OPS, unsupported_modules - ) - - def get_input_deps( # noqa - self, input_nodes: List[torch.fx.Node], ep: ExportedProgram - ) -> List[torch.fx.Node]: - """ - For each input node, walk up and pull necessary quant/attr nodes in the partition - """ - nodes = set() - - def is_param(ep: ExportedProgram, node) -> bool: - return isinstance(node, torch.fx.Node) and is_param_node(ep, node) - - def is_q(ep: ExportedProgram, node) -> bool: - return isinstance(node, torch.fx.Node) and node.target in self._Q_OPS - - def is_dq(ep: ExportedProgram, node) -> bool: - return isinstance(node, torch.fx.Node) and node.target in self._DQ_OPS - - def is_qparam(node) -> bool: - return isinstance(node, torch.fx.Node) and node.target in self._QPARAM_OPS - - def is_getitem(node) -> bool: - return ( - isinstance(node, torch.fx.Node) - and node.op == "call_function" - and node.target == operator.getitem - ) - - for inp in input_nodes: - if is_dq(ep, inp): - dq = inp - - # Possible graph we want to partition - # op(...) - # ^ - # | - # dq(0, 1, 2) - # ^ ^ ^ - # | | | - # q(0, 1, 2) # optional, only when not folded by the quantizer - # ^ ^ ^ - # | | | - # parameter ---------------' | | - # [choose_qparams --> get_item(s)] # optional, only with dynamic quant - # per_channel_zp* ------------' | - # per_channel_scale* ------------' - - # The dequant node - nodes.add(dq) - - # possible per_channel scale/zp for the dequant node args{1, 2} - for i in [1, 2]: - node = dq.args[i] - if is_param(ep, node): - nodes.add(node) - - # is it quant or param node? 
- prod = dq.args[0] - - assert is_q(ep, prod) or is_param( - ep, prod - ), f"Expecting quant or param node as an input to a dq node, but got {prod.target} for {prod} node" - - nodes.add(prod) - - if is_q(ep, prod): - # possible nodes for quant node args{0, 1, 2}: 0: weight, 1: scale, 2: zero_point - for i in [0, 1, 2]: - node = prod.args[i] # pyre-ignore - - # possible choose_qparam - if is_getitem(node) and is_qparam(node.args[0]): - nodes.add(node) - nodes.add(node.args[0]) - - # weights or possible per_channel scale/zp for the quant node - elif is_param(ep, node): - nodes.add(node) - return list(nodes) - - def get_output_deps( - self, output_nodes: List[torch.fx.Node], exported_program - ) -> List[torch.fx.Node]: - """ - For each output node, check all the users and insert them into the partition if needed - """ - nodes = [] - for output in output_nodes: - for node in output.users: - if node.target in self._Q_OPS: - nodes.append(node) - users = list(node.users.keys()) - for dq_user in users: - assert ( - dq_user.target in self._DQ_OPS - ), "Expecting a dq node(s) after a q node, but got target {dq_user.target} for {dq_user} node" - nodes.append(dq_user) - return nodes - - # override - def get_nodes( - self, src_partition: SourcePartition, ep: ExportedProgram - ) -> List[torch.fx.Node]: # noqa - """ - Insert quantization ops into src_partition by following the input, output node. - """ - return ( - src_partition.nodes - + self.get_input_deps(src_partition.input_nodes, ep) - + self.get_output_deps(src_partition.output_nodes, ep) - ) - - -class XnnpackPartitioner(Partitioner): - """ - Module and Opname based partitioner for FP32 modules/ops listed in - SUPPORTED_MODULES and SUPPORTED_OPS and statically quantized modules/ops listed in - SUPPORTED_QUANT_MODULES and SUPPORTED_QUANT_OPS. 
- """ - - _Q_OPS = [ - exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, - exir_ops.edge.quantized_decomposed.quantize_per_channel.default, - exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default, - exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.quantize_per_token.default, - ] - - _DQ_OPS = [ - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, - exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default, - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.dequantize_per_token.default, - ] - - _QPARAM_OPS = [ - exir_ops.edge.quantized_decomposed.choose_qparams.tensor, - ] - - _QUANT_OPS = _Q_OPS + _DQ_OPS + _QPARAM_OPS - - def __init__( - self, - *, - supported_modules: List[Callable] = SUPPORTED_MODULES, - supported_ops: Optional[List[Callable]] = SUPPORTED_OPS, - supported_quant_modules: List[Callable] = SUPPORTED_QUANT_MODULES, - supported_quant_ops: Optional[List[Callable]] = SUPPORTED_QUANT_OPS, - quant: Optional[bool] = None, - has_dynamic_shapes: bool = False, - _lower_recomposed_sdpa: Optional[bool] = True, - ): - super().__init__() - self.supported_modules = set(supported_modules) - self.supported_ops = set(supported_ops or []) - self.supported_quant_modules = set(supported_quant_modules) - - supported_quant_ops = supported_quant_ops or [] - self.supported_quant_ops = set(supported_quant_ops + self._QUANT_OPS) - - self.quant = quant - - # TODO(T174256335) - remove this once we have a better way to handle >2d Mask - self._lower_recomposed_sdpa: bool = ( - _lower_recomposed_sdpa if _lower_recomposed_sdpa is not None else True - ) - - self.delegation_spec = DelegationSpec(XnnpackBackend.__name__, []) - self.partition_tags: Dict[str, DelegationSpec] = {} - - self.has_dynamic_shapes = has_dynamic_shapes - if has_dynamic_shapes: - self.supported_ops = self.supported_ops - set(STATIC_OPS) - self.supported_modules = self.supported_modules - set(STATIC_MODULES) - self.supported_quant_ops = self.supported_quant_ops - set(STATIC_OPS) - self.supported_quant_modules = self.supported_quant_modules - set( - STATIC_MODULES - ) - - def get_supported_modules(self, quant: bool) -> Set[Callable]: - """ - Get supported modules - """ - if quant is True: - return self.supported_quant_modules - elif quant is False: - return self.supported_modules - else: - return self.supported_modules | self.supported_quant_modules - - def get_supported_ops(self, quant: Optional[bool]) -> Set[Callable]: - """ - Get supported ops - """ - if quant is True: - return self.supported_quant_ops - elif quant is False: - return self.supported_ops - else: - return self.supported_ops | self.supported_quant_ops - - @staticmethod - def check_partitions(partitions: Union[dict, list]) -> bool: - """ - Warn users if there aren't any matches - - TODO: convert this into a stronger validation, may need a flag in - `to_backend()` or partitioner __init__() - """ - pl = len(partitions) - if pl == 0: - log.warning("Nothing can be partitioned!") - else: - log.info(f"Found {pl} subgraphs to be partitioned.") - return pl != 0 - - def get_input_deps( # noqa - self, input_nodes: List[torch.fx.Node], ep: ExportedProgram - ) -> List[torch.fx.Node]: - """ - For each input node, walk up and pull necessary quant/attr nodes in the partition - """ - nodes = set() - - def is_param(ep: ExportedProgram, node) -> bool: - return 
isinstance(node, torch.fx.Node) and is_param_node(ep, node) - - def is_q(ep: ExportedProgram, node) -> bool: - return isinstance(node, torch.fx.Node) and node.target in self._Q_OPS - - def is_dq(ep: ExportedProgram, node) -> bool: - return isinstance(node, torch.fx.Node) and node.target in self._DQ_OPS - - def is_qparam(node) -> bool: - return isinstance(node, torch.fx.Node) and node.target in self._QPARAM_OPS - - def is_getitem(node) -> bool: - return ( - isinstance(node, torch.fx.Node) - and node.op == "call_function" - and node.target == operator.getitem - ) - - for inp in input_nodes: - if is_dq(ep, inp): - dq = inp - - # Possible graph we want to partition - # op(...) - # ^ - # | - # dq(0, 1, 2) - # ^ ^ ^ - # | | | - # q(0, 1, 2) # optional, only when not folded by the quantizer - # ^ ^ ^ - # | | | - # parameter ---------------' | | - # [choose_qparams --> get_item(s)] # optional, only with dynamic quant - # per_channel_zp* ------------' | - # per_channel_scale* ------------' - - # The dequant node - nodes.add(dq) - - # possible per_channel scale/zp for the dequant node args{1, 2} - for i in [1, 2]: - node = dq.args[i] - if is_param(ep, node): - nodes.add(node) - - # is it quant or param node? - prod = dq.args[0] - - assert is_q(ep, prod) or is_param( - ep, prod - ), f"Expecting quant or param node as an input to a dq node, but got {prod.target} for {prod} node" - - nodes.add(prod) - - if is_q(ep, prod): - # possible nodes for quant node args{0, 1, 2}: 0: weight, 1: scale, 2: zero_point - for i in [0, 1, 2]: - node = prod.args[i] # pyre-ignore - - # possible choose_qparam - if is_getitem(node) and is_qparam(node.args[0]): - nodes.add(node) - nodes.add(node.args[0]) - - # weights or possible per_channel scale/zp for the quant node - elif is_param(ep, node): - nodes.add(node) - return list(nodes) - - def get_output_deps( - self, output_nodes: List[torch.fx.Node], ep: ExportedProgram - ) -> List[torch.fx.Node]: - """ - For each output node, check all the users and insert them into the partition if needed - """ - nodes = [] - for output in output_nodes: - for node in output.users: - if node.target in self._Q_OPS: - nodes.append(node) - users = list(node.users.keys()) - for dq_user in users: - assert ( - dq_user.target in self._DQ_OPS - ), "Expecting a dq node(s) after a q node, but got target {dq_user.target} for {dq_user} node" - nodes.append(dq_user) - return nodes - - def get_nodes( - self, src_partition: SourcePartition, ep: ExportedProgram, quant: bool - ) -> List[torch.fx.Node]: - """ - Return nodes from the source partition. - """ - if quant: - # Insert quantization ops into src_partition by following the input, output node. - return ( - src_partition.nodes - + self.get_input_deps(src_partition.input_nodes, ep) - + self.get_output_deps(src_partition.output_nodes, ep) - ) - else: - return src_partition.nodes - - def qualify_nodes( - self, input_nodes: List[torch.fx.Node], ep: ExportedProgram - ) -> bool: - """ - Each node in the module (post decomposition) must satisfy the - constraints specified for XNNPACK. - - Disqualify the whole module if one of the nodes fails to satisfy. - """ - return all( - XnnpackOperatorSupport.check_constraint(node, ep) for node in input_nodes - ) - - def get_module_partitions( - self, - ep: ExportedProgram, - quant: Optional[bool], - ) -> List[List[torch.fx.Node]]: - """ - Get all partitions in the torch.fx.GraphModule for the supported - modules. 
- """ - graph_module = ep.graph_module - if quant is None: - module_partitions = self.get_module_partitions(ep, True) - for node_list in module_partitions: - for node in node_list: - node.meta["quant_match"] = True - fp32_module_partitions = self.get_module_partitions(ep, False) - for node_list in fp32_module_partitions: - for node in node_list: - if node.meta.get("quant_match", False): - break - else: - module_partitions.append(node_list) - for node_list in module_partitions: - for node in node_list: - node.meta.pop("quant_match", False) - return module_partitions - - src_partition_dict = get_source_partitions( - graph_module.graph, self.get_supported_modules(quant) - ) - all_partitions = src_partition_dict.values() - - module_partitions = [] - for src_partitions in all_partitions: - for src_partition in src_partitions: - partition_nodes = self.get_nodes(src_partition, ep, quant) - if self.qualify_nodes(partition_nodes, ep): - module_partitions.append(partition_nodes) - - return module_partitions - - def get_graph_partitions( - self, ep, quant: Optional[bool] - ) -> List[List[torch.fx.Node]]: - graph_module = ep.graph_module - graphs = bilinear_2d.get_graphs() - - # Temporary for lowering SDPA - if self._lower_recomposed_sdpa: - graphs += sdpa.get_graphs() - - graph_patterns = [gm_pattern.graph for gm_pattern in graphs] - partitions = generate_pattern_op_partitions( - graph_module, graph_patterns, ignore_literals=True + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, per_op_mode=True ) - graph_partitions = [] - for src_partition in partitions: - graph_partitions.append(src_partition.nodes) - return graph_partitions - def generate_partitions( - self, ep: ExportedProgram, quant: Optional[bool] - ) -> List[Any]: - """ - Generate a list of partitions for an torch.fx.GraphModule. - Also pass the supported ops to match. - """ - graph_module = ep.graph_module - matched_module_nodes = self.get_module_partitions(ep, quant) - matched_graph_nodes = self.get_graph_partitions(ep, quant) - return generate_partitions_from_list_of_nodes( - graph_module, - matched_module_nodes + matched_graph_nodes, - XnnpackOperatorSupport( - ep=ep, supported_ops=list(self.get_supported_ops(quant)) - ), - ) +class XnnpackFloatingPointPartitioner(XnnpackPartitioner): + def __init__(self): + super().__init__(config_precisions=ConfigPrecisionType.FP32) - def tag_nodes(self, partitions: List[Partition]) -> Dict[str, DelegationSpec]: - """ - Tag each partition in the list with its delegation tag. 
- """ - partition_tags: Dict[str, DelegationSpec] = {} - for partition in partitions: - # Add delegation tags - skip = False - for node in partition.nodes: - if "delegation_tag" in node.meta: - skip = True - if skip: - continue - for node in partition.nodes: - delegation_tag = f"tag{partition.id}" - node.meta["delegation_tag"] = delegation_tag - partition_tags[delegation_tag] = self.delegation_spec - return partition_tags - # override - def _partition( - self, exported_program: ExportedProgram, quant: Optional[bool] - ) -> PartitionResult: - """ - Run the partitioner on the given graph module, then tag each partition - with its delegation tag (and partition id) - """ - partitions = self.generate_partitions(exported_program, quant) - partition_tags: Dict[str, DelegationSpec] = {} - if self.check_partitions(partitions): - partition_tags = self.tag_nodes(partitions) - return PartitionResult( - tagged_exported_program=exported_program, partition_tags=partition_tags - ) - - def partition(self, exported_program: ExportedProgram) -> PartitionResult: - ret: PartitionResult = self._partition(exported_program, self.quant) - return ret - - -class XnnpackDynamicallyQuantizedPartitioner(XnnpackQuantizedPartitioner): - def __init__( - self, - supported_modules=SUPPORTED_DYN_QUANT_MODULES, - supported_ops=None, # no other ops are supported - ): - super().__init__(supported_modules, supported_ops) - - # override - def partition(self, exported_program: ExportedProgram) -> PartitionResult: - """ - Run the partitioner on the given graph module, then tag each partition with its delegegation tag (and partition id) - - We don't want to use `generate_*_partitions` helpers because we don't want these modules to fuse in the same delegate. - """ - partition_id = itertools.count() - partitions = [ - Partition( - id=next(partition_id), - nodes=set( - filter(lambda x: x.target != torch.ops.aten.sym_size.int, match) - ), - ) - for match in self.get_module_partitions(exported_program) - ] - partition_tags: Dict[str, DelegationSpec] = {} - self.delegation_spec = DelegationSpec( - XnnpackBackend.__name__, [CompileSpec("dqlinear_partitioner", bytes())] - ) - - if self.check_partitions(partitions): - partition_tags = self.tag_nodes(partitions) - return PartitionResult( - tagged_exported_program=exported_program, partition_tags=partition_tags - ) +class XnnpackQuantizedPartitioner(XnnpackPartitioner): + def __init__(self): + super().__init__(config_precisions=ConfigPrecisionType.STATIC_QUANT) diff --git a/backends/xnnpack/passes/TARGETS b/backends/xnnpack/passes/TARGETS index e91614c735..6bc3742abe 100644 --- a/backends/xnnpack/passes/TARGETS +++ b/backends/xnnpack/passes/TARGETS @@ -30,6 +30,7 @@ python_library( "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", "//executorch/exir/passes:const_prop_pass", + "//executorch/exir/passes:memory_format_ops_pass", "//executorch/exir/program:program", ], ) diff --git a/backends/xnnpack/passes/__init__.py b/backends/xnnpack/passes/__init__.py index 1ca4fe307f..c3a85e4aa8 100644 --- a/backends/xnnpack/passes/__init__.py +++ b/backends/xnnpack/passes/__init__.py @@ -27,6 +27,7 @@ from executorch.exir.pass_base import ExportPass from executorch.exir.passes.const_prop_pass import ConstPropPass +from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass from executorch.exir.program._program import _transform from torch._export.pass_base import PassType @@ -50,6 +51,8 @@ def __init__( if not passes: # All the XNNPACK passes self.passes = [ + # TODO - 
remove this pass once we have a better support for dim_order ops lowering + DimOrderOpsRevertPass, ConvertToUpsampleBilinear2d, ConvertToLinearPass, ConvertToSDPAPass, diff --git a/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py index 1816d6ccfc..692f1a9d14 100644 --- a/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py +++ b/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py @@ -44,13 +44,13 @@ class ChannelsLastTaggedReshapePass(XNNPACKPass): # Set of ops that require memory format to be channels last (NHWC) memory_sensitive_ops_nhwc = { exir_ops.edge.aten.convolution.default, - exir_ops.edge.aten.upsample_bilinear2d.default, + exir_ops.edge.aten.upsample_bilinear2d.vec, exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.max_pool2d.default, exir_ops.edge.aten.amax.default, exir_ops.edge.aten.max.dim, exir_ops.edge.aten.avg_pool2d.default, - exir_ops.edge.aten._prelu_kernel.default, + exir_ops.edge.aten.prelu.default, } # Set of ops that require memory format to be NCHW @@ -124,7 +124,7 @@ def create_call_function_node( "call_function", target=target, args=args, - kwargs=( + kwargs=( # pyre-fixme[6] {"memory_format": memory_format} if memory_format is not None else {} ), ) diff --git a/backends/xnnpack/passes/convert_to_linear.py b/backends/xnnpack/passes/convert_to_linear.py index 69f882523c..2cef71bf92 100644 --- a/backends/xnnpack/passes/convert_to_linear.py +++ b/backends/xnnpack/passes/convert_to_linear.py @@ -13,9 +13,8 @@ from executorch.backends.transforms.addmm_mm_to_linear import ( apply_addmm_mm_to_linear_transform, ) -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass from torch.fx.passes.infra.pass_base import PassResult from torch.fx.passes.utils.source_matcher_utils import ( @@ -27,7 +26,7 @@ logger.setLevel(logging.WARNING) -class ConvertToLinearPass(XNNPACKPass): +class ConvertToLinearPass(ExportPass): linear_modules = [ torch.nn.Linear, torch.nn.functional.linear, @@ -71,28 +70,24 @@ def get_arg(node: torch.fx.Node, arg: str): map_ = {"input": 0, "weight": 1} return None if arg == "bias" else node.args[map_[arg]] - def find_bias_for_mm(self, src_partition: SourcePartition, weight: torch.fx.Node): + def find_bias_for_mm(self, src_partition: SourcePartition, mm_node: torch.fx.Node): """ For linear decomposed with mm + add, find bias in src partition """ - out_channels = get_shape(weight)[0] - bias = None - - # Try to find bias node in all nodes - for node in src_partition.nodes: - if is_param_node(self.exported_program, node) and node != weight: - bias = node - - if bias is not None: - assert get_shape(bias) == [ - out_channels - ], f"Expected bias shape {[out_channels]} but got {get_shape(bias)}" - else: - assert exir_ops.edge.aten.add.Tensor not in [ - node.target for node in src_partition.nodes - ], f"Expecting to find bias for Linear module: {src_partition} but could not find it" - return bias + mm_users = list(mm_node.users.keys()) + if len(mm_users) != 1: + return None + + add_node = mm_users[0] + if add_node.target != exir_ops.edge.aten.add.Tensor: + return None + + for arg in add_node.all_input_nodes: + if arg != mm_node and arg in src_partition.input_nodes: + return arg + + return None def create_linear( self, @@ -119,7 +114,7 @@ def create_linear( src_partition.input_nodes + 
src_partition.params, # bias can be in params ) if linear_bias is None and node.target == exir_ops.edge.aten.mm.default: - linear_bias = self.find_bias_for_mm(src_partition, linear_weight) + linear_bias = self.find_bias_for_mm(src_partition, node) logger.debug(f"Found bias(?): {linear_bias} from node {node}") diff --git a/backends/xnnpack/passes/convert_to_sdpa.py b/backends/xnnpack/passes/convert_to_sdpa.py index 76bb24cc94..97aca5491d 100644 --- a/backends/xnnpack/passes/convert_to_sdpa.py +++ b/backends/xnnpack/passes/convert_to_sdpa.py @@ -83,7 +83,7 @@ def create_sdpa( kwargs={"scale": scale}, ) - sdpa_node.meta["val"] = sdpa_node.target( + sdpa_node.meta["val"] = sdpa_node.target( # pyre-fixme[29] *[n.meta["val"] for n in match.placeholder_nodes], scale=scale, ) diff --git a/backends/xnnpack/passes/convert_to_upsample_bilinear2d.py b/backends/xnnpack/passes/convert_to_upsample_bilinear2d.py index 065f3254ce..45956ee6f6 100644 --- a/backends/xnnpack/passes/convert_to_upsample_bilinear2d.py +++ b/backends/xnnpack/passes/convert_to_upsample_bilinear2d.py @@ -36,7 +36,7 @@ def create_upsample_bilinear_2d( with graph_module.graph.inserting_before(output): upsample_node = graph_module.graph.create_node( "call_function", - exir_ops.edge.aten.upsample_bilinear2d.default, + exir_ops.edge.aten.upsample_bilinear2d.vec, # TODO(T166527012): Using output_h and output_w here only works with static shapes args=(input_node, [output_h, output_w], align_corners, None), ) diff --git a/backends/xnnpack/passes/tag_implicit_q_dq_pass.py b/backends/xnnpack/passes/tag_implicit_q_dq_pass.py index 2d41429eb1..ac6ccc9b89 100644 --- a/backends/xnnpack/passes/tag_implicit_q_dq_pass.py +++ b/backends/xnnpack/passes/tag_implicit_q_dq_pass.py @@ -12,7 +12,11 @@ SUPPORTED_IMPLICIT_Q_DQ_OP_NAMES_SET, ) from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant +from executorch.backends.xnnpack.utils.quant_utils import ( + is_dequant, + is_dynamic_qdq, + is_quant, +) from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult @@ -76,18 +80,7 @@ def is_output_node(self, node: torch.fx.Node) -> bool: return node.op == "output" def is_dynamically_quantized(self, node: torch.fx.Node) -> bool: - return any( - is_dequant(input_node) - and ( - cast( - torch._ops.OpOverload, input_node.target - )._schema.schema.overload_name - == "tensor" - or input_node.target - == exir_ops.edge.quantized_decomposed.dequantize_per_token.default - ) - for input_node in node.all_input_nodes - ) + return is_dynamic_qdq(node) def is_supported_quant_op(self, node: torch.fx.Node) -> bool: return ( @@ -139,10 +132,8 @@ def get_ending_implicit_q_nodes( ): return [next_node] elif self.is_output_node(next_node): - # Check if second_node (which is between dq and output nodes) - # is aten.linear.default - if self.is_dynamically_quantized(start_node): - return [] + # if node following dq is output node + return None else: # Check if nodes between the dq node and the next q match # a supported quant chain @@ -193,6 +184,9 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: ending_implicit_q_nodes = [] for user in first_node.users: + if self.is_dynamically_quantized(first_node): + # if the dq is a dynamic dq, then it is implicit + break user_end_nodes = self.get_ending_implicit_q_nodes(user) if user_end_nodes is None: # This user isn't 
part of a "supported" group diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 8c8db60065..2145ea1519 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include @@ -21,6 +21,25 @@ namespace executor { namespace xnnpack { namespace delegate { +/* + * Provide compile-time allocation. + */ +class CompileAllocator { + public: + /* + * Allocate memory which will be automatically freed at the end + * of the compilation process. + */ + void* allocateTemporary(size_t size) { + auto mem = new uint8_t[size]; + temporaries_.emplace_back(mem); + return mem; + } + + private: + std::vector> temporaries_; +}; + // Flatbuffer types using ValuePtr = const fb_xnnpack::XValue*; using NodePtr = const fb_xnnpack::XNode*; @@ -35,6 +54,23 @@ using DefineNodeFunc = Error (*)( const std::unordered_map&, NodePtr) noexcept; +/* +Convert a tensor from fp32 to bf16. +*/ +void convertF32TensorToBF16( + const float* f32_data, + uint16_t* bf16_data_out, + size_t numel) { + for (auto i = 0u; i < numel; i++) { + // Adjust the f32 value such that it rounds properly after truncation. + // Constant factor scales 1+2^-8 to 1+2e-7. + float f32_adjusted = f32_data[i] * 1.00389105f; + uint32_t f32_bits; + memcpy(&f32_bits, &f32_adjusted, sizeof(float)); + bf16_data_out[i] = static_cast(f32_bits >> 16); + } +} + /* Gets the output min and output max for a given node operator */ @@ -152,7 +188,8 @@ Error defineTensor( GraphPtr flatbuffer_graph, const uint8_t* constant_data_ptr, std::vector& input_ids, - std::vector& output_ids) { + std::vector& output_ids, + CompileAllocator& allocator) { const fb_xnnpack::XNNTensorValue* tensor_value = nullptr; const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr; @@ -356,12 +393,31 @@ Error defineTensor( size_t group_size = qparams->group_size(); size_t output_channels = tensor_value->dims()->Get(0); size_t input_channels = tensor_value->dims()->Get(1); + + const uint16_t* scale_data = nullptr; + uint32_t scale_numel = 0; + + // Block scales are preferably serialized as bf16 but can also be + // serialized as fp32 for backwards compatability. + if (qparams->scale_bf16() != nullptr) { + scale_data = + static_cast(qparams->scale_bf16()->data()); + scale_numel = qparams->scale_bf16()->size(); + } else { + // Read fp32 scales, convert to bf16. 
+ auto conv_buffer = static_cast(allocator.allocateTemporary( + qparams->scale()->size() * sizeof(uint16_t))); + scale_numel = qparams->scale()->size(); + convertF32TensorToBF16( + qparams->scale()->data(), conv_buffer, scale_numel); + scale_data = conv_buffer; + } + ET_CHECK_OR_RETURN_ERROR( - qparams->scale()->size() == - output_channels * input_channels / group_size, + scale_numel == output_channels * input_channels / group_size, Internal, "scale size %zu != output channels %zu * group size %zu", - (size_t)qparams->scale()->size(), + static_cast(scale_numel), output_channels, group_size); int32_t zero_point = @@ -370,18 +426,19 @@ Error defineTensor( Debug, "define quant tensor (per channel group): buffer_ptr: %p, scale.numel(): %u, channel_dim: %u, grpup_size: %zu, output_channels: %zu, dtype: %u, zero_point: %d, datatype: %d\n", buffer_ptr, - qparams->scale()->size(), + scale_numel, qparams->channel_dim(), group_size, output_channels, datatype, zero_point, datatype); + status = xnn_define_blockwise_quantized_tensor_value( /*subgraph=*/subgraph_ptr, /*datatype=*/datatype, /*zero_point=*/zero_point, - /*scale=*/qparams->scale()->data(), + /*scale=*/scale_data, /*num_dims=*/tensor_value->num_dims(), /*channel_dim=*/qparams->channel_dim(), /*block_size=*/qparams->group_size(), @@ -1504,6 +1561,35 @@ Error defineScaledDotProductAttentionNode( return Error::Ok; } + +/* +Defines batch matrix multiply node into the subgraph, +using the remapped ids to map the serialized ids, +to the new ids generated when defining the tensor value +*/ +Error defineBatchMatrixMultiplyNode( + xnn_subgraph_t subgraph_ptr, + const std::unordered_map& remapped_ids, + const NodePtr node) noexcept { + auto graph_node = node->xnode_union_as_XNNBatchMatrixMultiply(); + + xnn_status status = xnn_define_batch_matrix_multiply( + subgraph_ptr, + remapped_ids.at(graph_node->input1_id()), + remapped_ids.at(graph_node->input2_id()), + remapped_ids.at(graph_node->output_id()), + graph_node->flags()); + + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to create BMM node %i with code: %s", + node->debug_handle(), + xnn_status_to_string(status)); + + return Error::Ok; +} + /* Returns not Implemented Error code. This function is meant to be called when the compiler encountes a XNodeType from the flatbuffer @@ -1566,6 +1652,7 @@ DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) { _DEFINE(Concatenate4) _DEFINE(StaticSlice) _DEFINE(ScaledDotProductAttention) + _DEFINE(BatchMatrixMultiply) case fb_xnnpack::XNodeUnion::NONE: default: // Adding here as a catch all, just in case return &defineNotImplementedNode; @@ -1578,14 +1665,16 @@ Builds the xnnpack runtime object using the buffer pointer. The buffer pointer must be a valid pointer to the serialized xnnpack object. It also fills the XNNExecutor object with the built xnn_runtime and the input/output ids. 
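[Editor's note] With this change compileModel() also receives an xnn_workspace_t.
As the #ifdef below shows, when ENABLE_XNNPACK_SHARED_WORKSPACE is defined the
workspace is forwarded to xnn_create_runtime_v4 so all delegate instances share
one set of scratch buffers; otherwise xnn_create_runtime_v3 is used and the
workspace argument is ignored.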
*/ -__ET_NODISCARD Error XNNCompiler::compileModel( +ET_NODISCARD Error XNNCompiler::compileModel( const void* buffer_pointer, size_t num_bytes, XNNExecutor* executor, - MemoryAllocator* runtime_allocator) { + MemoryAllocator* runtime_allocator, + xnn_workspace_t workspace) { Result header = XNNHeader::Parse(buffer_pointer, num_bytes); const uint8_t* flatbuffer_data = nullptr; const uint8_t* constant_data = nullptr; + CompileAllocator compile_allocator; // Header status can only either be Error::Ok or Error::NotFound if (header.ok()) { @@ -1657,7 +1746,8 @@ __ET_NODISCARD Error XNNCompiler::compileModel( flatbuffer_graph, constant_data, input_ids, - output_ids); + output_ids, + compile_allocator); if (err != Error::Ok) { return err; @@ -1678,11 +1768,26 @@ __ET_NODISCARD Error XNNCompiler::compileModel( #endif xnn_runtime_t runtime_ptr = nullptr; - status = xnn_create_runtime_v2( + +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE + ET_CHECK_OR_RETURN_ERROR( + workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace"); + status = xnn_create_runtime_v4( subgraph.get(), + /*weight_cache=*/nullptr, // TODO - support weight cache + workspace, torch::executorch::threadpool::get_pthreadpool(), runtime_flags, &runtime_ptr); +#else + status = xnn_create_runtime_v3( + subgraph.get(), + /*weight_cache=*/nullptr, // TODO - support weight cache + torch::executorch::threadpool::get_pthreadpool(), + runtime_flags, + &runtime_ptr); +#endif + ET_CHECK_OR_RETURN_ERROR( xnn_status_success == status, Internal, diff --git a/backends/xnnpack/runtime/XNNCompiler.h b/backends/xnnpack/runtime/XNNCompiler.h index 945022ae89..94deda5263 100644 --- a/backends/xnnpack/runtime/XNNCompiler.h +++ b/backends/xnnpack/runtime/XNNCompiler.h @@ -25,11 +25,12 @@ class XNNCompiler { // Takes Flatbuffer Serialized XNNPACK Model and rebuilds the xnn-subgraph // returns an executor object that holds the xnn runtime object which we // can then use to set inputs and run inference using the xnn graph. 
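// [Editor's sketch] The declaration below now also takes the xnn_workspace_t
// owned by the backend; the call site added in XNNPACKBackend.cpp later in
// this diff looks roughly like:
//
//   Error err = xnnpack::delegate::XNNCompiler::compileModel(
//       processed->data(),
//       processed->size(),
//       executor,
//       context.get_runtime_allocator(),
//       workspace_.get());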
- __ET_NODISCARD static Error compileModel( + ET_NODISCARD static Error compileModel( const void* buffer_pointer, size_t num_bytes, XNNExecutor* executor, - MemoryAllocator* runtime_allocator); + MemoryAllocator* runtime_allocator, + xnn_workspace_t workspace); }; } // namespace delegate diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp index 0311489d9d..2ca1c7d4b2 100644 --- a/backends/xnnpack/runtime/XNNExecutor.cpp +++ b/backends/xnnpack/runtime/XNNExecutor.cpp @@ -22,7 +22,7 @@ using SizesType = exec_aten::SizesType; * inputs/outputs externals_ is resized to the total number of inputs and * outputs */ -__ET_NODISCARD Error XNNExecutor::initialize( +ET_NODISCARD Error XNNExecutor::initialize( xnn_runtime_t runtime, std::vector&& input_ids, std::vector&& output_ids) { @@ -62,7 +62,7 @@ __ET_NODISCARD Error XNNExecutor::initialize( * runtime correspond to their index in the list of arg passed into * delegate->execute() */ -__ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) { +ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) { // Create xnn_externals_value from evalue args xnn_status status; for (uint32_t i = 0; i < externals_.size(); ++i) { @@ -86,6 +86,11 @@ __ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) { // Reshape runtime inputs if (i < input_ids_.size()) { size_t num_dims = tensor->dim(); + ET_CHECK_OR_RETURN_ERROR( + is_contiguous_dim_order(tensor->dim_order().data(), tensor->dim()), + Internal, + "Expecting default dim_order but got a non default dim_order tensor for external input %u", + i); size_t dims[XNN_MAX_TENSOR_DIMS]; ET_CHECK_OR_RETURN_ERROR( num_dims <= XNN_MAX_TENSOR_DIMS, @@ -123,7 +128,7 @@ __ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) { * We first setup the runtime by feeding the externals_ to runtime setup. * After which we then execute the runtime through invoke_runtime. */ -__ET_NODISCARD Error XNNExecutor::forward(BackendExecutionContext& context) { +ET_NODISCARD Error XNNExecutor::forward(BackendExecutionContext& context) { ET_CHECK_OR_RETURN_ERROR( runtime_ != nullptr, Internal, @@ -175,7 +180,7 @@ __ET_NODISCARD Error XNNExecutor::forward(BackendExecutionContext& context) { * XNNPACK gives the index tensor to us as int32, we need to convert it * back to int64 for ExecuTorch. */ -__ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const { +ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const { size_t output_idx_start = input_ids_.size(); for (size_t i = output_idx_start; i < externals_.size(); ++i) { uint32_t ext_id = externals_[i].id; diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h index b13951bdd1..c35307cb91 100644 --- a/backends/xnnpack/runtime/XNNExecutor.h +++ b/backends/xnnpack/runtime/XNNExecutor.h @@ -51,7 +51,7 @@ class XNNExecutor { * The input/output ids are expected to be sorted in order of their * flatbuffer id_outs */ - __ET_NODISCARD Error initialize( + ET_NODISCARD Error initialize( xnn_runtime_t runtime, std::vector&& input_ids, std::vector&& output_ids); @@ -62,19 +62,19 @@ class XNNExecutor { * input shapes will be propagated through the runtime, and perform * any additional memory planning as needed */ - __ET_NODISCARD Error prepare_args(EValue** args); + ET_NODISCARD Error prepare_args(EValue** args); /** * Executes the graph using the args prepared at prepare_args(). 
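 *
 * [Editor's note] Together with prepare_args() and resize_outputs(), the
 * intended call sequence from the backend is roughly:
 *
 *   executor->prepare_args(args);    // bind EValue inputs/outputs, reshape runtime inputs
 *   executor->forward(context);      // set up and invoke the xnn runtime
 *   executor->resize_outputs(args);  // propagate output shapes back to ExecuTorch
 *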
*/ - __ET_NODISCARD Error forward(BackendExecutionContext& context); + ET_NODISCARD Error forward(BackendExecutionContext& context); /** * Prepares the outputs to be returned by the delegate * * Performs any post processing of outputs like tensor resizing */ - __ET_NODISCARD Error resize_outputs(EValue** args) const; + ET_NODISCARD Error resize_outputs(EValue** args) const; friend class XNNCompiler; }; diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index c22dd219b7..c817c010e2 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -11,17 +11,49 @@ #include #include #include + #include +#include #pragma clang diagnostic ignored "-Wglobal-constructors" namespace torch { namespace executor { -class XnnpackBackend final : public PyTorchBackendInterface { +class XnnpackBackend final : public ::executorch::runtime::BackendInterface { public: ~XnnpackBackend() = default; + XnnpackBackend() { + // Initialize XNNPACK + xnn_status status = xnn_initialize(/*allocator=*/nullptr); + if (status != xnn_status_success) { + ET_LOG( + Error, + "Failed to initialize, XNNPACK status: 0x%x", + (unsigned int)status); + return; + } + +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE + // Create a workspace for the XNNExecutor to use. This workspace will be + // shared across all delegate instances. + ET_LOG(Debug, "Creating XNN workspace"); + xnn_workspace_t workspace = nullptr; + status = xnn_create_workspace(&workspace); + if (status != xnn_status_success) { + ET_LOG( + Error, + "Failed to create XNN workspace, XNNPACK status: 0x%x", + (unsigned int)status); + workspace = nullptr; + return; + } + workspace_.reset(workspace); + ET_LOG(Debug, "Created XNN workspace: %p", workspace_.get()); +#endif // ENABLE_XNNPACK_SHARED_WORKSPACE + } + bool is_available() const override { return xnn_status_success == xnn_initialize(/*allocator=*/nullptr); } @@ -38,12 +70,12 @@ class XnnpackBackend final : public PyTorchBackendInterface { // new and since this type is not trivially destructible, we must call the // destructor manually in destroy(). new (executor) xnnpack::delegate::XNNExecutor; - Error err = xnnpack::delegate::XNNCompiler::compileModel( processed->data(), processed->size(), executor, - context.get_runtime_allocator()); + context.get_runtime_allocator(), + workspace_.get()); // This backend does not need its processed data after compiling the model. processed->Free(); @@ -65,6 +97,10 @@ class XnnpackBackend final : public PyTorchBackendInterface { EValue** args) const override { auto executor = static_cast(handle); +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE + const std::lock_guard lock(workspace_mutex_); +#endif + // Prepare Inputs/Outputs and Propagate Input Shapes Error err = executor->prepare_args(args); if (err != Error::Ok) { @@ -94,6 +130,13 @@ class XnnpackBackend final : public PyTorchBackendInterface { executor->~XNNExecutor(); } } + + private: + // This is a global workspace for all delegate instances. 
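// [Editor's note] When ENABLE_XNNPACK_SHARED_WORKSPACE is defined, the
// constructor above creates a single workspace that every compiled delegate
// reuses, and concurrent execute() calls are serialized through
// workspace_mutex_ (see the std::lock_guard earlier in this hunk). Without
// that flag the workspace stays null and the mutex is never taken.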
+ mutable std::mutex workspace_mutex_; + std::unique_ptr workspace_{ + nullptr, + &xnn_release_workspace}; }; namespace { diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index 5ace211149..efe717e085 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -63,6 +63,7 @@ table PerChannelGroupQuant { scale:[float]; channel_dim:int; group_size:int; + scale_bf16:[ushort]; } table XNNTensorValue { @@ -134,6 +135,7 @@ union XNodeUnion { XNNConcatenate4: _XNNCat, XNNStaticSlice, XNNScaledDotProductAttention, + XNNBatchMatrixMultiply: _XNNNode2x1, } union XValueUnion { diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index b968c6d9e9..33571195d6 100644 --- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -48,6 +48,7 @@ table PerChannelGroupQuant { scale:[float]; channel_dim:int; group_size:int; + scale_bf16:[ushort]; } table PerChannelQuant { @@ -130,6 +131,7 @@ union XNodeUnion { XNNConcatenate4: _XNNCat, XNNStaticSlice, XNNScaledDotProductAttention, + XNNBatchMatrixMultiply: _XNNNode2x1, } union XValueUnion { diff --git a/backends/xnnpack/serialization/xnnpack_graph_schema.py b/backends/xnnpack/serialization/xnnpack_graph_schema.py index 9127474c91..e3e699c58f 100644 --- a/backends/xnnpack/serialization/xnnpack_graph_schema.py +++ b/backends/xnnpack/serialization/xnnpack_graph_schema.py @@ -177,6 +177,11 @@ class XNNConcatenate4(XNNCat): pass +@dataclass +class XNNBatchMatrixMultiply(XNNNode2x1): + pass + + @dataclass class XNNStaticTranspose: num_dims: int @@ -354,6 +359,7 @@ class XNNScaledDotProductAttention: XNNConcatenate4, XNNStaticSlice, XNNScaledDotProductAttention, + XNNBatchMatrixMultiply, ] diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 64439ed8a3..dc8cd5917b 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -36,7 +36,10 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], preprocessor_flags = [ + # Uncomment to enable per operator timings # "-DENABLE_XNNPACK_PROFILING", + # Uncomment to enable workspace sharing across delegates + # "-DENABLE_XNNPACK_SHARED_WORKSPACE" ], exported_deps = [ "//executorch/runtime/backend:interface", @@ -44,7 +47,7 @@ def define_common_targets(): deps = [ third_party_dep("XNNPACK"), "//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header", - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", "//executorch/runtime/core/exec_aten/util:tensor_util", ], # XnnpackBackend.cpp needs to compile with executor as whole diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index d0fbddae23..02852871fe 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -23,8 +23,10 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake) set(_test_srcs # We can't put runtime/test_runtime_utils.cpp because we don't # build aten - runtime/test_xnnexecutor.cpp ../threadpool/threadpool.cpp - ../threadpool/threadpool_guard.cpp ../threadpool/test/threadpool_test.cpp + runtime/test_xnnexecutor.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool_guard.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/test/threadpool_test.cpp ) et_cxx_test( @@ -32,6 +34,7 @@ et_cxx_test( SOURCES ${_test_srcs} EXTRA_LIBS + 
extension_threadpool xnnpack_backend XNNPACK pthreadpool diff --git a/backends/xnnpack/test/TARGETS b/backends/xnnpack/test/TARGETS index abedffb8e6..629ac8275b 100644 --- a/backends/xnnpack/test/TARGETS +++ b/backends/xnnpack/test/TARGETS @@ -36,10 +36,10 @@ runtime.python_test( deps = [ "//executorch/backends/xnnpack/partition:xnnpack_partitioner", "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program/serialize:lib", "//executorch/exir/passes:constant_prop_pass", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program/serialize:lib", "//pytorch/ao:torchao", # @manual ], external_deps = [ diff --git a/backends/xnnpack/test/models/deeplab_v3.py b/backends/xnnpack/test/models/deeplab_v3.py index a3e6ac5f5b..9913296521 100644 --- a/backends/xnnpack/test/models/deeplab_v3.py +++ b/backends/xnnpack/test/models/deeplab_v3.py @@ -32,8 +32,7 @@ def test_fp32_dl3(self): ( Tester(self.dl3, self.model_inputs) .export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .to_executorch() .serialize() .run_method_and_compare_outputs() diff --git a/backends/xnnpack/test/models/edsr.py b/backends/xnnpack/test/models/edsr.py index 93bf5520a8..34b5898cf4 100644 --- a/backends/xnnpack/test/models/edsr.py +++ b/backends/xnnpack/test/models/edsr.py @@ -21,8 +21,7 @@ def test_fp32_edsr(self): ( Tester(self.edsr, self.model_inputs) .export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .to_executorch() .serialize() .run_method_and_compare_outputs() @@ -34,8 +33,7 @@ def _test_qs8_edsr(self): Tester(self.edsr, self.model_inputs) .quantize() .export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .to_executorch() .serialize() .run_method_and_compare_outputs() @@ -47,8 +45,7 @@ def test_qs8_edsr_no_calibrate(self): Tester(self.edsr, self.model_inputs) .quantize(Quantize(calibrate=False)) .export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .to_executorch() .serialize() .run_method_and_compare_outputs() diff --git a/backends/xnnpack/test/models/emformer_rnnt.py b/backends/xnnpack/test/models/emformer_rnnt.py index 27e278dd2e..5cf4337307 100644 --- a/backends/xnnpack/test/models/emformer_rnnt.py +++ b/backends/xnnpack/test/models/emformer_rnnt.py @@ -38,8 +38,7 @@ def test_fp32_emformer_joiner(self): ( Tester(joiner, joiner.get_example_inputs()) .export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() @@ -65,8 +64,7 @@ def _test_fp32_emformer_predictor(self): ( Tester(predictor, predictor.get_example_inputs()) .export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() @@ -89,8 +87,7 @@ def test_fp32_emformer_transcriber(self): ( Tester(transcriber, transcriber.get_example_inputs()) .export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() diff --git a/backends/xnnpack/test/models/inception_v3.py b/backends/xnnpack/test/models/inception_v3.py index 7f1a728fa8..59fd56d6af 100644 --- a/backends/xnnpack/test/models/inception_v3.py +++ b/backends/xnnpack/test/models/inception_v3.py @@ -34,9 +34,7 @@ def test_fp32_ic3(self): ( Tester(self.ic3, self.model_inputs) .export() - .to_edge() - 
.check(list(self.all_operators)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(self.all_operators)) .to_executorch() @@ -55,9 +53,7 @@ def _test_qs8_ic3(self): Tester(self.ic3, self.model_inputs) .quantize() .export() - .to_edge() - .check(list(ops_after_quantization)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(ops_after_quantization)) .to_executorch() @@ -76,9 +72,7 @@ def test_qs8_ic3_no_calibration(self): Tester(self.ic3, self.model_inputs) .quantize(Quantize(calibrate=False)) .export() - .to_edge() - .check(list(ops_after_quantization)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(ops_after_quantization)) .to_executorch() diff --git a/backends/xnnpack/test/models/inception_v4.py b/backends/xnnpack/test/models/inception_v4.py index 2b9b183468..e8a785116a 100644 --- a/backends/xnnpack/test/models/inception_v4.py +++ b/backends/xnnpack/test/models/inception_v4.py @@ -32,9 +32,7 @@ def test_fp32_ic4(self): ( Tester(self.ic4, self.model_inputs) .export() - .to_edge() - .check(list(self.all_operators)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(self.all_operators)) .to_executorch() @@ -52,9 +50,7 @@ def test_qs8_ic4(self): Tester(self.ic4, self.model_inputs) .quantize() .export() - .to_edge() - .check(list(ops_after_quantization)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(ops_after_quantization)) .to_executorch() diff --git a/backends/xnnpack/test/models/llama2_et_example.py b/backends/xnnpack/test/models/llama2_et_example.py index 286750b38d..6948321d53 100644 --- a/backends/xnnpack/test/models/llama2_et_example.py +++ b/backends/xnnpack/test/models/llama2_et_example.py @@ -39,10 +39,7 @@ def _test(self, dtype: torch.dtype = torch.float): ( Tester(model, example_inputs) .export() - .to_edge() - .dump_artifact() - .partition() - .dump_artifact() + .to_edge_transform_and_lower() .to_executorch() .serialize() .run_method_and_compare_outputs(atol=5e-2, inputs=example_inputs) diff --git a/backends/xnnpack/test/models/mobilebert.py b/backends/xnnpack/test/models/mobilebert.py index dee351f34d..ca18e6c265 100644 --- a/backends/xnnpack/test/models/mobilebert.py +++ b/backends/xnnpack/test/models/mobilebert.py @@ -7,7 +7,7 @@ import unittest import torch -from executorch.backends.xnnpack.test.tester import Tester +from executorch.backends.xnnpack.test.tester import Quantize, Tester from transformers import MobileBertConfig, MobileBertModel # @manual @@ -32,9 +32,19 @@ def test_fp32_mobilebert(self): ( Tester(self.mobilebert, self.example_inputs) .export() - .to_edge() - .check(list(self.supported_ops)) - .partition() + .to_edge_transform_and_lower() + .check_not(list(self.supported_ops)) + .to_executorch() + .serialize() + .run_method_and_compare_outputs(inputs=self.example_inputs) + ) + + def test_qs8_mobilebert(self): + ( + Tester(self.mobilebert, self.example_inputs) + .quantize(Quantize(calibrate=False)) + .export() + .to_edge_transform_and_lower() .check_not(list(self.supported_ops)) .to_executorch() .serialize() diff --git a/backends/xnnpack/test/models/mobilenet_v2.py b/backends/xnnpack/test/models/mobilenet_v2.py index 799f5c8110..4ee28af6b9 100644 --- a/backends/xnnpack/test/models/mobilenet_v2.py 
+++ b/backends/xnnpack/test/models/mobilenet_v2.py @@ -39,9 +39,7 @@ def test_fp32_mv2(self): ( Tester(self.mv2, self.model_inputs, dynamic_shapes=dynamic_shapes) .export() - .to_edge() - .check(list(self.all_operators)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(self.all_operators)) .to_executorch() @@ -67,9 +65,7 @@ def _test_qs8_mv2(self): Tester(self.mv2, self.model_inputs, dynamic_shapes=dynamic_shapes) .quantize() .export() - .to_edge() - .check(list(ops_after_quantization)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(ops_after_quantization)) .to_executorch() @@ -95,9 +91,7 @@ def test_qs8_mv2_no_calibration(self): Tester(self.mv2, self.model_inputs, dynamic_shapes=dynamic_shapes) .quantize(Quantize(calibrate=False)) .export() - .to_edge() - .check(list(ops_after_quantization)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(ops_after_quantization)) .to_executorch() diff --git a/backends/xnnpack/test/models/mobilenet_v3.py b/backends/xnnpack/test/models/mobilenet_v3.py index ce67f44cca..cacd8b5cc8 100644 --- a/backends/xnnpack/test/models/mobilenet_v3.py +++ b/backends/xnnpack/test/models/mobilenet_v3.py @@ -28,7 +28,6 @@ class TestMobileNetV3(unittest.TestCase): "executorch_exir_dialects_edge__ops_aten_clamp_default", "executorch_exir_dialects_edge__ops_aten_permute_copy_default", "executorch_exir_dialects_edge__ops_aten_addmm_default", - "executorch_exir_dialects_edge__ops_aten__to_copy_default", "executorch_exir_dialects_edge__ops_aten_convolution_default", "executorch_exir_dialects_edge__ops_aten_relu_default", "executorch_exir_dialects_edge__ops_aten_add_Tensor", @@ -41,9 +40,7 @@ def test_fp32_mv3(self): ( Tester(self.mv3, self.model_inputs, dynamic_shapes=self.dynamic_shapes) .export() - .to_edge() - .check(list(self.all_operators)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(self.all_operators)) .to_executorch() @@ -53,18 +50,13 @@ def test_fp32_mv3(self): @unittest.skip("T187799178: Debugging Numerical Issues with Calibration") def _test_qs8_mv3(self): - ops_after_quantization = self.all_operators - { - "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default", - } ops_after_lowering = self.all_operators ( Tester(self.mv3, self.model_inputs, dynamic_shapes=self.dynamic_shapes) .quantize() .export() - .to_edge() - .check(list(ops_after_quantization)) - .partition() + .to_edge_tranform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(ops_after_lowering)) .to_executorch() @@ -74,18 +66,13 @@ def _test_qs8_mv3(self): # TODO: Delete and only used calibrated test after T187799178 def test_qs8_mv3_no_calibration(self): - ops_after_quantization = self.all_operators - { - "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default", - } ops_after_lowering = self.all_operators ( Tester(self.mv3, self.model_inputs, dynamic_shapes=self.dynamic_shapes) .quantize(Quantize(calibrate=False)) .export() - .to_edge() - .check(list(ops_after_quantization)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(ops_after_lowering)) .to_executorch() diff --git a/backends/xnnpack/test/models/resnet.py 
b/backends/xnnpack/test/models/resnet.py index 616655a9d7..4ad6a7d5f4 100644 --- a/backends/xnnpack/test/models/resnet.py +++ b/backends/xnnpack/test/models/resnet.py @@ -39,8 +39,7 @@ def forward(self, x): def _test_exported_resnet(self, tester): ( tester.export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .check_not( [ "executorch_exir_dialects_edge__ops_aten_convolution_default", diff --git a/backends/xnnpack/test/models/torchvision_vit.py b/backends/xnnpack/test/models/torchvision_vit.py index 2502c35f6b..6bebd284e5 100644 --- a/backends/xnnpack/test/models/torchvision_vit.py +++ b/backends/xnnpack/test/models/torchvision_vit.py @@ -73,9 +73,7 @@ def _test_exported_vit(self, tester, check_nots=None): } ( tester.export() - .to_edge() - .check(list(self.all_operators)) - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .check_not(list(lowerable_xnn_operators)) .check_not(check_nots) diff --git a/backends/xnnpack/test/models/very_big_model.py b/backends/xnnpack/test/models/very_big_model.py index ccf98808df..3545287c62 100644 --- a/backends/xnnpack/test/models/very_big_model.py +++ b/backends/xnnpack/test/models/very_big_model.py @@ -34,8 +34,7 @@ def _test_very_big_model(self): ( Tester(self.BigModel(), (torch.randn(1, 5000),)) .export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .check(["torch.ops.higher_order.executorch_call_delegate"]) .to_executorch() .serialize() diff --git a/backends/xnnpack/test/models/w2l.py b/backends/xnnpack/test/models/w2l.py index 7f63d0b15f..07b3bf56b3 100644 --- a/backends/xnnpack/test/models/w2l.py +++ b/backends/xnnpack/test/models/w2l.py @@ -25,8 +25,7 @@ def test_fp32_w2l(self): ( Tester(self.wav2letter, self.model_inputs, self.dynamic_shape) .export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .check_not( [ "executorch_exir_dialectes_edge__ops_aten_convolution_default", @@ -44,8 +43,7 @@ def test_qs8_w2l(self): Tester(self.wav2letter.eval(), self.model_inputs, self.dynamic_shape) .quantize() .export() - .to_edge() - .partition() + .to_edge_transform_and_lower() .check_not( [ "executorch_exir_dialectes_edge__ops_aten_convolution_default", diff --git a/backends/xnnpack/test/ops/abs.py b/backends/xnnpack/test/ops/abs.py index 2906654dfb..fba91db05c 100644 --- a/backends/xnnpack/test/ops/abs.py +++ b/backends/xnnpack/test/ops/abs.py @@ -24,9 +24,7 @@ def _test_abs(self, inputs): Tester(self.Abs(), inputs) .export() .check_count({"torch.ops.aten.abs.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_abs_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_abs_default"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/add.py b/backends/xnnpack/test/ops/add.py index f8c202bd7f..784a9d3bbf 100644 --- a/backends/xnnpack/test/ops/add.py +++ b/backends/xnnpack/test/ops/add.py @@ -47,9 +47,7 @@ def _test_add(self, inputs): Tester(self.Add(), inputs) .export() .check_count({"torch.ops.aten.add.Tensor": 4}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_add_Tensor": 4}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_add_Tensor"]) .to_executorch() @@ -71,9 +69,7 @@ def test_fp32_add_constant(self): Tester(self.AddConstant(torch.randn(4, 4, 4)), 
inputs) .export() .check_count({"torch.ops.aten.add.Tensor": 4}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_add_Tensor": 4}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_add_Tensor"]) .to_executorch() @@ -88,9 +84,7 @@ def test_qs8_add_constant(self): .quantize() .export() .check_count({"torch.ops.aten.add.Tensor": 4}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_add_Tensor": 4}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_add_Tensor"]) .to_executorch() @@ -106,9 +100,7 @@ def test_qs8_add(self): .export() .check_count({"torch.ops.aten.add.Tensor": 4}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_add_Tensor": 4}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -129,9 +121,7 @@ def test_qs8_add2(self): .export() .check_count({"torch.ops.aten.add.Tensor": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_add_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -152,9 +142,7 @@ def test_qs8_add3(self): .export() .check_count({"torch.ops.aten.add.Tensor": 4}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_add_Tensor": 4}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -179,10 +167,7 @@ def test_fp32_add_relu(self): .export() .check_count({"torch.ops.aten.add.Tensor": 1}) .check_count({"torch.ops.aten.relu.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_add_Tensor": 1}) - .check_count({"executorch_exir_dialects_edge__ops_aten_relu_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_not(["executorch_exir_dialects_edge__ops_aten_add_Tensor"]) .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) @@ -200,10 +185,7 @@ def test_qs8_add_relu(self): .check_count({"torch.ops.aten.add.Tensor": 1}) .check_count({"torch.ops.aten.relu.default": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_add_Tensor": 1}) - .check_count({"executorch_exir_dialects_edge__ops_aten_relu_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .serialize() @@ -246,10 +228,7 @@ def forward(self, x, z): {"torch.ops.aten.add.Tensor": 1, "torch.ops.aten.relu.default": 1} ) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_add_Tensor": 1}) - .check_count({"executorch_exir_dialects_edge__ops_aten_relu_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .serialize() diff --git a/backends/xnnpack/test/ops/avgpool2d.py b/backends/xnnpack/test/ops/avgpool2d.py index edb92d09a3..b471fd914c 100644 --- 
a/backends/xnnpack/test/ops/avgpool2d.py +++ b/backends/xnnpack/test/ops/avgpool2d.py @@ -33,11 +33,7 @@ def _test_argpool2d(self, inputs): Tester(self.AvgPool2d(), inputs) .export() .check_count({"torch.ops.aten.avg_pool2d.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_avg_pool2d_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) .to_executorch() @@ -62,11 +58,7 @@ def test_fp32_avgpool2d_ceil_mode_unsupported(self): Tester(self.AvgPool2d(ceil_mode=True), inputs) .export() .check_count({"torch.ops.aten.avg_pool2d.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_avg_pool2d_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_not(["torch.ops.higher_order.executorch_call_delegate"]) ) @@ -79,11 +71,7 @@ def test_fp32_avgpool2d_count_include_pad_unsupported(self): Tester(self.AvgPool2d(count_include_pad=True), inputs) .export() .check_count({"torch.ops.aten.avg_pool2d.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_avg_pool2d_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_not(["torch.ops.higher_order.executorch_call_delegate"]) ) @@ -96,10 +84,6 @@ def test_fp32_avgpool2d_divisor_override(self): Tester(self.AvgPool2d(divisor_override=5), inputs) .export() .check_count({"torch.ops.aten.avg_pool2d.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_avg_pool2d_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_not(["torch.ops.higher_order.executorch_call_delegate"]) ) diff --git a/backends/xnnpack/test/ops/bilinear2d.py b/backends/xnnpack/test/ops/bilinear2d.py index ab9d3d3c11..bf89e2196f 100644 --- a/backends/xnnpack/test/ops/bilinear2d.py +++ b/backends/xnnpack/test/ops/bilinear2d.py @@ -65,12 +65,15 @@ def forward(self, x): ) return a + # Since we may or may not enable dim order, use these ops only for + # check_not since we have `to_copy` and `to_dim_order_copy` in the list. 
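Aside: the recurring change across these test files replaces the two-stage `.to_edge()` / `.partition()` sequence (and the intermediate `.check(...)` calls on edge ops) with the single `.to_edge_transform_and_lower()` stage. A minimal sketch of the new-style pipeline, modeled on the abs test above; the module and input shape here are illustrative only.

import torch
from executorch.backends.xnnpack.test.tester import Tester

class Abs(torch.nn.Module):
    def forward(self, x):
        return torch.abs(x)

(
    Tester(Abs(), (torch.randn(4, 8),))
    .export()
    .check_count({"torch.ops.aten.abs.default": 1})
    # Single stage: to_edge, transform passes, and partition/lower to XNNPACK.
    .to_edge_transform_and_lower()
    .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
    .check_not(["executorch_exir_dialects_edge__ops_aten_abs_default"])
    .to_executorch()
    .serialize()
    .run_method_and_compare_outputs()
)

Tests that need a specific partitioner configuration pass the stage explicitly, e.g. ToEdgeTransformAndLower(partitioners=[XnnpackPartitioner(config_precisions=ConfigPrecisionType.FP32)]), as the conv1d and linear diffs further below do.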
ops = { "executorch_exir_dialects_edge__ops_aten_sub_Tensor", "executorch_exir_dialects_edge__ops_aten_mul_Tensor", "executorch_exir_dialects_edge__ops_aten_index_Tensor", "executorch_exir_dialects_edge__ops_aten_arange_start_step", "executorch_exir_dialects_edge__ops_aten__to_copy_default", + "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default", "executorch_exir_dialects_edge__ops_aten_add_Tensor", "executorch_exir_dialects_edge__ops_aten_clamp_default", } @@ -80,9 +83,7 @@ def test_fp32_static_resize_bilinear2d(self): ( Tester(self.StaticResizeBilinear2dModule(), example_inputs) .export() - .to_edge() - .check(self.ops) - .partition() + .to_edge_transform_and_lower() .check_not(self.ops) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() @@ -90,14 +91,12 @@ def test_fp32_static_resize_bilinear2d(self): .run_method_and_compare_outputs() ) - def test_fp32_static_resize_bilinear2d_with_align_cornesr(self): + def test_fp32_static_resize_bilinear2d_with_align_corners(self): example_inputs = (torch.randn(2, 3, 4, 5),) ( Tester(self.StaticResizeBilinear2dModuleWithAlignCorners(), example_inputs) .export() - .to_edge() - .check(self.ops) - .partition() + .to_edge_transform_and_lower() .check_not(self.ops) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() @@ -111,13 +110,7 @@ def test_fp32_static_resize_bilinear2d_antialiased(self): ( Tester(self.Bilinear2dAntiAlias(), example_inputs) .export() - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten__upsample_bilinear2d_aa_default": 2 - } - ) - .partition() + .to_edge_transform_and_lower() .check_count( { "executorch_exir_dialects_edge__ops_aten__upsample_bilinear2d_aa_default": 2 diff --git a/backends/xnnpack/test/ops/bmm.py b/backends/xnnpack/test/ops/bmm.py new file mode 100644 index 0000000000..1c6235e5f7 --- /dev/null +++ b/backends/xnnpack/test/ops/bmm.py @@ -0,0 +1,46 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
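The new bmm.py test that follows exercises batch matrix multiplication, now representable in the serialized graph via the XNNBatchMatrixMultiply node added to both schemas. Outside the test harness, the equivalent user-level flow would look roughly like the sketch below; the exact import path and keyword argument of to_edge_transform_and_lower are assumptions, since they are not shown in this diff.

import torch
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower

class BMM(torch.nn.Module):
    def forward(self, x, y):
        return torch.bmm(x, y)

inputs = (torch.randn(2, 3, 4), torch.randn(2, 4, 6))
ep = torch.export.export(BMM(), inputs)
# With XNNBatchMatrixMultiply in the delegate schema, the bmm should be
# absorbed into the XNNPACK delegate rather than remaining as an edge op.
edge = to_edge_transform_and_lower(ep, partitioner=[XnnpackPartitioner()])
executorch_program = edge.to_executorch()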
+ +import unittest + +import torch +from executorch.backends.xnnpack.test.tester import Tester + + +class TestBMM(unittest.TestCase): + class BMM(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.bmm(x, y) + + def _test_bmm(self, inputs): + ( + Tester(self.BMM(), inputs) + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .to_edge_transform_and_lower() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) + + def test_fp16_bmm(self): + inputs = ( + torch.randn(2, 3, 4).to(torch.float16), + torch.randn(2, 4, 6).to(torch.float16), + ) + self._test_bmm(inputs) + + def test_fp32_bmm(self): + inputs = ( + torch.randn(2, 3, 4), + torch.randn(2, 4, 6), + ) + self._test_bmm(inputs) diff --git a/backends/xnnpack/test/ops/cat.py b/backends/xnnpack/test/ops/cat.py index 15524c0134..23fca91f5b 100644 --- a/backends/xnnpack/test/ops/cat.py +++ b/backends/xnnpack/test/ops/cat.py @@ -56,11 +56,7 @@ def _test_cat(self, module, inputs, cat_num=1, quant=False, quant_ops=2): } ) - ( - tester.to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_cat": 1}) - .partition() - ) + tester.to_edge_transform_and_lower() if quant: tester.check_not(["torch.ops.quantized_decomposed"]) @@ -155,9 +151,7 @@ def test_fp32_cat_unsupported(self): Tester(self.Cat5(), inputs) .export() .check_count({"torch.ops.aten.cat": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_cat": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"executorch_exir_dialects_edge__ops_aten_cat": 1}) ) diff --git a/backends/xnnpack/test/ops/ceil.py b/backends/xnnpack/test/ops/ceil.py index 8d59f3b35d..6dbebf3650 100644 --- a/backends/xnnpack/test/ops/ceil.py +++ b/backends/xnnpack/test/ops/ceil.py @@ -24,9 +24,7 @@ def _test_ceil(self, inputs): Tester(self.Ceil(), inputs) .export() .check_count({"torch.ops.aten.ceil.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_ceil_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_ceil_default"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/clamp.py b/backends/xnnpack/test/ops/clamp.py index c52fd011f8..9fb8935553 100644 --- a/backends/xnnpack/test/ops/clamp.py +++ b/backends/xnnpack/test/ops/clamp.py @@ -26,9 +26,7 @@ def _test_clamp(self, module, inputs): Tester(module, inputs) .export() .check_count({"torch.ops.aten.clamp.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_clamp_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) .to_executorch() @@ -64,9 +62,7 @@ def test_qs8_clamp(self): .export() .check_count({"torch.ops.aten.clamp.default": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_clamp_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/conv1d.py b/backends/xnnpack/test/ops/conv1d.py index 1759b1452d..833ad69da6 100644 --- a/backends/xnnpack/test/ops/conv1d.py +++ 
b/backends/xnnpack/test/ops/conv1d.py @@ -7,13 +7,14 @@ import unittest import torch -from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( - XnnpackFloatingPointPartitioner, +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, ) +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.test.test_xnnpack_utils import randomize_bn from executorch.backends.xnnpack.test.tester import RunPasses, Tester -from executorch.backends.xnnpack.test.tester.tester import Partition +from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower from executorch.exir.passes.constant_prop_pass import constant_prop_pass @@ -93,8 +94,8 @@ def _test_conv1d( conv_count, quantized=False, dynamic_shape=None, - partition=None, passes=None, + stage=None, skip_to_executorch=False, ): tester = ( @@ -104,15 +105,9 @@ def _test_conv1d( else Tester(module, inputs) ) .export() - .check_count({"torch.ops.aten.conv1d.default": conv_count}) - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten_convolution_default": conv_count - } - ) .run_passes(passes) - .partition(partition) + .check_count({"torch.ops.aten.conv1d.default": conv_count}) + .to_edge_transform_and_lower(stage) .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) ) @@ -170,7 +165,11 @@ def test_qs8_conv1d_with_floating_point_partitioner(self): 1, quantized=True, dynamic_shape=dynamic_shapes, - partition=Partition(XnnpackFloatingPointPartitioner()), + stage=ToEdgeTransformAndLower( + partitioners=[ + XnnpackPartitioner(config_precisions=ConfigPrecisionType.FP32) + ] + ), passes=RunPasses(pass_functions=[constant_prop_pass]), skip_to_executorch=True, ) diff --git a/backends/xnnpack/test/ops/conv2d.py b/backends/xnnpack/test/ops/conv2d.py index 4a281e2265..95b22bb3f8 100644 --- a/backends/xnnpack/test/ops/conv2d.py +++ b/backends/xnnpack/test/ops/conv2d.py @@ -164,14 +164,13 @@ def _test( ( tester.export() .check_count({"torch.ops.aten.conv2d": conv_count}) - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten_convolution_default": conv_count - } - ) - .partition() + .to_edge_transform_and_lower() .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) + .check_not( + [ + "executorch_exir_dialects_edge__ops__native_batch_norm_legit_no_training_default" + ] + ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .serialize() diff --git a/backends/xnnpack/test/ops/div.py b/backends/xnnpack/test/ops/div.py index 3815b2f084..9bca5feed4 100644 --- a/backends/xnnpack/test/ops/div.py +++ b/backends/xnnpack/test/ops/div.py @@ -32,9 +32,7 @@ def _test_div(self, inputs): Tester(self.Div(), inputs) .export() .check_count({"torch.ops.aten.div.Tensor": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_div_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_div_Tensor"]) .to_executorch() @@ -62,9 +60,7 @@ def test_fp32_div_single_input(self): Tester(self.DivSingleInput(), inputs) .export() .check_count({"torch.ops.aten.div.Tensor": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_div_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() 
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_div_Tensor"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/elu.py b/backends/xnnpack/test/ops/elu.py index 03bdfc508d..f976c29d79 100644 --- a/backends/xnnpack/test/ops/elu.py +++ b/backends/xnnpack/test/ops/elu.py @@ -28,9 +28,7 @@ def _test_elu(self, inputs): Tester(self.ELU(), inputs) .export() .check_count({"torch.ops.aten.elu.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_elu_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -42,17 +40,17 @@ def _test_elu(self, inputs): .run_method_and_compare_outputs() ) - @unittest.skip("T171810227 - Missing recomposition for ELU") + @unittest.skip("PyTorch Pin Update Required") def _test_fp16_elu(self): inputs = (torch.randn(1, 3, 3).to(torch.float16),) self._test_elu(inputs) - @unittest.skip("T171810227 - Missing recomposition for ELU") + @unittest.skip("PyTorch Pin Update Required") def _test_fp32_elu(self): inputs = (torch.randn(1, 3, 3),) self._test_elu(inputs) - @unittest.skip("T171810227 - Missing recomposition for ELU") + @unittest.skip("Update Quantizer to quantize Elu") def _test_qs8_elu(self): inputs = (torch.randn(1, 3, 4, 4),) ( @@ -61,9 +59,7 @@ def _test_qs8_elu(self): .export() .check_count({"torch.ops.aten.elu.default": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_elu_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -76,7 +72,7 @@ def _test_qs8_elu(self): .run_method_and_compare_outputs() ) - @unittest.skip("T171810227 - Missing recomposition for ELU") + @unittest.skip("Update Quantizer to quantize Elu") def _test_qs8_elu_functional(self): inputs = (torch.randn(1, 3, 4, 4),) ( @@ -85,9 +81,7 @@ def _test_qs8_elu_functional(self): .export() .check_count({"torch.ops.aten.elu.default": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_elu_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/floor.py b/backends/xnnpack/test/ops/floor.py index cb65ca2aa5..dfbe7fb18c 100644 --- a/backends/xnnpack/test/ops/floor.py +++ b/backends/xnnpack/test/ops/floor.py @@ -24,9 +24,7 @@ def _test_floor(self, inputs): Tester(self.Floor(), inputs) .export() .check_count({"torch.ops.aten.floor.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_floor_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_floor_default"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/hardswish.py b/backends/xnnpack/test/ops/hardswish.py index 7b54810f03..899a119ed4 100644 --- a/backends/xnnpack/test/ops/hardswish.py +++ b/backends/xnnpack/test/ops/hardswish.py @@ -28,11 +28,7 @@ def _test_hardswish(self, inputs): Tester(self.Hardswish(), inputs) .export() .check_count({"torch.ops.aten.hardswish.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_hardswish_default": 1} - ) - .partition() + .to_edge_transform_and_lower() 
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -44,28 +40,21 @@ def _test_hardswish(self, inputs): .run_method_and_compare_outputs() ) - @unittest.skip("T158969708 - Missing recomposition pass for hardswish") - def _test_fp16_hardswish(self): + def test_fp16_hardswish(self): inputs = (torch.randn(1, 3, 3).to(torch.float16),) self._test_hardswish(inputs) - @unittest.skip("T158969708 - Missing recomposition pass for hardswish") - def _test_fp32_hardswish(self): + def test_fp32_hardswish(self): inputs = (torch.randn(1, 3, 3),) self._test_hardswish(inputs) - @unittest.skip("T158969708 - Missing recomposition pass for hardswish") - def _test_fp32_hardswish_functional(self): + def test_fp32_hardswish_functional(self): inputs = (torch.randn(1, 3, 3),) ( Tester(self.HardswishFunctional(), inputs) .export() .check_count({"torch.ops.aten.hardswish.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_hardswish_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/hardtanh.py b/backends/xnnpack/test/ops/hardtanh.py index d13624663c..e35e840e3c 100644 --- a/backends/xnnpack/test/ops/hardtanh.py +++ b/backends/xnnpack/test/ops/hardtanh.py @@ -29,11 +29,7 @@ def test_fp32_hardtanh(self): Tester(self.HardTanh(), (input,)) .export() .check_count({"torch.ops.aten.hardtanh.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_hardtanh_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) .to_executorch() @@ -48,11 +44,7 @@ def test_fp32_hardtanh_bound(self): Tester(self.HardTanh(-2.0, 2.0), (input,)) .export() .check_count({"torch.ops.aten.hardtanh.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_hardtanh_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) .to_executorch() @@ -74,11 +66,7 @@ def test_qs8_hardtanh(self): torch.ops.aten.hardtanh.default: 1, } ) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_hardtanh_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/leaky_relu.py b/backends/xnnpack/test/ops/leaky_relu.py index ae5f2e3197..32f7348697 100644 --- a/backends/xnnpack/test/ops/leaky_relu.py +++ b/backends/xnnpack/test/ops/leaky_relu.py @@ -30,11 +30,7 @@ def _test_leaky_relu(self, module, inputs): Tester(module, inputs) .export() .check_count({"torch.ops.aten.leaky_relu.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_leaky_relu_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -62,11 +58,7 @@ def test_fp32_leaky_relu_functional(self): Tester(self.LeakyReLUFunctional(), inputs) .export() .check_count({"torch.ops.aten.leaky_relu.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_leaky_relu_default": 1} - ) - .partition() + .to_edge_transform_and_lower() 
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -79,7 +71,7 @@ def test_fp32_leaky_relu_functional(self): ) @unittest.skip("T172863987 - Missing quantizer support.") - def test_qs8_leaky_relu(self): + def _test_qs8_leaky_relu(self): inputs = (torch.randn(1, 3, 3),) ( Tester(self.LeakyReLU(negative_slope=0.2), inputs) @@ -91,11 +83,7 @@ def test_qs8_leaky_relu(self): "quantized_decomposed::quantize_per_tensor": 3, } ) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_leaky_relu_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -109,7 +97,7 @@ def test_qs8_leaky_relu(self): ) @unittest.skip("T172863987 - Missing quantizer support.") - def test_qs8_leaky_relu_default_slope(self): + def _test_qs8_leaky_relu_default_slope(self): """ The leaky_relu visitor has logic to handle the default slope, since it's apparently not passed through on export. This test ensures that this matches the eager mode behavior. @@ -126,11 +114,7 @@ def test_qs8_leaky_relu_default_slope(self): "quantized_decomposed::quantize_per_tensor": 3, } ) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_leaky_relu_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/linear.py b/backends/xnnpack/test/ops/linear.py index 2ce1c2d3c3..d8de79f283 100644 --- a/backends/xnnpack/test/ops/linear.py +++ b/backends/xnnpack/test/ops/linear.py @@ -10,20 +10,183 @@ from typing import Optional, Tuple import torch -from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( - XnnpackDynamicallyQuantizedPartitioner, +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, ) +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.test.tester import Quantize, Tester -from executorch.backends.xnnpack.test.tester.tester import Partition +from executorch.backends.xnnpack.test.tester.tester import ( + Partition, + ToEdgeTransformAndLower, +) from torch.ao.quantization.quantizer.xnnpack_quantizer import ( get_symmetric_quantization_config, ) from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import QuantizationConfig +try: + from torchao.quantization.quant_api import ( + int8_dynamic_activation_int4_weight, + quantize_, + unwrap_tensor_subclass, + ) + + torchao_installed = True +except: + torchao_installed = False + + +# Pytorch Modules Used for Testing +class BaseLinear(torch.nn.Module): + def __init__( + self, + in_size: int = 2, + input_channels: int = 4, + output_channels: int = 4, + dtype: torch.dtype = torch.float, + use_bias: bool = False, + ): + super().__init__() + self.linear = torch.nn.Linear( + input_channels, output_channels, bias=use_bias + ).to(dtype=dtype) + + self.ic = input_channels + self.oc = output_channels + + assert dtype in [torch.float, torch.half], "Unsupported op dtype" + self.op_dtype = dtype + self.in_size = in_size + + def forward(self, x): + return self.linear(x) + + def get_inputs(self): + return (torch.randn(1, self.in_size, self.ic).to(self.op_dtype),) + + +class AddMMModule(torch.nn.Module): + def __init__(self, in_size, out_size): + super().__init__() + self.mat = torch.nn.Parameter(torch.randn(in_size, out_size)) + self.bias = torch.nn.Parameter(torch.randn(1, out_size)) + + 
def forward(self, x): + return torch.addmm(self.bias, x, self.mat) + + +class LinearReluModule(torch.nn.Module): + def __init__(self, in_size, out_size, use_bias, dtype=torch.float): + super().__init__() + self.dtype = dtype + self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias).to(dtype=dtype) + + def forward(self, x): + return torch.nn.functional.relu(self.linear(x)) + + def get_inputs(self): + return (torch.randn(1, self.in_size, self.ic).to(self.op_dtype),) + + +class LinearParallelSequentialModule(torch.nn.Module): + def __init__( + self, + in_size=2, + input_size=4, + intermediate_size=5, + output_size=3, + dtype=torch.float, + ): + super().__init__() + self.linear1_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear2_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear2_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear3_weight = torch.nn.Parameter( + torch.rand(output_size, intermediate_size) + ) + self.linear3_bias = torch.nn.Parameter(torch.rand(output_size)) + self.in_size = in_size + self.input_size = input_size + self.dtype = torch.float + + def forward(self, x, y): + a = torch.nn.functional.linear(x, self.linear1_weight, self.linear1_bias) + b = torch.nn.functional.linear(y, self.linear2_weight, self.linear2_bias) + c = torch.nn.functional.linear(b, self.linear3_weight, self.linear3_bias) + return (a, c) + + def get_inputs(self): + return ( + torch.rand(self.in_size, self.input_size, dtype=self.dtype), + torch.rand(self.in_size, self.input_size, dtype=self.dtype), + ) + + +class LinearSequential(torch.nn.Module): + def __init__( + self, + in_size=2, + input_size=4, + intermediate_size=5, + output_size=3, + dtype=torch.float, + ): + super().__init__() + self.linear1_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear2_weight = torch.nn.Parameter( + torch.rand(output_size, intermediate_size) + ) + self.linear2_bias = torch.nn.Parameter(torch.rand(output_size)) + self.in_size = in_size + self.input_size = input_size + self.dtype = torch.float + + def forward(self, x): + a = torch.nn.functional.linear(x, self.linear1_weight, self.linear1_bias) + b = torch.nn.functional.linear(a, self.linear2_weight, self.linear2_bias) + return b + + def get_inputs(self): + return (torch.rand(self.in_size, self.input_size, dtype=torch.float),) + class TestLinear(unittest.TestCase): + """ + Test Class for XNNPACK Linear Operators. + + Notes: + - XNNPACK Does not support Per Tensor Quantized Weights with Dynamic Activations + - XNNPACK Only supports Per-Token Activation, so Dynamic per-tensor Quantization + As done by the default dynamic quantization flow does Per-Token Quantization + Activation under the hood, where the torch.nn.Module is doing Per-Tensor Quantization + on the Activation. This is sufficient because Per-Token Quantization on Activations + should produce strictly better results compared to Per-Tensor Quantization + """ + + @staticmethod + def _get_4b_dqconfig() -> QuantizationConfig: + # Returns a QuantizationConfig for 4b dynamic quantization for XNNPACK. 
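A note on `_get_4b_dqconfig` (now a plain staticmethod near the top of the class): 4-bit dynamic quantization is expressed by narrowing the weight range of the standard per-channel dynamic config to [-8, 7]. A sketch of how such a config is typically attached to the XNNPACKQuantizer; the prepare/convert step is implied and not shown in this hunk.

from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

# Per-channel, dynamic-activation quantization with 4-bit weights
# (weight_qmin/weight_qmax clamp the weight range to [-8, 7]).
qconfig = get_symmetric_quantization_config(
    is_per_channel=True,
    is_dynamic=True,
    weight_qmin=-8,
    weight_qmax=7,
)
quantizer = XNNPACKQuantizer().set_global(qconfig)
# The quantizer then feeds the usual prepare_pt2e/convert_pt2e flow before
# lowering with to_edge_transform_and_lower().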
+ qconfig: QuantizationConfig = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=True, + weight_qmin=-8, + weight_qmax=7, + ) + return qconfig + def test_fp16_linear(self): for use_bias in (True, False): for num_batch_dims in range(1, 3): @@ -61,33 +224,13 @@ def test_qc8_linear(self): ) def test_fp32_addmm(self): - """ - Note that the ConvertToLinear pass requires the weight matrix to be transposed. - """ - - class AddMMModule(torch.nn.Module): - def __init__(self, in_size, out_size): - super().__init__() - self.mat = torch.nn.Parameter(torch.randn(out_size, in_size)) - self.bias = torch.nn.Parameter(torch.randn(1, out_size)) - - def forward(self, x): - return torch.addmm(self.bias, x, torch.transpose(self.mat, 0, 1)) - + # Note that the ConvertToLinear pass requires the weight matrix to be transposed. self._test_linear( lambda in_size, out_size: AddMMModule(in_size, out_size), uses_bias=True, ) def test_fp32_linear_fused_relu(self): - class LinearReluModule(torch.nn.Module): - def __init__(self, in_size, out_size, use_bias): - super().__init__() - self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) - - def forward(self, x): - return torch.nn.functional.relu(self.linear(x)) - for use_bias in (True, False): for num_batch_dims in range(1, 3): self._test_linear( @@ -101,14 +244,6 @@ def forward(self, x): ) def test_qs8_linear_fused_relu(self): - class LinearReluModule(torch.nn.Module): - def __init__(self, in_size, out_size, use_bias): - super().__init__() - self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) - - def forward(self, x): - return torch.nn.functional.relu(self.linear(x)) - for use_bias in (True, False): for num_batch_dims in range(1, 3): self._test_linear( @@ -134,21 +269,6 @@ def test_qs8_linear(self): quant_type="per_tensor", ) - @unittest.skip("XNNPACK currently only supports per-channel dynamic quantization.") - def _test_qd8_per_tensor_linear(self): - for uses_bias in (False, True): - inputs = (torch.randn(2, 4),) - module = torch.nn.Linear(4, 5, bias=uses_bias) - dynamic_shapes = ({0: torch.export.Dim("batch", max=100)},) - - self._test_dqlinear( - module, - inputs, - dynamic_shapes=dynamic_shapes, - is_per_channel=False, - uses_bias=uses_bias, - ) - def test_qd8_per_channel_linear(self): for uses_bias in (False, True): inputs = (torch.randn(2, 4),) @@ -162,19 +282,6 @@ def test_qd8_per_channel_linear(self): uses_bias=uses_bias, ) - @staticmethod - def _get_4b_dqconfig() -> QuantizationConfig: - """ - Returns a QuantizationConfig for 4b dynamic quantization for XNNPACK. 
- """ - qconfig: QuantizationConfig = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=True, - weight_qmin=-8, - weight_qmax=7, - ) - return qconfig - def test_qd8_per_channel_4w_linear(self): qconfig = self._get_4b_dqconfig() input_channels = [2, 63] @@ -263,38 +370,12 @@ def test_qd8_per_channel_linear_with_two_batch(self): ) def test_qd8_per_channel_linear_sequential(self): - in_size = 2 - input_size = 4 - intermediate_size = 5 - output_size = 3 - - class LinearSequential(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear1_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear2_weight = torch.nn.Parameter( - torch.rand(output_size, intermediate_size) - ) - self.linear2_bias = torch.nn.Parameter(torch.rand(output_size)) - - def forward(self, x): - a = torch.nn.functional.linear( - x, self.linear1_weight, self.linear1_bias - ) - b = torch.nn.functional.linear( - a, self.linear2_weight, self.linear2_bias - ) - return b - - inputs = (torch.rand(in_size, input_size, dtype=torch.float),) + lin_mod = LinearSequential() + inputs = lin_mod.get_inputs() dynamic_shapes = ({0: torch.export.Dim("batch", max=100)},) self._test_dqlinear( - LinearSequential(), + lin_mod, inputs, dynamic_shapes=dynamic_shapes, linear_count=2, @@ -303,53 +384,16 @@ def forward(self, x): atol=1e-1, ) - def test_qd8_per_channel_linear_parellel_and_sequential(self): - in_size = 2 - input_size = 4 - intermediate_size = 5 - output_size = 3 - - class LinearModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear1_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear2_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear2_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear3_weight = torch.nn.Parameter( - torch.rand(output_size, intermediate_size) - ) - self.linear3_bias = torch.nn.Parameter(torch.rand(output_size)) - - def forward(self, x, y): - a = torch.nn.functional.linear( - x, self.linear1_weight, self.linear1_bias - ) - b = torch.nn.functional.linear( - y, self.linear2_weight, self.linear2_bias - ) - c = torch.nn.functional.linear( - b, self.linear3_weight, self.linear3_bias - ) - return (a, c) - - inputs = ( - torch.rand(in_size, input_size, dtype=torch.float), - torch.rand(in_size, input_size, dtype=torch.float), - ) + def test_qd8_per_channel_linear_parallel_and_sequential(self): + lin_mod = LinearParallelSequentialModule() + inputs = lin_mod.get_inputs() dynamic_shapes = ( {0: torch.export.Dim("batch", max=100)}, {0: torch.export.Dim("batch2", max=100)}, ) self._test_dqlinear( - LinearModule(), + lin_mod, inputs, dynamic_shapes=dynamic_shapes, linear_count=3, @@ -358,413 +402,59 @@ def forward(self, x, y): atol=1e-1, ) - class ManualDQLinear(torch.nn.Module): - def __init__( - self, - input_channels: int = 4, - output_channels: int = 4, - dtype: torch.dtype = torch.float, - weight_n_bit: int = 4, - group_size: int = 0, - force_groupwise_quant: bool = False, - use_bias: bool = False, - ): - super().__init__() - - self.ic = input_channels - self.oc = output_channels - - assert dtype in [torch.float, torch.half], "Unsupported op dtype" - self.op_dtype = dtype - - self.group_size = self.ic if group_size == 0 else group_size - self.num_groups = 1 - if self.group_size != self.ic: 
- assert self.ic % self.group_size == 0 - assert self.group_size % 8 == 0 # TODO make this 16 - self.num_groups = self.ic // self.group_size - - assert weight_n_bit in [4, 8], "Unsupported weight_n_bit" - self.w_n_bit = weight_n_bit - self.w_quant_min, self.w_quant_max = self.get_min_max(self.w_n_bit) - - self.w = torch.nn.Parameter( - torch.randn(self.oc, self.ic), requires_grad=False - ) - self.w_q = torch.nn.Parameter( - torch.zeros(self.oc, self.ic), requires_grad=False - ) - # Quantize the weights as per folded setup - if self.group_size != self.ic or force_groupwise_quant: - self.w_scales = torch.nn.Parameter( - torch.zeros(self.oc, self.num_groups), requires_grad=False - ) - self.w_zero_points = torch.nn.Parameter( - torch.zeros(self.oc, self.num_groups), requires_grad=False - ) - self.quant_weight_per_channel_group() - else: # per_channel quantization - self.w_scales = torch.nn.Parameter( - torch.zeros(self.oc), requires_grad=False - ) - self.w_zero_points = torch.nn.Parameter( - torch.zeros(self.oc), requires_grad=False - ) - self.quant_weight_per_channel() - - self.bias = ( - torch.nn.Parameter( - torch.randn(self.oc).to(self.op_dtype), requires_grad=False - ) - if use_bias - else None - ) - - def get_min_max(self, n_bit: int = 4): - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - return min_int, max_int - - def get_channel_qparams_symmetric( - self, - w: torch.Tensor, - n_bit: int = 4, - precision: torch.dtype = torch.float32, - ): - assert w.dim() == 2 - - to_quant = w.to(precision) - assert torch.isnan(to_quant).sum() == 0 - - max_val = to_quant.amax(dim=1, keepdim=True) - min_val = to_quant.amin(dim=1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - - min_int, max_int = self.get_min_max(n_bit) - - max_val_abs = torch.max(-min_val_neg, max_val_pos) - scales = max_val_abs / (float(max_int - min_int) / 2) - scales = torch.max( - scales, torch.full_like(scales, torch.finfo(torch.float32).eps) - ) - zeros = torch.full_like(scales, 0) - return scales.to(precision).reshape(w.shape[0]), zeros.to( - precision - ).reshape(w.shape[0]).reshape(w.shape[0]) - - # Note: not using from torchao.quantization.quant_primitives because it will run into op registraion issues - def get_group_qparams_symmetric( - self, w, n_bit=4, groupsize=128, precision=torch.float32 - ): - # needed for GPTQ with padding - if groupsize > w.shape[-1]: - groupsize = w.shape[-1] - assert groupsize > 1 - assert w.shape[-1] % groupsize == 0 - assert w.dim() == 2 - - to_quant = w.reshape(-1, groupsize) - assert torch.isnan(to_quant).sum() == 0 - - max_val = to_quant.amax(dim=1, keepdim=True) - min_val = to_quant.amin(dim=1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - - max_val_abs = torch.max(-min_val_neg, max_val_pos) - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - - scales = max_val_abs / (float(max_int - min_int) / 2) - scales = torch.max( - scales, torch.full_like(scales, torch.finfo(torch.float32).eps) - ) - # TODO: make sure abs(scales) is not too small? 
- zeros = torch.full_like(scales, 0) - return scales.to(precision).reshape(w.shape[0], -1), zeros.to( - precision - ).reshape(w.shape[0], -1) - - # Note: not using from torchao.quantization.quant_primitives because it will run into op registraion issues - def group_quantize_tensor_symmetric( - self, w, n_bit=4, group_size=128, precision=torch.float32 - ): - scales, zeros = self.get_group_qparams_symmetric( - w, n_bit, group_size, precision - ) - n_bit = 4 - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - # TODO: currently we don't know how to express torch.int4, we'll - # add torch.int4 to core later - w_int8 = torch.ops.quantized_decomposed.quantize_per_channel_group( - w, scales, zeros, min_int, max_int, torch.int8, group_size - ) - - return w_int8, scales, zeros - - def fwd_input_per_token(self, input: torch.Tensor) -> torch.Tensor: - ip_quant_min = -128 - ip_quant_max = 127 - input = input.to(self.op_dtype) - ( - ip_scales, - ip_zero_points, - ) = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric( - input, torch.int8 - ) - - input = torch.ops.quantized_decomposed.quantize_per_token( - input, - ip_scales, - ip_zero_points, - ip_quant_min, - ip_quant_max, - torch.int8, - ) - input = torch.ops.quantized_decomposed.dequantize_per_token( - input, - ip_scales, - ip_zero_points, - ip_quant_min, - ip_quant_max, - torch.int8, - self.op_dtype, - ) - input = input.to(self.op_dtype) - return input - - def quant_weight_per_channel(self): - ( - self.w_scales.data, - self.w_zero_points.data, - ) = self.get_channel_qparams_symmetric( - self.w, n_bit=self.w_n_bit, precision=self.op_dtype - ) - self.w_q.data = torch.ops.quantized_decomposed.quantize_per_channel( - self.w, - self.w_scales, - self.w_zero_points, - axis=0, - quant_min=self.w_quant_min, - quant_max=self.w_quant_max, - dtype=torch.int8, - ) - - def quant_weight_per_channel_group(self): - self.w_q.data, w, zp = self.group_quantize_tensor_symmetric( - self.w, - n_bit=self.w_n_bit, - group_size=self.group_size, - ) - expected_min, expected_max = self.get_min_max(self.w_n_bit) - assert ( - torch.min(self.w_q.data) >= expected_min - ), "Found smaller than min element in quantized weight tensor" - assert ( - torch.max(self.w_q.data) <= expected_max - ), "Found larger than max element in quantized weight tensor" - assert ( - w.ndim == 2 and zp.ndim == 2 - ), f"Expecting 2d scales and zp tensors, but got {w.shape}, {zp.shape}" - self.w_scales.data, self.w_zero_points.data = w, zp - - def fwd_weight_per_channel(self) -> torch.Tensor: - # This is HACKY because the dequant will produce fp32 - return torch.ops.quantized_decomposed.dequantize_per_channel( - self.w_q, - self.w_scales, - self.w_zero_points, - axis=0, - quant_min=self.w_quant_min, - quant_max=self.w_quant_max, - dtype=torch.int8, # Regardless of w_n_bit, convert to 4b later - ) - - def fwd_weight_per_channel_group(self) -> torch.Tensor: - return torch.ops.quantized_decomposed.dequantize_per_channel_group( - self.w_q, - self.w_scales, - self.w_zero_points, - self.w_quant_min, - self.w_quant_max, - dtype=torch.int8, # Regardless of w_n_bit, convert to 4b later - group_size=self.group_size, - output_dtype=self.op_dtype, - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - # Input - input = self.fwd_input_per_token(input).to(self.op_dtype) - - # Weights - w = ( - self.fwd_weight_per_channel_group() - if self.w_scales.ndim == 2 - else self.fwd_weight_per_channel() - ).to(self.op_dtype) - assert isinstance(w, torch.Tensor) - return 
torch.nn.functional.linear(input, w, self.bias) - - def _test_manual_dq_linear( - self, - mod: torch.nn.Module, - inputs: Tuple[torch.Tensor], - weight_groupwise: bool = False, - use_bias: bool = False, - atol: float = 1e-3, - rtol: float = 1e-3, - ): - linear_edge_op = ( - "executorch_exir_dialects_edge__ops_aten_addmm_default" - if use_bias - else "executorch_exir_dialects_edge__ops_aten_mm_default" - ) - - weight_dq_edge_op = ( - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_group_default" - if weight_groupwise - else "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default" - ) - - ( - Tester(mod, inputs) - .export() - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_quantized_decomposed_choose_qparams_per_token_asymmetric_default": 1, - "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_token_default": 1, - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_token_default": 1, - weight_dq_edge_op: 1, - linear_edge_op: 1, - } - ) - .partition(Partition(partitioner=XnnpackDynamicallyQuantizedPartitioner())) - .check_count( - { - "torch.ops.higher_order.executorch_call_delegate": 1, - } - ) - .check_not( - [ - "executorch_exir_dialects_edge__ops_quantized_decomposed_choose_qparams_per_token_asymmetric_default", - "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_token_default", - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_token_default", - weight_dq_edge_op, - linear_edge_op, - ] - ) - .to_executorch() - .serialize() - .run_method_and_compare_outputs(atol=atol, rtol=rtol) - ) - - def _run_manual_dqlinear_tests(self, weight_n_bit: int, op_dtype: torch.dtype): - in_sizes = [1, 4, 4] - input_sizes = [4, 37, 17] - output_sizes = [4, 17, 37] - - for use_bias in [True, False]: - for i, _ in enumerate(in_sizes): - in_size = int(in_sizes[i]) - input_size = int(input_sizes[i]) - output_size = int(output_sizes[i]) - mod = self.ManualDQLinear( - input_channels=input_size, - output_channels=output_size, - weight_n_bit=weight_n_bit, - dtype=op_dtype, - use_bias=use_bias, - ) - - inputs = (torch.randn(1, in_size, input_size).to(op_dtype),) - self._test_manual_dq_linear(mod, inputs, use_bias=use_bias) - - def test_qd8_fp32_per_token_weight_per_channel_int8(self): - self._run_manual_dqlinear_tests(8, torch.float) - - def test_qd8_fp32_per_token_weight_per_channel_int4(self): - self._run_manual_dqlinear_tests(4, torch.float) - - # This fails because the output tensor dtype is different, but if you squint and ignore that and look at the values, - # it is not too bad. - # Difference: max: 0.042601585388183594, abs: 0.042601585388183594. - # -- Model vs. 
Reference -- - # Numel: 68, 68 - # Median: -0.7754800915718079, -0.7755751013755798 - # Mean: -0.6128872036933899, -0.6143574714660645 - # Max: 12.518657684326172, 12.516003608703613 - # Min: -20.070953369140625, -20.077701568603516 - @unittest.skip("Need to fix the dq_per_channel output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_int8(self): - self._run_manual_dqlinear_tests(8, torch.float16) - - @unittest.skip("Need to fix the dq_per_channel output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_int4(self): - self._run_manual_dqlinear_tests(4, torch.float16) - + @unittest.skipIf( + not torchao_installed, "Per Channel Group Quantization Required TorchAO" + ) def test_qd8_fp32_per_token_weight_per_channel_group_int4(self): M_sizes = [1, 2, 17, 31] - K_sizes = [8, 32, 64, 128] - bl_sizes = [8, 16, 16, 32] + K_sizes = [32, 32, 64, 128] + bl_sizes = [32, 32, 32, 64] N_sizes = [2, 17, 92, 128] for use_bias in [True, False]: - for i, _ in enumerate(M_sizes): - M = int(M_sizes[i]) - K = int(K_sizes[i]) - N = int(N_sizes[i]) - bl = int(bl_sizes[i]) - mod = self.ManualDQLinear( + for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes): + lin_mod = BaseLinear( input_channels=K, output_channels=N, - weight_n_bit=4, dtype=torch.float, - group_size=bl, - force_groupwise_quant=True, use_bias=use_bias, ) inputs = (torch.randn(1, M, K),) - self._test_manual_dq_linear( - mod, - inputs, - weight_groupwise=True, - use_bias=use_bias, + self._test_groupwise_dq_linear( + lin_mod, inputs, group_size=bl, use_bias=use_bias ) + @unittest.skipIf( + not torchao_installed, "Per Channel Group Quantization Required TorchAO" + ) def test_qd8_fp16_per_token_weight_per_channel_group_int4(self): M_sizes = [1, 2, 17, 31] - K_sizes = [8, 32, 64, 128] - bl_sizes = [8, 16, 16, 32] + K_sizes = [32, 32, 64, 128] + bl_sizes = [32, 32, 32, 64] N_sizes = [2, 17, 92, 128] for use_bias in [True, False]: - for i, _ in enumerate(M_sizes): - M = int(M_sizes[i]) - K = int(K_sizes[i]) - N = int(N_sizes[i]) - bl = int(bl_sizes[i]) - mod = self.ManualDQLinear( + for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes): + lin_mod = BaseLinear( + in_size=M, input_channels=K, output_channels=N, - weight_n_bit=4, dtype=torch.float16, - group_size=bl, - force_groupwise_quant=True, use_bias=use_bias, ) - inputs = (torch.randn(1, M, K, dtype=torch.float16),) - self._test_manual_dq_linear( - mod, - inputs, - weight_groupwise=True, - use_bias=use_bias, - atol=0.1, - rtol=0.1, + inputs = lin_mod.get_inputs() + # This requires slightly higher atol, but if you look at error it is not that bad: + # Difference: max: 0.00140380859375, abs: 0.00140380859375, mean abs error: 0.00042724609375. + # -- Model vs. 
Reference -- + # Numel: 4, 4 + # Median: -0.05023193359375, -0.0516357421875 + # Mean: 0.2373046875, 0.237060546875 + # Max: 1.0078125, 1.0078125 + # Min: -0.08465576171875, -0.08441162109375 + self._test_groupwise_dq_linear( + lin_mod, inputs, group_size=bl, use_bias=use_bias, atol=1e-2 ) def _test_linear( @@ -776,23 +466,30 @@ def _test_linear( dtype: torch.dtype = torch.float, atol=1e-03, ): - aten_op, edge_op = ( - ( - "aten.addmm.default", - "executorch_exir_dialects_edge__ops_aten_addmm_default", - ) + edge_op = ( + "executorch_exir_dialects_edge__ops_aten_addmm_default" if uses_bias - else ( - "aten.mm.default", - "executorch_exir_dialects_edge__ops_aten_mm_default", - ) + else "executorch_exir_dialects_edge__ops_aten_mm_default" ) in_sizes = [3, 4, 4] input_sizes = [4, 37, 17] output_sizes = [4, 17, 37] - quant = quant_type is not None + quant_config = None + if quant_type is not None: + if quant_type == "per_channel": + quant_config = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=False, + ) + elif quant_type == "per_tensor": + quant_config = get_symmetric_quantization_config( + is_per_channel=False, + is_dynamic=False, + ) + else: + raise ValueError(f"Unsupported quant type {quant_type}") """ Note that torch.nn.Linear maps to aten.mm.default (no bias) or aten.addmm.default (bias), @@ -803,7 +500,6 @@ def _test_linear( input_size = int(input_sizes[i]) output_size = int(output_sizes[i]) input_shape = [in_size] * num_batch_dims + [input_size] - print(f"Testing input_shape {input_shape} with {output_size} out_channels") module = make_module(input_size, output_size).eval().to(dtype) inputs = (torch.randn(input_shape).to(dtype),) @@ -812,42 +508,41 @@ def _test_linear( dynamic_shape[i] = torch.export.Dim(f"batch{i}", min=2, max=in_size) dynamic_shape = (dynamic_shape,) - print(dynamic_shape) - tester = Tester(module, inputs, dynamic_shapes=dynamic_shape) + for legacy_mode in (True, False): + tester = Tester(module, inputs, dynamic_shapes=dynamic_shape) - if quant: - if quant_type == "per_channel": - quant_config = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=False, - ) - elif quant_type == "per_tensor": - quant_config = get_symmetric_quantization_config( - is_per_channel=False, - is_dynamic=False, - ) - else: - raise ValueError(f"Unsupported quant type {quant_type}") - tester.quantize(Quantize(quantization_config=quant_config)) - - tester.export() - if quant: - tester.check(["torch.ops.quantized_decomposed"]) + if quant_config: + tester.quantize(Quantize(quantization_config=quant_config)) - tester.to_edge() - tester.check_count({edge_op: 1}) + tester.export() + if quant_config: + tester.check(["torch.ops.quantized_decomposed"]) - tester.partition() - tester.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - tester.check_not([edge_op]) + if legacy_mode: + tester.to_edge() + tester.partition() + else: + tester.to_edge_transform_and_lower() - if quant: - tester.check_not([edge_op, "torch.ops.quantized_decomposed"]) + tester.check_count( + {"torch.ops.higher_order.executorch_call_delegate": 1} + ) + tester.check_not([edge_op]) + + if quant_config: + tester.check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) - tester.to_executorch() - tester.serialize() - tester.run_method_and_compare_outputs(qtol=quant, atol=atol) + tester.to_executorch() + tester.serialize() + tester.run_method_and_compare_outputs( + qtol=bool(quant_config), atol=atol + 
) def _test_dqlinear( self, @@ -860,36 +555,96 @@ def _test_dqlinear( qconfig: Optional[QuantizationConfig] = None, atol=5e-02, ): - aten_op, edge_op = ( - ( - "aten.addmm.default", - "executorch_exir_dialects_edge__ops_aten_addmm_default", - ) - if uses_bias - else ( - "aten.mm.default", - "executorch_exir_dialects_edge__ops_aten_mm_default", - ) - ) - quant_config = qconfig or get_symmetric_quantization_config( is_per_channel=is_per_channel, is_dynamic=True, ) + for legacy_partitioner in (True, False): + for per_op_mode in (True, False): + DynamicallyQuantizedPartitioner = XnnpackPartitioner( + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, + per_op_mode=per_op_mode, + ) - tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) - tester.quantize(Quantize(quantization_config=quant_config)) + tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) + tester.quantize(Quantize(quantization_config=quant_config)) + tester.export() - tester.export() - tester.to_edge() - tester.check_count({edge_op: linear_count}) + if legacy_partitioner: + tester.to_edge() + tester.partition(Partition(DynamicallyQuantizedPartitioner)) + else: + tester.to_edge_transform_and_lower( + ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) + ) + tester.check_count( + { + "torch.ops.higher_order.executorch_call_delegate": ( + linear_count if per_op_mode else 1 + ) + } + ) + tester.check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) - tester.partition( - Partition(partitioner=XnnpackDynamicallyQuantizedPartitioner()) + tester.to_executorch() + tester.serialize() + tester.run_method_and_compare_outputs(atol=atol) + + def _test_groupwise_dq_linear( + self, + mod: torch.nn.Module, + inputs: Tuple[torch.Tensor], + use_bias: bool = False, + group_size: int = 8, + num_linears: int = 1, + atol: float = 5e-3, + rtol: float = 5e-3, + ): + quantize_(mod, int8_dynamic_activation_int4_weight(group_size=group_size)) + unwrap_tensor_subclass(mod) + DynamicallyQuantizedPartitioner = XnnpackPartitioner( + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, + per_op_mode=True, + ) + tester = ( + Tester(mod, inputs) + .export() + .check_count( + { + "torch.ops.quant.choose_qparams_affine.default": 1 * num_linears, + "torch.ops.quant.quantize_affine.default": 1 * num_linears, + "torch.ops.quant.dequantize_affine.default": 2 * num_linears, + "torch.ops.aten.linear.default": 1 * num_linears, + } + ) + ) + ( + tester.to_edge_transform_and_lower( + ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) + ) ) - tester.check(["torch.ops.higher_order.executorch_call_delegate"]) - tester.check_not([edge_op]) - tester.to_executorch() - tester.serialize() - tester.run_method_and_compare_outputs(atol=atol) + ( + tester.check_count( + { + "torch.ops.higher_order.executorch_call_delegate": 1, + } + ) + .check_not( + [ + "executorch_exir_dialects_edge__ops_quant_choose_qparams_affine_default", + "executorch_exir_dialects_edge__ops_quant_quantize_affine_default", + "executorch_exir_dialects_edge__ops_quant_dequantize_affine_default", + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) + .to_executorch() + .serialize() + .run_method_and_compare_outputs(atol=atol, rtol=rtol) + ) diff --git a/backends/xnnpack/test/ops/lstm.py b/backends/xnnpack/test/ops/lstm.py new file mode 100644 index 0000000000..bfc6113c41 --- /dev/null +++ b/backends/xnnpack/test/ops/lstm.py @@ -0,0 
+1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +from executorch.backends.xnnpack.test.tester import Tester +from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower + + +class TestLSTM(unittest.TestCase): + class LSTMLinear(torch.nn.Module): + def __init__(self, input_size, hidden_size, out_size): + super().__init__() + self.lstm = torch.nn.LSTM( + input_size=input_size, hidden_size=hidden_size, batch_first=True + ) + self.linear = torch.nn.Linear(hidden_size, hidden_size) + self.linear2 = torch.nn.Linear(hidden_size, out_size) + + def forward(self, x): + x, hs = self.lstm(x) + x = self.linear(x[:, -1, :]) + x = self.linear2(x) + return torch.nn.functional.log_softmax(x, dim=1) + + def test_fp32_lstm(self): + ( + Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),)) + .export() + .to_edge_transform_and_lower() + .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) + .check_not( + ["p_lstm_weight", "p_lstm_bias"] + ) # These Should be Consumed by Delegate + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) + + def test_fp32_lstm_force_dynamic_linear(self): + ( + Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),)) + .export() + .to_edge_transform_and_lower( + ToEdgeTransformAndLower( + partitioners=[XnnpackPartitioner(force_fp32_dynamic_linear=True)] + ) + ) + .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) + # Weights are supplied as input to linears + .check(["p_lstm_weight_hh_l0", "p_lstm_weight_ih_l0"]) + # Biases are owned by delegates + .check_not(["p_lstm_bias"]) + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) diff --git a/backends/xnnpack/test/ops/max_dim.py b/backends/xnnpack/test/ops/max_dim.py index e16a4f8b15..c660a5a6d2 100644 --- a/backends/xnnpack/test/ops/max_dim.py +++ b/backends/xnnpack/test/ops/max_dim.py @@ -13,14 +13,12 @@ class TestMaxDim(unittest.TestCase): class Max(torch.nn.Module): def forward(self, x): - x = torch.add(x, x) max_values_1, max_indices_1 = torch.max(x, dim=2, keepdim=True) max_values_2, max_indices_2 = torch.max(x, dim=3, keepdim=True) return (max_values_1, max_indices_1, max_values_2, max_indices_2) class MaxNoIndices(torch.nn.Module): def forward(self, x): - x = torch.add(x, x) max_values_1, _ = torch.max(x, dim=2, keepdim=True) max_values_2, _ = torch.max(x, dim=3, keepdim=True) return (max_values_1, max_values_2) @@ -30,39 +28,36 @@ def _test_max_dim(self, inputs): Tester(self.Max(), inputs) .export() .check_count({"torch.ops.aten.max.dim": 2}) - .to_edge() + .to_edge_transform_and_lower() + .check_not(["torch.ops.higher_order.executorch_call_delegate"]) .check_count({"executorch_exir_dialects_edge__ops_aten_max_dim": 2}) - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 2}) + ) + + def _test_max_dim_no_indicies(self, inputs): + ( + Tester(self.MaxNoIndices(), inputs) + .export() + .check_count({"torch.ops.aten.max.dim": 2}) + .to_edge_transform_and_lower() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_max_dim"]) .to_executorch() .serialize() .run_method_and_compare_outputs() ) - @unittest.skip("T171468483 - Fails 
to partition due to index output dtype.") - def _test_fp16_max_dim(self): + def test_fp16_max_dim_with_indicies(self): inputs = (torch.randn(16, 3, 12, 12).to(torch.float16),) self._test_max_dim(inputs) - @unittest.skip("T171468483 - Fails to partition due to index output dtype.") - def _test_fp32_max_dim(self): + def test_fp32_max_dim_with_indices(self): inputs = (torch.randn(16, 3, 12, 12),) self._test_max_dim(inputs) - @unittest.skip("T171468483 - Fails to partition due to index output dtype.") - def _test_fp32_max_dim_no_indices(self): + def test_fp32_max_dim_no_indices(self): inputs = (torch.randn(16, 3, 12, 12),) - ( - Tester(self.MaxNoIndices(), inputs) - .export() - .check_count({"torch.ops.aten.max.dim": 2}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_max_dim": 2}) - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 2}) - .check_not(["executorch_exir_dialects_edge__ops_aten_max_dim"]) - .to_executorch() - .serialize() - .run_method_and_compare_outputs() - ) + self._test_max_dim_no_indicies(inputs) + + def test_fp16_max_dim_no_indices(self): + inputs = (torch.randn(16, 3, 12, 12).to(torch.float16),) + self._test_max_dim_no_indicies(inputs) diff --git a/backends/xnnpack/test/ops/maximum.py b/backends/xnnpack/test/ops/maximum.py index feff02744d..30dfa5503a 100644 --- a/backends/xnnpack/test/ops/maximum.py +++ b/backends/xnnpack/test/ops/maximum.py @@ -23,9 +23,7 @@ def _test_maximum(self, inputs): Tester(self.Maximum(), inputs) .export() .check_count({"torch.ops.aten.maximum.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_maximum_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_maximum_default"]) .to_executorch() @@ -56,9 +54,7 @@ def test_fp32_maximum_broadcast(self): Tester(self.Maximum(), inputs) .export() .check_count({"torch.ops.aten.maximum.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_maximum_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_maximum_default"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/maxpool2d.py b/backends/xnnpack/test/ops/maxpool2d.py index bbc76743b0..1031852176 100644 --- a/backends/xnnpack/test/ops/maxpool2d.py +++ b/backends/xnnpack/test/ops/maxpool2d.py @@ -55,14 +55,7 @@ def _test_maxpool2d(self, inputs): Tester(self.MaxPool2d(3, 1, 0, 1), inputs) .export() .check_count({"torch.ops.aten.max_pool2d.default": 1}) - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten_max_pool2d_with_indices_default": 1, - } - ) - .check(["getitem"]) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -91,13 +84,7 @@ def test_fp32_maxpool2d_unsupported(self): Tester(self.MaxPool2dUnsupported(), inputs) .export() .check_count({"torch.ops.aten.max_pool2d_with_indices.default": 1}) - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten_max_pool2d_with_indices_default": 1 - } - ) - .partition() + .to_edge_transform_and_lower() # We expect it not be be delegated. 
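Returning to the linear tests rewritten earlier in this patch: the new _test_groupwise_dq_linear helper replaces the hand-rolled ManualDQLinear with torchao's quantize_ / int8_dynamic_activation_int4_weight / unwrap_tensor_subclass flow before lowering through the dynamic-quant XNNPACK partitioner. A hedged sketch of that flow outside the Tester harness follows; the torchao and ConfigPrecisionType import paths are assumptions based on the pinned versions, and the 64/256 layer sizes and group size of 32 are illustrative.

import torch
from torchao.quantization.quant_api import int8_dynamic_activation_int4_weight, quantize_
from torchao.utils import unwrap_tensor_subclass
from executorch.backends.xnnpack.partition.config.xnnpack_config import ConfigPrecisionType
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower

model = torch.nn.Sequential(torch.nn.Linear(64, 256)).eval()
inputs = (torch.randn(1, 8, 64),)

# 8-bit dynamic activations + 4-bit group-wise weights, as in _test_groupwise_dq_linear
quantize_(model, int8_dynamic_activation_int4_weight(group_size=32))
unwrap_tensor_subclass(model)

ep = torch.export.export(model, inputs)
lowered = to_edge_transform_and_lower(
    ep,
    partitioner=[
        XnnpackPartitioner(
            config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, per_op_mode=True
        )
    ],
).to_executorch()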
.check_count( { @@ -115,13 +102,7 @@ def test_fp32_maxpool2d_unsupported_ceilmode(self): Tester(self.MaxPool2dUnsupportedCeilMode(), inputs) .export() .check_count({"torch.ops.aten.max_pool2d.default": 1}) - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten_max_pool2d_with_indices_default": 1 - } - ) - .partition() + .to_edge_transform_and_lower() # We expect it not be be delegated. .check_count({"torch.ops.higher_order.executorch_call_delegate": 0}) .check_count( @@ -153,13 +134,7 @@ def forward(self, x): .export() .check_count({"torch.ops.aten.max_pool2d.default": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten_max_pool2d_with_indices_default": 1 - } - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/mean_dim.py b/backends/xnnpack/test/ops/mean_dim.py index 750b0e8f50..3bac5f3239 100644 --- a/backends/xnnpack/test/ops/mean_dim.py +++ b/backends/xnnpack/test/ops/mean_dim.py @@ -26,9 +26,7 @@ def _test_mean_dim(self, inputs): Tester(self.MeanDim((-1, -2)), inputs) .export() .check_count({"torch.ops.aten.mean.dim": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_mean_dim": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) .to_executorch() @@ -54,9 +52,20 @@ def test_fp32_mean_dim_unsupported(self): Tester(self.MeanDim((3)), inputs) .export() .check_count({"torch.ops.aten.mean.dim": 1}) - .to_edge() + .to_edge_transform_and_lower() .check_count({"executorch_exir_dialects_edge__ops_aten_mean_dim": 1}) - .partition() + ) + + def test_fp32_mean_dim_unsupported_3d(self): + """ + XNNPack mean.dim implementation only supports 4D tensors. 
+ """ + inputs = (torch.randn(1, 5, 4),) + ( + Tester(self.MeanDim((-1, -2)), inputs) + .export() + .check_count({"torch.ops.aten.mean.dim": 1}) + .to_edge_transform_and_lower() .check_count({"executorch_exir_dialects_edge__ops_aten_mean_dim": 1}) ) @@ -72,9 +81,7 @@ def test_qs8_mean_dim(self): torch.ops.quantized_decomposed.quantize_per_tensor.default: 3, } ) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_mean_dim": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/minimum.py b/backends/xnnpack/test/ops/minimum.py index 121fbeb185..406ac8485e 100644 --- a/backends/xnnpack/test/ops/minimum.py +++ b/backends/xnnpack/test/ops/minimum.py @@ -23,9 +23,7 @@ def _test_minimum(self, inputs): Tester(self.Minimum(), inputs) .export() .check_count({"torch.ops.aten.minimum.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_minimum_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_minimum_default"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/multiply.py b/backends/xnnpack/test/ops/multiply.py index d151f58bd6..db50bc5dd4 100644 --- a/backends/xnnpack/test/ops/multiply.py +++ b/backends/xnnpack/test/ops/multiply.py @@ -36,9 +36,7 @@ def _test_mul(self, inputs): Tester(self.Mul(), inputs) .export() .check_count({"torch.ops.aten.mul.Tensor": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_mul_Tensor"]) .to_executorch() @@ -65,9 +63,7 @@ def test_qs8_mul(self): .export() .check_count({"torch.ops.aten.mul.Tensor": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -88,9 +84,7 @@ def test_qs8_mul2(self): .export() .check_count({"torch.ops.aten.mul.Tensor": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -111,9 +105,7 @@ def test_qs8_mul_functional(self): .export() .check_count({"torch.ops.aten.mul.Tensor": 3}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_mul_Tensor": 3}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -139,9 +131,7 @@ def test_qs8_mul_relu(self): } ) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/negate.py b/backends/xnnpack/test/ops/negate.py index c4a47bb93c..4d158612e9 100644 --- a/backends/xnnpack/test/ops/negate.py +++ b/backends/xnnpack/test/ops/negate.py @@ -24,9 +24,7 @@ def _test_negate(self, inputs): Tester(self.Negate(), 
inputs) .export() .check_count({"torch.ops.aten.neg.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_neg_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_neg_default"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/permute.py b/backends/xnnpack/test/ops/permute.py index 2c99537675..b348fc8af6 100644 --- a/backends/xnnpack/test/ops/permute.py +++ b/backends/xnnpack/test/ops/permute.py @@ -36,11 +36,7 @@ def _test_permute(self, inputs): Tester(self.Permute([0, 2, 3, 1]), inputs) .export() .check_count({"torch.ops.aten.permute.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_permute_copy_default"]) .to_executorch() @@ -62,11 +58,7 @@ def test_fp32_permute_copy(self): Tester(self.PermuteCopy([0, 2, 3, 1]), inputs) .export() .check_count({"torch.ops.aten.permute_copy.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_permute_copy_default"]) .to_executorch() @@ -86,11 +78,7 @@ def test_qs8_permute(self): torch.ops.quantized_decomposed.quantize_per_tensor.default: 3, } ) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -115,11 +103,7 @@ def test_qs8_permute_copy(self): torch.ops.quantized_decomposed.quantize_per_tensor.default: 3, } ) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/pow.py b/backends/xnnpack/test/ops/pow.py index cbe637a6e7..ac902ae44b 100644 --- a/backends/xnnpack/test/ops/pow.py +++ b/backends/xnnpack/test/ops/pow.py @@ -25,11 +25,7 @@ def _test_pow2(self, inputs): Tester(self.Pow(2), inputs) .export() .check_count({"torch.ops.aten.pow.Tensor_Scalar": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar"]) .to_executorch() @@ -58,10 +54,6 @@ def test_fp32_pow_unsupported(self): Tester(self.Pow(3), inputs) .export() .check_count({"torch.ops.aten.pow.Tensor_Scalar": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_not(["torch.ops.higher_order.executorch_call_delegate"]) ) diff --git a/backends/xnnpack/test/ops/prelu.py b/backends/xnnpack/test/ops/prelu.py index 985ddecf36..f73648dfa2 100644 --- a/backends/xnnpack/test/ops/prelu.py +++ b/backends/xnnpack/test/ops/prelu.py @@ -24,12 +24,8 @@ def _test_prelu(self, module, inputs): ( Tester(module, inputs) .export() - 
.check_count({"torch.ops.aten._prelu_kernel.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten__prelu_kernel_default": 1} - ) - .partition() + .check_count({"torch.ops.aten.prelu.default": 1}) + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( ["executorch_exir_dialects_edge__ops_aten__prelu_kernel_default"] @@ -39,13 +35,12 @@ def _test_prelu(self, module, inputs): .run_method_and_compare_outputs() ) - @unittest.skip("T158653285 - Missing recomposition for PReLU") - def test_fp16_prelu(self): + @unittest.skip("XNNPACK Expects FP16 inputs but FP32 weights") + def _test_fp16_prelu(self): module = self.PReLU().to(torch.float16) inputs = (torch.randn(1, 5, 3, 2).to(torch.float16),) self._test_prelu(module, inputs) - @unittest.skip("T158653285 - Missing recomposition for PReLU") def test_fp32_prelu(self): module = self.PReLU() inputs = (torch.randn(1, 5, 3, 2),) diff --git a/backends/xnnpack/test/ops/quantize_per_tensor.py b/backends/xnnpack/test/ops/quantize_per_tensor.py index f912428a8a..c211798753 100644 --- a/backends/xnnpack/test/ops/quantize_per_tensor.py +++ b/backends/xnnpack/test/ops/quantize_per_tensor.py @@ -24,13 +24,7 @@ def forward(self, x): ( Tester(Quant(), inputs) .export() - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 1 - } - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -60,13 +54,7 @@ def forward(self, x): ( Tester(Dequant(), inputs) .export() - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 1 - } - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/relu.py b/backends/xnnpack/test/ops/relu.py index 3ab1c72b57..8672b1d3e4 100644 --- a/backends/xnnpack/test/ops/relu.py +++ b/backends/xnnpack/test/ops/relu.py @@ -26,9 +26,7 @@ def test_fp32_relu(self): Tester(self.Relu(), inputs) .export() .check_count({"torch.ops.aten.relu.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_relu_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/sdpa.py b/backends/xnnpack/test/ops/sdpa.py index d68bcab208..de5c7174ab 100644 --- a/backends/xnnpack/test/ops/sdpa.py +++ b/backends/xnnpack/test/ops/sdpa.py @@ -8,7 +8,10 @@ from typing import Optional import torch +from executorch.backends.xnnpack.partition.config.generic_node_configs import SDPAConfig +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.test.tester import Tester +from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower class TestSDPA(unittest.TestCase): @@ -61,9 +64,9 @@ def _test(self, module, inputs, atol=1e-03, rtol=1e-03): ( Tester(module, inputs) .export() - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_bmm_default": 2}) - .partition() + .to_edge_transform_and_lower( + ToEdgeTransformAndLower([XnnpackPartitioner(configs=[SDPAConfig])]) + ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( 
["executorch_exir_dialects_edge__ops_aten_bmm_default"], diff --git a/backends/xnnpack/test/ops/sigmoid.py b/backends/xnnpack/test/ops/sigmoid.py index 3dde395922..a9acd4df6d 100644 --- a/backends/xnnpack/test/ops/sigmoid.py +++ b/backends/xnnpack/test/ops/sigmoid.py @@ -25,9 +25,7 @@ def _test_sigmoid(self, inputs): Tester(self.Sigmoid(), inputs) .export() .check_count({"torch.ops.aten.sigmoid.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_sigmoid_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_sigmoid_default"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/slice_copy.py b/backends/xnnpack/test/ops/slice_copy.py index 50c8610b9c..8ff3736857 100644 --- a/backends/xnnpack/test/ops/slice_copy.py +++ b/backends/xnnpack/test/ops/slice_copy.py @@ -7,8 +7,7 @@ import unittest import torch -from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner -from executorch.backends.xnnpack.test.tester import Partition, Tester +from executorch.backends.xnnpack.test.tester import Tester class TestSliceCopy(unittest.TestCase): @@ -82,11 +81,7 @@ def forward(self, x): Tester(module, inputs) .export() .check_count({"torch.ops.aten.slice.Tensor": 3}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_not(["torch.ops.higher_order.executorch_call_delegate"]) ) @@ -105,11 +100,7 @@ def forward(self, x): Tester(module, inputs) .export() .check_count({"torch.ops.aten.slice.Tensor": 3}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 3} - ) - .partition() + .to_edge_transform_and_lower() .check_not(["torch.ops.higher_order.executorch_call_delegate"]) ) @@ -124,12 +115,13 @@ def forward(self, x): inputs = (torch.randn(5, 5, 5),) ( - Tester(SliceCopy(), inputs) - .export() - .to_edge() - .partition( - Partition(partitioner=XnnpackPartitioner(has_dynamic_shapes=True)) + Tester( + SliceCopy(), + inputs, + dynamic_shapes=({2: torch.export.Dim("dim_2", min=4, max=100)},), ) + .export() + .to_edge_transform_and_lower() .check_not(["torch.ops.higher_order.executorch_call_delegate"]) ) @@ -154,11 +146,7 @@ def forward(self, x): "quantized_decomposed::quantize_per_tensor": 3, } ) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 3} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/softmax.py b/backends/xnnpack/test/ops/softmax.py index 697b6f9294..cc544a28a2 100644 --- a/backends/xnnpack/test/ops/softmax.py +++ b/backends/xnnpack/test/ops/softmax.py @@ -29,11 +29,7 @@ def _test_softmax(self, inputs): Tester(self.Softmax(dim), inputs) .export() .check_count({"torch.ops.aten.softmax": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten__softmax_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten__softmax_default"]) .to_executorch() @@ -63,11 +59,7 @@ def test_fp32_softmax_unsupported(self): Tester(self.Softmax(dim), inputs) .export() .check_count({"torch.ops.aten.softmax": 
1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten__softmax_default": 1} - ) - .partition() + .to_edge_transform_and_lower() # Should not be delegated .check(["executorch_exir_dialects_edge__ops_aten__softmax_default"]) ) diff --git a/backends/xnnpack/test/ops/sqrt.py b/backends/xnnpack/test/ops/sqrt.py index e2a5f4ac2f..eaeb3b9f70 100644 --- a/backends/xnnpack/test/ops/sqrt.py +++ b/backends/xnnpack/test/ops/sqrt.py @@ -25,9 +25,7 @@ def _test_sqrt(self, inputs): Tester(self.Sqrt(), inputs) .export() .check_count({"torch.ops.aten.sqrt.default": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_sqrt_default": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_sqrt_default"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/square.py b/backends/xnnpack/test/ops/square.py index 5a7ffbc2cd..32a1963934 100644 --- a/backends/xnnpack/test/ops/square.py +++ b/backends/xnnpack/test/ops/square.py @@ -28,11 +28,7 @@ def _test_square(self, inputs): Tester(self.Square(), inputs) .export() .check_count({"torch.ops.aten.square.default": 1}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar"]) .to_executorch() diff --git a/backends/xnnpack/test/ops/static_constant_pad.py b/backends/xnnpack/test/ops/static_constant_pad.py index 2917d681ab..a0a74e3840 100644 --- a/backends/xnnpack/test/ops/static_constant_pad.py +++ b/backends/xnnpack/test/ops/static_constant_pad.py @@ -88,11 +88,7 @@ def _test_static_constant_pad_functional(self, inputs): Tester(self.StaticConstantPadFunctional(), inputs) .export() .check_count({"torch.ops.aten.pad.default": 8}) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_constant_pad_nd_default": 8} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( ["executorch_exir_dialects_edge__ops_aten_constant_pad_nd_default"] @@ -139,11 +135,7 @@ def forward(self, x): .export() .check_count({"torch.ops.aten.pad.default": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_constant_pad_nd_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -164,11 +156,7 @@ def test_qs8_static_constant_pad_2d(self): .export() .check_count({"torch.ops.aten.pad.default": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count( - {"executorch_exir_dialects_edge__ops_aten_constant_pad_nd_default": 1} - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/ops/sub.py b/backends/xnnpack/test/ops/sub.py index d4d64168bb..fb3d3d3f94 100644 --- a/backends/xnnpack/test/ops/sub.py +++ b/backends/xnnpack/test/ops/sub.py @@ -32,9 +32,7 @@ def _test_sub(self, inputs): Tester(self.Sub(), inputs) .export() .check_count({"torch.ops.aten.sub.Tensor": 1}) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_sub_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() 
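The common thread in the op-level test changes above (and in sub.py just below) is that the two-step .to_edge() plus .partition() chain collapses into a single .to_edge_transform_and_lower() stage, which defaults to an XnnpackPartitioner. A minimal hypothetical test in the same style is sketched here; the Add module and the specific count assertions are illustrative, not part of the patch.

import torch
from executorch.backends.xnnpack.test.tester import Tester

class Add(torch.nn.Module):
    def forward(self, x, y):
        return x + y

(
    Tester(Add(), (torch.randn(2, 3), torch.randn(2, 3)))
    .export()
    .check_count({"torch.ops.aten.add.Tensor": 1})
    .to_edge_transform_and_lower()  # transform + lower to the XNNPACK delegate in one stage
    .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
    .check_not(["executorch_exir_dialects_edge__ops_aten_add_Tensor"])
    .to_executorch()
    .serialize()
    .run_method_and_compare_outputs()
)

When a test needs finer control, an explicit stage can be passed instead, as the SDPA and LSTM tests above do with ToEdgeTransformAndLower([XnnpackPartitioner(configs=[SDPAConfig])]) and XnnpackPartitioner(force_fp32_dynamic_linear=True).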
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_sub_Tensor"]) .to_executorch() @@ -62,9 +60,7 @@ def _test_qs8_sub(self): .export() .check_count({"torch.ops.aten.sub.Tensor": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_sub_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -86,9 +82,7 @@ def _test_qs8_sub2(self): .export() .check_count({"torch.ops.aten.sub.Tensor": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_sub_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -110,9 +104,7 @@ def _test_qs8_sub3(self): .export() .check_count({"torch.ops.aten.sub.Tensor": 1}) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count({"executorch_exir_dialects_edge__ops_aten_sub_Tensor": 1}) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ @@ -144,14 +136,7 @@ def forward(self, x, y): } ) .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .check_count( - { - "executorch_exir_dialects_edge__ops_aten_sub_Tensor": 1, - "executorch_exir_dialects_edge__ops_aten_relu_default": 1, - } - ) - .partition() + .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not( [ diff --git a/backends/xnnpack/test/test_xnnpack_utils.py b/backends/xnnpack/test/test_xnnpack_utils.py index c6b1513d31..ea9217e04a 100644 --- a/backends/xnnpack/test/test_xnnpack_utils.py +++ b/backends/xnnpack/test/test_xnnpack_utils.py @@ -25,6 +25,12 @@ # import the xnnpack backend implementation from executorch.backends.xnnpack.xnnpack_preprocess import XnnpackBackend +from executorch.devtools import BundledProgram + +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import ExecutorchProgram, ExirExportedProgram from executorch.exir.backend.backend_api import to_backend, validation_disabled @@ -34,12 +40,6 @@ _load_for_executorch_from_buffer, ) from executorch.extension.pytree import tree_flatten -from executorch.sdk import BundledProgram - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from torch.ao.quantization import ( # @manual default_per_channel_symmetric_qnnpack_qconfig, @@ -72,6 +72,7 @@ get_symmetric_quantization_config, XNNPACKQuantizer, ) +from torch.export import export_for_training from torch.testing import FileCheck @@ -315,10 +316,11 @@ def quantize_and_test_model_with_quantizer( ): module.eval() # program capture - m = torch._export.capture_pre_autograd_graph( + + m = export_for_training( module, example_inputs, - ) + ).module() quantizer = XNNPACKQuantizer() quantization_config = get_symmetric_quantization_config() diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py index a93c20a6b1..7586c4f231 100644 --- a/backends/xnnpack/test/tester/tester.py +++ b/backends/xnnpack/test/tester/tester.py @@ -11,10 +11,9 @@ import sys from abc 
import ABC, abstractmethod from collections import Counter, OrderedDict -from typing import Any, Callable, Dict, List, Optional, Tuple, Type +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import torch -import torch.export._trace as export_trace from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.passes import XNNPACKPassManager from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config @@ -24,11 +23,14 @@ ExecutorchBackendConfig, ExecutorchProgramManager, to_edge, + to_edge_transform_and_lower, ) from executorch.exir.backend.backend_api import validation_disabled from executorch.exir.backend.partitioner import Partitioner from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass + from executorch.exir.print_program import pretty_print, print_program +from torch.export import export_for_training logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -40,6 +42,7 @@ logger.warning(f"{e=}") pass +from executorch.exir.program._program import _transform from torch._export.pass_base import PassType from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.ao.quantization.quantizer.quantizer import Quantizer @@ -154,10 +157,10 @@ def __init__( def run( self, artifact: torch.nn.Module, inputs: Optional[Tuple[torch.Tensor]] ) -> None: - captured_graph = export_trace._export( - artifact, inputs, pre_dispatch=True - ).module() + assert inputs is not None + captured_graph = export_for_training(artifact, inputs).module() + assert isinstance(captured_graph, torch.fx.GraphModule) prepared = prepare_pt2e(captured_graph, self.quantizer) if self.calibrate: @@ -234,23 +237,72 @@ def __init__( ): self.pass_list = pass_list self.pass_functions = pass_functions - self.edge_dialect_program = None + self.edge_or_aten_program = None - def run(self, artifact: EdgeProgramManager, inputs=None) -> None: - self.edge_dialect_program = artifact - if self.pass_list: - pass_manager = XNNPACKPassManager( - artifact.exported_program(), self.pass_list - ) - self.edge_dialect_program._edge_programs["forward"] = ( - pass_manager.transform() - ) - if self.pass_functions: - assert isinstance(self.pass_functions, list) - for pass_function in self.pass_functions: - self.edge_dialect_program._edge_programs["forward"] = pass_function( - self.edge_dialect_program.exported_program() + def run( + self, artifact: Union[EdgeProgramManager, ExportedProgram], inputs=None + ) -> None: + if isinstance(artifact, EdgeProgramManager): + self.edge_or_aten_program = artifact + if self.pass_list: + pass_manager = XNNPACKPassManager( + artifact.exported_program(), self.pass_list + ) + self.edge_or_aten_program._edge_programs["forward"] = ( + pass_manager.transform() ) + if self.pass_functions: + assert isinstance(self.pass_functions, list) + for pass_function in self.pass_functions: + self.edge_or_aten_program._edge_programs["forward"] = pass_function( + self.edge_or_aten_program.exported_program() + ) + else: + transformed_ep = artifact + if self.pass_list: + assert isinstance(self.pass_list, list) + for pass_ in self.pass_list: + transformed_ep = _transform(transformed_ep, pass_()) + + if self.pass_functions: + assert isinstance(self.pass_functions, list) + for pass_function in self.pass_functions: + transformed_ep = pass_function(transformed_ep) + + self.edge_or_aten_program = transformed_ep + + @property + def artifact(self) -> 
Union[EdgeProgramManager, ExportedProgram]: + return self.edge_or_aten_program + + @property + def graph_module(self) -> str: + if isinstance(self.edge_or_aten_program, EdgeProgramManager): + return self.edge_or_aten_program.exported_program().graph_module + else: + return self.edge_or_aten_program.graph_module + + +@register_stage +class ToEdgeTransformAndLower(Stage): + def __init__( + self, + partitioners: Optional[List[Partitioner]] = None, + edge_compile_config: Optional[EdgeCompileConfig] = None, + ): + self.partitioners = partitioners or [XnnpackPartitioner()] + self.edge_compile_conf = ( + edge_compile_config or get_xnnpack_edge_compile_config() + ) + self.edge_dialect_program = None + + def run(self, artifact: ExportedProgram, inputs=None) -> None: + artifact_to_run = copy.deepcopy(artifact) + self.edge_dialect_program = to_edge_transform_and_lower( + artifact_to_run, + compile_config=self.edge_compile_conf, + partitioner=self.partitioners, + ) @property def artifact(self) -> EdgeProgramManager: @@ -372,13 +424,22 @@ def __init__( self.pipeline = { self.stage_name(Quantize): [self.stage_name(Export)], self.stage_name(Export): [ + self.stage_name(RunPasses), self.stage_name(ToEdge), + self.stage_name(ToEdgeTransformAndLower), + ], + self.stage_name(ToEdgeTransformAndLower): [ + self.stage_name(RunPasses), + self.stage_name(ToExecutorch), ], self.stage_name(ToEdge): [ self.stage_name(Partition), self.stage_name(RunPasses), ], - self.stage_name(RunPasses): [self.stage_name(Partition)], + self.stage_name(RunPasses): [ + self.stage_name(Partition), + self.stage_name(ToEdgeTransformAndLower), + ], # TODO Make this Stage optional self.stage_name(Partition): [self.stage_name(ToExecutorch)], self.stage_name(ToExecutorch): [self.stage_name(Serialize)], @@ -500,7 +561,13 @@ def to_edge(self, to_edge_stage: Optional[ToEdge] = None): if not to_edge_stage: to_edge_stage = ToEdge() to_edge_stage.edge_compile_conf._skip_dim_order = True - return self._run_stage(to_edge_stage) + res = self._run_stage(to_edge_stage) + return res + + def to_edge_transform_and_lower( + self, to_edge_and_transform_stage: Optional[ToEdgeTransformAndLower] = None + ): + return self._run_stage(to_edge_and_transform_stage or ToEdgeTransformAndLower()) def run_passes(self, run_passes_stage: Optional[RunPasses] = None): return self._run_stage(run_passes_stage or RunPasses()) diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index 1d139a3b4b..87ee0b46b8 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit 1d139a3b4b7155889c88c31f370a82c48e7ca89c +Subproject commit 87ee0b46b834f67bad9025d4a82ed5654f3403d3 diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo index d6860c477c..16bfc1622c 160000 --- a/backends/xnnpack/third-party/cpuinfo +++ b/backends/xnnpack/third-party/cpuinfo @@ -1 +1 @@ -Subproject commit d6860c477c99f1fce9e28eb206891af3c0e1a1d7 +Subproject commit 16bfc1622c6902d6f91d316ec54894910c620325 diff --git a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py index bda7952717..e9b23e4a78 100644 --- a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py +++ b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import print_function +from pathlib import Path import collections import os import sys @@ -36,8 +37,8 @@ 
"PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)", "PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)", @@ -46,7 +47,7 @@ # add non-prod microkernel sources here: } -SRC_NAMES = set([ +SRC_NAMES = { "OPERATOR_SRCS", "SUBGRAPH_SRCS", "LOGGING_SRCS", @@ -81,30 +82,42 @@ "PROD_AVX512F_MICROKERNEL_SRCS", "PROD_AVX512SKX_MICROKERNEL_SRCS", "PROD_AVX512VBMI_MICROKERNEL_SRCS", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", "PROD_AVX512VNNI_MICROKERNEL_SRCS", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", "PROD_RVV_MICROKERNEL_SRCS", "PROD_AVXVNNI_MICROKERNEL_SRCS", "AARCH32_ASM_MICROKERNEL_SRCS", "AARCH64_ASM_MICROKERNEL_SRCS", # add non-prod microkernel sources here: -]) +} def handle_singleline_parse(line): start_index = line.find("(") end_index = line.find(")") line = line[start_index+1:end_index] key_val = line.split(" ") - return key_val[0], list(map(lambda x: x[4:], key_val[1:])) + return key_val[0], [x[4:] for x in key_val[1:]] def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"): + print(f"Updating sources from {cmakefile}") sources = collections.defaultdict(list) with open(os.path.join(xnnpack_path, cmakefile)) as cmake: lines = cmake.readlines() i = 0 while i < len(lines): line = lines[i] + + if lines[i].startswith("INCLUDE"): + file, _ = handle_singleline_parse(line) + if file.startswith("cmake/gen/"): + path = Path(xnnpack_path) / "XNNPACK" / file + local_sources = update_sources(xnnpack_path, path.absolute().as_posix()) + for k,v in local_sources.items(): + if k in sources: + sources[k] = sources[k] + local_sources[k] + else: + sources[k] = local_sources[k] if lines[i].startswith("SET") and "src/" in lines[i]: name, val = handle_singleline_parse(line) @@ -132,7 +145,7 @@ def gen_wrappers(xnnpack_path): xnnpack_sources = collections.defaultdict(list) sources = update_sources(xnnpack_path) - microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/microkernels.cmake") + microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/gen/microkernels.cmake") for key in microkernels_sources: sources[key] = microkernels_sources[key] @@ -186,6 +199,8 @@ def gen_wrappers(xnnpack_path): def main(argv): + print("Generating wrappers...") + if argv is None or len(argv) == 0: gen_wrappers(".") else: diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index a1add44664..7f0a8ca6f2 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -1,7 +1,6 @@ load("//third-party:glob_defs.bzl", "subdir_glob") load( ":xnnpack_src_defs.bzl", - "JIT_SRCS", "LOGGING_SRCS", "OPERATOR_SRCS", "SUBGRAPH_SRCS", @@ -69,27 +68,6 @@ def define_xnnpack(): ], ) - # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. 
- native.cxx_library( - name = "jit_memory", - srcs = JIT_SRCS, - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ]), - header_namespace = "", - compiler_flags = [ - "-std=c++17", - ], - preferred_linkage = "static", - preprocessor_flags = [ - "-DXNN_LOG_LEVEL=0", - ], - exported_deps = [ - ":clog", - ":interface", - ], - ) - # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. native.cxx_library( name = "operators", @@ -139,7 +117,6 @@ def define_xnnpack(): preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", - "-DXNN_ENABLE_JIT=0", "-DXNN_ENABLE_SPARSE=0", "-DXNN_ENABLE_GEMM_M_SPECIALIZATION=0", "-DXNN_ENABLE_MEMOPT", @@ -1223,7 +1200,6 @@ def define_xnnpack(): ] ARM_XNNPACK_DEPS = [ - ":jit_memory", ":ukernels_armsimd32", ":ukernels_fp16arith", ":ukernels_asm", @@ -1246,11 +1222,10 @@ def define_xnnpack(): "XNNPACK/src/configs/hardware-config.c", "XNNPACK/src/microparams-init.c", "XNNPACK/src/operator-run.c", - "XNNPACK/src/operators/post-operation.c", "XNNPACK/src/microkernel-utils.c", ], headers = subdir_glob([ - ("XNNPACK/src", "xnnpack/*.h"), + ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h"), ]), exported_headers = { @@ -1271,7 +1246,6 @@ def define_xnnpack(): "-DXNN_NO_X8_OPERATORS", "-DXNN_ENABLE_MEMOPT", "-DXNN_ENABLE_SPARSE=0", - "-DXNN_ENABLE_JIT=0", "-DXNN_ENABLE_ASSEMBLY", "-DXNN_ENABLE_GEMM_M_SPECIALIZATION", "-DXNN_ENABLE_ARM_DOTPROD", diff --git a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl index 0a0beba7ef..d8ebe7c72b 100644 --- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl @@ -200,7 +200,6 @@ PROD_F16C_MICROKERNEL_SRCS = [ ] PROD_XOP_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/xop.c", ] PROD_AVX512F_MICROKERNEL_SRCS = [ @@ -493,30 +492,18 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", 
"XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", ] XNNPACK_SRCS = [ diff --git a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl index 2dbb41ff01..a9d4af95cc 100644 --- a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl @@ -92,7 +92,6 @@ PROD_F16C_MICROKERNEL_SRCS = [ ] PROD_XOP_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/xop.c", ] PROD_FMA3_MICROKERNEL_SRCS = [ @@ -447,28 +446,16 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", 
"xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", ] diff --git a/backends/xnnpack/threadpool/test/targets.bzl b/backends/xnnpack/threadpool/test/targets.bzl deleted file mode 100644 index 7bbcd8c4c0..0000000000 --- a/backends/xnnpack/threadpool/test/targets.bzl +++ /dev/null @@ -1,20 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - _THREADPOOL_TESTS = [ - "threadpool_test.cpp", - ] + (["fb/threadpool_use_n_threads_test.cpp"] if not runtime.is_oss else []) - - runtime.cxx_test( - name = "threadpool_test", - srcs = _THREADPOOL_TESTS, - deps = [ - "//executorch/backends/xnnpack/threadpool:threadpool", - ], - ) diff --git a/backends/xnnpack/utils/TARGETS b/backends/xnnpack/utils/TARGETS index b542006e3b..55615e1106 100644 --- a/backends/xnnpack/utils/TARGETS +++ b/backends/xnnpack/utils/TARGETS @@ -9,6 +9,7 @@ python_library( "//caffe2:torch", "//executorch/exir:lib", "//executorch/exir:pass_manager", + "//executorch/exir/backend/canonical_partitioners:config_partitioner_lib", "//executorch/exir/dialects:lib", "//pytorch/ao:torchao", # @manual ], diff --git a/backends/xnnpack/utils/configs.py b/backends/xnnpack/utils/configs.py index 3fe290606c..9dda84c5e5 100644 --- a/backends/xnnpack/utils/configs.py +++ b/backends/xnnpack/utils/configs.py @@ -12,8 +12,12 @@ ### XNNPACK Configs ### -def get_xnnpack_edge_compile_config() -> exir.EdgeCompileConfig: - return exir.EdgeCompileConfig(_check_ir_validity=False, _skip_dim_order=True) +def get_xnnpack_edge_compile_config( + skip_dim_order: bool = True, +) -> exir.EdgeCompileConfig: + return exir.EdgeCompileConfig( + _check_ir_validity=False, _skip_dim_order=skip_dim_order + ) def get_transform_passes(additional_passes=None) -> List[PassType]: diff --git a/backends/xnnpack/utils/quant_utils.py b/backends/xnnpack/utils/quant_utils.py index 0b6e7e496a..7c035757a6 100644 --- a/backends/xnnpack/utils/quant_utils.py +++ b/backends/xnnpack/utils/quant_utils.py @@ -4,29 +4,185 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import operator +from itertools import accumulate +from typing import cast + import torch -from executorch.exir.dialects._ops import ops as exir_ops - -DQ_TARGETS = { - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default, - exir_ops.edge.quantized_decomposed.dequantize_per_token.default, +from executorch.exir.backend.canonical_partitioners.config_partitioner import ( + format_target_name, +) + +_Q_OPS = { + "quantize_per_tensor.tensor", + "quantize_per_tensor.default", + "quantize_per_channel.default", + "quantize_per_channel_group.default", + "quantize_per_token.default", + "quantize_affine.default", +} + +_DQ_OPS = { + "dequantize_per_tensor.tensor", + "dequantize_per_tensor.default", + "dequantize_per_channel.default", + "dequantize_per_channel_group.default", + "dequantize_per_token.default", + "dequantize_affine.default", } -Q_TARGETS = { - exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, - exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.quantize_per_channel.default, - exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default, - exir_ops.edge.quantized_decomposed.quantize_per_token.default, + +_QPARAM_OPS = { + "choose_qparams.tensor", + "choose_qparams_per_token_asymmetric.default", + "choose_qparams_affine.default", } +_DYNAMIC_OPS = { + "quantize_per_tensor.tensor", + "quantize_per_token.default", + "dequantize_per_tensor.tensor", + "dequantize_per_token.default", +} + + +def is_dynamic_qdq(node: torch.fx.Node) -> bool: + if node.op != "call_function": + return False + node_name = format_target_name(node.target.__name__) # pyre-ignore + is_dynamic_affine = is_per_token(node) and not is_per_channel_group(node) + + return node_name in _DYNAMIC_OPS or is_dynamic_affine + + +def is_qparam(node: torch.fx.Node) -> bool: + if node.op != "call_function": + return False + node_name = format_target_name(node.target.__name__) # pyre-ignore + + return node_name in _QPARAM_OPS + + +def is_quant(node: torch.fx.Node) -> bool: + if node.op != "call_function": + return False + node_name = format_target_name(node.target.__name__) # pyre-ignore + + return node_name in _Q_OPS + + +def is_dequant(node: torch.fx.Node) -> bool: + if node.op != "call_function": + return False + node_name = format_target_name(node.target.__name__) # pyre-ignore + + return node_name in _DQ_OPS + + +def is_per_channel(node: torch.fx.Node) -> bool: + if not (is_quant(node) or is_dequant(node)): + return False + + is_affine_per_channel_group = is_per_channel_group(node) + is_per_channel = "per_channel" in node.target.__name__ # pyre-ignore + + return is_per_channel or is_affine_per_channel_group + + +def is_affine_qdq(node: torch.fx.Node) -> bool: + if not (is_quant(node) or is_dequant(node)): + return False + + return "quantize_affine" in node.target.__name__ # pyre-ignore + + +def _get_block_size_input_scale(node: torch.fx.Node): + assert is_affine_qdq(node) + block_size = node.args[1] + input_val = node.all_input_nodes[0].meta["val"] + scale_val = node.all_input_nodes[1].meta["val"] + return block_size, input_val, scale_val + + +def is_per_token(node: torch.fx.Node): + if not (is_quant(node) or is_dequant(node)): + return False + + if "per_token" in node.target.__name__: # pyre-ignore + return True + elif is_affine_qdq(node): 
+ block_size, input_val, scale_val = _get_block_size_input_scale(node) + flag = True + scale_numel_expected = 1 + for i in range(len(block_size) - 1): + flag &= block_size[i] == 1 + scale_numel_expected *= input_val.shape[i] + + flag &= block_size[-1] == input_val.shape[-1] + flag &= scale_val.numel() == scale_numel_expected + return flag + + return False + + +def is_per_channel_group(node: torch.fx.Node): + if not (is_quant(node) or is_dequant(node)): + return False + + if "per_channel_group" in node.target.__name__: # pyre-ignore + return True + elif is_affine_qdq(node): + block_size, input_val, scale_val = _get_block_size_input_scale(node) + flag = True + flag &= len(block_size) == 2 + flag &= block_size[0] == 1 + group_size = block_size[1] + scale_numel = list(accumulate(scale_val.shape, operator.mul))[-1] + input_numel = list(accumulate(input_val.shape, operator.mul))[-1] + flag &= input_numel == group_size * scale_numel + return flag + + return False + + +def extract_qdq_affine_op_args_for_decomposed_ops(node: torch.fx.Node): + if not is_affine_qdq(node): + return None, None + # make sure input_dtype and zero_point_domain have expected values + input_node = node.args[0] + scale_node = node.args[2] + zero_point_node = node.args[3] + args = [input_node, scale_node, zero_point_node] + assert ( + len(node.args) > 4 + ), f"expecting at least 6 args, got node: {node.format_node()}" + + if node.args[4] != torch.int8: + return None, None + target_dtype = cast(torch.dtype, node.args[4]) + + if len(node.args) > 6: + # quant_min + args.append(node.args[5]) + # quant_max + args.append(node.args[6]) + else: + dtype_info = torch.iinfo(target_dtype) + quant_min = dtype_info.min + quant_max = dtype_info.max + args.append(quant_min) + args.append(quant_max) + + # add target_dtype_node after quant_min/quant_max + args.append(target_dtype) + # zero_point_domain + if len(node.args) > 7 and node.args[7] != "INT": + return None, None -def is_quant(tensor: torch.fx.Node) -> bool: - return tensor.target in Q_TARGETS + if is_per_channel_group(node): + block_sizes = cast(list[int], node.args[1]) + args.append(block_sizes[-1]) + args.append(node.args[-1]) -def is_dequant(tensor: torch.fx.Node) -> bool: - return tensor.target in DQ_TARGETS + return args diff --git a/backends/xnnpack/utils/utils.py b/backends/xnnpack/utils/utils.py index 5c76922472..b802d73c16 100644 --- a/backends/xnnpack/utils/utils.py +++ b/backends/xnnpack/utils/utils.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
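The affine-quantization helpers above (is_per_token and is_per_channel_group) classify quantize_affine/dequantize_affine nodes purely from the block_size argument and the input/scale shapes. A standalone sketch of that shape arithmetic, using made-up shapes purely for illustration:

import math

def looks_per_token(input_shape, scale_shape, block_size):
    # Mirrors is_per_token() for affine q/dq: every leading block dim is 1,
    # the last block dim spans the whole last axis, and there is exactly one
    # scale per "token" (one per element of the leading dims).
    if list(block_size[:-1]) != [1] * (len(block_size) - 1):
        return False
    if block_size[-1] != input_shape[-1]:
        return False
    return math.prod(scale_shape) == math.prod(input_shape[:-1])

def looks_per_channel_group(input_shape, scale_shape, block_size):
    # Mirrors is_per_channel_group() for affine q/dq on 2-D weights: a block of
    # [1, group_size], with input_numel == group_size * scale_numel.
    if len(block_size) != 2 or block_size[0] != 1:
        return False
    group_size = block_size[1]
    return math.prod(input_shape) == group_size * math.prod(scale_shape)

# Hypothetical shapes, not taken from any real model.
assert looks_per_token((8, 16, 64), (8, 16, 1), (1, 1, 64))
assert looks_per_channel_group((32, 128), (32, 4), (1, 32))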
-from typing import cast, Optional, Tuple +from typing import Any, cast, Optional, Tuple import executorch.exir as exir import torch @@ -62,6 +62,20 @@ def check_or_raise(condition: bool, err: str) -> None: raise RuntimeError(err) +def is_node(node: Any) -> bool: + """ + returns true if node is a torch.fx.Node, otherwise false + """ + return isinstance(node, torch.fx.Node) + + +def is_getitem(node: torch.fx.Node) -> bool: + if node.op != "call_function": + return False + + return node.target.__name__ == "getitem" # pyre-ignore + + def get_input_node(node: torch.fx.Node, input_index: int) -> torch.fx.Node: return cast(torch.fx.Node, node.args[input_index]) diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py index 5d4c05a535..b7ee440c28 100644 --- a/backends/xnnpack/xnnpack_preprocess.py +++ b/backends/xnnpack/xnnpack_preprocess.py @@ -78,6 +78,22 @@ def generate_node_to_external_map( return node_to_external_map +def assert_default_dim_order(edge_graph_module: torch.fx.GraphModule) -> None: + for node in edge_graph_module.graph.nodes: + if node.op != "placeholder": + continue + + # We expect the default dim order for all tensor-like inputs i.e. inputs, buffers, and params + t = node.meta.get("val", None) + if t is not None and getattr(t, "dim_order", None) is not None: + default_dim_order = tuple(range(t.dim())) + if t.dim_order() != default_dim_order: + raise RuntimeError( + f"XNNPACK backend only supports contiguous memory format for inputs." + f"Expecting dim_order: {default_dim_order}, but got {node.meta['val'].dim_order()} for a placeholder node {node}." + ) + + @final class XnnpackBackend(BackendDetails): @staticmethod @@ -126,6 +142,9 @@ def preprocess( node_to_external_map = generate_node_to_external_map(ep, graph_module) + # Make sure all inputs are contiguous_format or NCHW or default dim order + assert_default_dim_order(graph_module) + # TODO retrace the graph module to lift the new params may have # been added to the graph in passes diff --git a/build/Codegen.cmake b/build/Codegen.cmake index 01f35718b8..381cd0958f 100644 --- a/build/Codegen.cmake +++ b/build/Codegen.cmake @@ -78,7 +78,8 @@ function(generate_bindings_for_kernels) # Executorch runtime. execute_process( COMMAND - "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib;print(get_python_lib())" + "${PYTHON_EXECUTABLE}" -c + "from distutils.sysconfig import get_python_lib;print(get_python_lib())" OUTPUT_VARIABLE site-packages-out ERROR_VARIABLE site-packages-out-error RESULT_VARIABLE site-packages-result @@ -150,9 +151,8 @@ function(gen_custom_ops_aot_lib) include(${EXECUTORCH_ROOT}/build/Utils.cmake) target_link_options_shared_lib(${GEN_LIB_NAME}) - if(EXECUTORCH_BUILD_PYBIND AND APPLE) - target_link_libraries(${GEN_LIB_NAME} PRIVATE executorch_no_prim_ops_shared) - target_link_options(${GEN_LIB_NAME} PRIVATE -undefined dynamic_lookup) + if(TARGET portable_lib) + target_link_libraries(${GEN_LIB_NAME} PRIVATE portable_lib) else() target_link_libraries(${GEN_LIB_NAME} PRIVATE executorch_no_prim_ops) endif() diff --git a/build/Test.cmake b/build/Test.cmake index b2b23cb03a..d6ef124793 100644 --- a/build/Test.cmake +++ b/build/Test.cmake @@ -5,8 +5,8 @@ # LICENSE file in the root directory of this source tree. # -# This file is intended to have helper functions for test-related -# CMakeLists.txt files. +# This file is intended to have helper functions for test-related CMakeLists.txt +# files. 
# # ### Editing this file ### # @@ -25,61 +25,66 @@ find_package(executorch CONFIG REQUIRED) enable_testing() find_package(GTest CONFIG REQUIRED) +target_link_options_shared_lib(cpuinfo) target_link_options_shared_lib(extension_data_loader) target_link_options_shared_lib(portable_kernels) target_link_options_shared_lib(portable_ops_lib) +target_link_options_shared_lib(pthreadpool) target_link_options_shared_lib(quantized_ops_lib) # Add code coverage flags to supported compilers if(EXECUTORCH_USE_CPP_CODE_COVERAGE) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path") - string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path") + string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path") + string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path") elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - string(APPEND CMAKE_C_FLAGS " -fprofile-instr-generate -fcoverage-mapping") - string(APPEND CMAKE_CXX_FLAGS " -fprofile-instr-generate -fcoverage-mapping") + string(APPEND CMAKE_C_FLAGS " -fprofile-instr-generate -fcoverage-mapping") + string(APPEND CMAKE_CXX_FLAGS + " -fprofile-instr-generate -fcoverage-mapping" + ) else() - message(ERROR "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported") + message(ERROR + "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported" + ) endif() endif() -# A helper function to generate a gtest cxx executable target -# @param target_name: name for the executable -# @param SOURCES : test sources to be compiled. Sometimes -# util sources are used as well -# @param EXTRA LIBS : additional libraries to be linked against -# the target. gtest, gmock, executorch are linked by default, but Sometimes -# user may need additional libraries like kernels. -# We use CMake package executorch in this helper, so user can easily add -# installed libraries. +# A helper function to generate a gtest cxx executable target @param +# target_name: name for the executable @param SOURCES : test +# sources to be compiled. Sometimes util sources are used as well @param EXTRA +# LIBS : additional libraries to be linked against the target. +# gtest, gmock, executorch are linked by default, but Sometimes user may need +# additional libraries like kernels. We use CMake package executorch in this +# helper, so user can easily add installed libraries. # -# Example: -# et_cxx_test(my_test SOURCES my_test.cpp EXTRA_LIBS portable_kernels) +# Example: et_cxx_test(my_test SOURCES my_test.cpp EXTRA_LIBS portable_kernels) # # This defines a gtest executable my_test, compiling my_test.cpp, and linking # against libportable_kernels.a. # function(et_cxx_test target_name) -set(multi_arg_names SOURCES EXTRA_LIBS) -cmake_parse_arguments(ET_CXX_TEST "" "" "${multi_arg_names}" ${ARGN}) + set(multi_arg_names SOURCES EXTRA_LIBS) + cmake_parse_arguments(ET_CXX_TEST "" "" "${multi_arg_names}" ${ARGN}) -# Let files say "include ". -target_include_directories(executorch INTERFACE ${EXECUTORCH_ROOT}/..) + # Let files say "include ". + target_include_directories(executorch INTERFACE ${EXECUTORCH_ROOT}/..) 
-set(ET_TEST_UTIL_SOURCES ${EXECUTORCH_ROOT}/runtime/core/exec_aten/testing_util/tensor_util.cpp) + set(ET_TEST_UTIL_SOURCES + ${EXECUTORCH_ROOT}/runtime/core/exec_aten/testing_util/tensor_util.cpp + ) -add_executable(${target_name} ${ET_CXX_TEST_SOURCES} ${ET_TEST_UTIL_SOURCES}) -# Includes gtest, gmock, executorch by default -target_link_libraries( - ${target_name} GTest::gtest GTest::gtest_main GTest::gmock executorch - ${ET_CXX_TEST_EXTRA_LIBS} -) + add_executable(${target_name} ${ET_CXX_TEST_SOURCES} ${ET_TEST_UTIL_SOURCES}) + # Includes gtest, gmock, executorch by default + target_link_libraries( + ${target_name} GTest::gtest GTest::gtest_main GTest::gmock executorch + ${ET_CXX_TEST_EXTRA_LIBS} + ) -# add_test adds a test target to be used by ctest. -# We use `ExecuTorchTest` as the ctest target name for the test executable -# Usage: cd cmake-out/path/to/test/; ctest -# Note: currently we directly invoke the test target, without using ctest -add_test(ExecuTorchTest ${target_name}) + # add_test adds a test target to be used by ctest. We use `ExecuTorchTest` as + # the ctest target name for the test executable Usage: cd + # cmake-out/path/to/test/; ctest Note: currently we directly invoke the test + # target, without using ctest + add_test(ExecuTorchTest ${target_name}) endfunction() diff --git a/build/Utils.cmake b/build/Utils.cmake index 56fc1e104b..3ea616d590 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -65,6 +65,12 @@ function(executorch_print_configuration_summary) message(STATUS " EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL : " "${EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL}" ) + message(STATUS " EXECUTORCH_BUILD_EXTENSION_TENSOR : " + "${EXECUTORCH_BUILD_EXTENSION_TENSOR}" + ) + message(STATUS " EXECUTORCH_BUILD_EXTENSION_TRAINING : " + "${EXECUTORCH_BUILD_EXTENSION_TRAINING}" + ) message( STATUS " EXECUTORCH_BUILD_FLATC : ${EXECUTORCH_BUILD_FLATC}" @@ -97,7 +103,7 @@ function(executorch_print_configuration_summary) "${EXECUTORCH_BUILD_KERNELS_QUANTIZED}" ) message( - STATUS " EXECUTORCH_BUILD_SDK : ${EXECUTORCH_BUILD_SDK}" + STATUS " EXECUTORCH_BUILD_DEVTOOLS : ${EXECUTORCH_BUILD_DEVTOOLS}" ) message( STATUS @@ -143,11 +149,21 @@ function(macos_kernel_link_options target_name) ) endfunction() +# Same as kernel_link_options but it's for MSVC linker +function(msvc_kernel_link_options target_name) + target_link_options( + ${target_name} INTERFACE + "SHELL:LINKER:/WHOLEARCHIVE:$" + ) +endfunction() + # Ensure that the load-time constructor functions run. By default, the linker # would remove them since there are no other references to them. function(target_link_options_shared_lib target_name) if(APPLE) macos_kernel_link_options(${target_name}) + elseif(MSVC) + msvc_kernel_link_options(${target_name}) else() kernel_link_options(${target_name}) endif() @@ -171,11 +187,20 @@ function(extract_sources sources_file) set(executorch_root ${CMAKE_CURRENT_SOURCE_DIR}) endif() + if(ANDROID_ABI) + if("${ANDROID_ABI}" STREQUAL "arm64-v8a") + set(target_platforms_arg "--target-platforms=shim//:android-arm64") + elseif("${ANDROID_ABI}" STREQUAL "x86_64") + set(target_platforms_arg "--target-platforms=shim//:android-x86_64") + else() + message(FATAL_ERROR "Unsupported ANDROID_ABI setting ${ANDROID_ABI}. 
Please add it here!") + endif() + endif() execute_process( COMMAND ${PYTHON_EXECUTABLE} ${executorch_root}/build/extract_sources.py --config=${executorch_root}/build/cmake_deps.toml --out=${sources_file} - --buck2=${BUCK2} + --buck2=${BUCK2} ${target_platforms_arg} OUTPUT_VARIABLE gen_srcs_output ERROR_VARIABLE gen_srcs_error RESULT_VARIABLE gen_srcs_exit_code diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 5bba039a31..42034c254f 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -17,25 +17,33 @@ build_jar() { build_android_native_library() { ANDROID_ABI="$1" - TOKENIZER="$2" ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" CMAKE_OUT="cmake-out-android-${ANDROID_ABI}" - if [[ $TOKENIZER == "tiktoken" ]]; then - EXECUTORCH_USE_TIKTOKEN=ON + QNN_SDK_ROOT="${QNN_SDK_ROOT:-}" + if [ -n "$QNN_SDK_ROOT" ]; then + EXECUTORCH_BUILD_QNN=ON else - EXECUTORCH_USE_TIKTOKEN=OFF + EXECUTORCH_BUILD_QNN=OFF fi + cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ -DANDROID_PLATFORM=android-23 \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_LOG_LEVEL=Info \ -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ + -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" @@ -46,27 +54,15 @@ build_android_native_library() { fi cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release - cmake examples/models/llama2 \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="$ANDROID_ABI" \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${CMAKE_OUT}"/examples/models/llama2 - - cmake --build "${CMAKE_OUT}"/examples/models/llama2 -j "${CMAKE_JOBS}" --config Release - cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_LOG_LEVEL=Info \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android @@ -75,6 +71,19 @@ build_android_native_library() { # Copy artifacts to ABI specific directory mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + + # Copy QNN related so library + if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then + cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp 
"${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + fi } build_aar() { @@ -87,23 +96,28 @@ build_aar() { # between Java and JNI find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file - zip -r executorch.aar libs jni/*/libexecutorch.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so AndroidManifest.xml + zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml + cp executorch.aar executorch-llama.aar popd } -build_android_llm_demo_app() { +build_android_demo_apps() { mkdir -p examples/demo-apps/android/LlamaDemo/app/libs cp ${BUILD_AAR_DIR}/executorch-llama.aar examples/demo-apps/android/LlamaDemo/app/libs pushd examples/demo-apps/android/LlamaDemo ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest popd + + mkdir -p extension/android/benchmark/app/libs + cp ${BUILD_AAR_DIR}/executorch.aar extension/android/benchmark/app/libs + pushd extension/android/benchmark + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest + popd } collect_artifacts_to_be_uploaded() { - TOKENIZER="$1" - ARTIFACTS_DIR_NAME="$2" - DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo_${TOKENIZER}" + ARTIFACTS_DIR_NAME="$1" + DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo" # The app directory is named using its build flavor as a suffix. 
mkdir -p "${DEMO_APP_DIR}" # Collect the app and its test suite @@ -118,20 +132,26 @@ collect_artifacts_to_be_uploaded() { # Collect JAR and AAR cp extension/android/build/libs/executorch.jar "${DEMO_APP_DIR}" find "${BUILD_AAR_DIR}/" -name 'executorch*.aar' -exec cp {} "${DEMO_APP_DIR}" \; + # Collect MiniBench APK + MINIBENCH_APP_DIR="${ARTIFACTS_DIR_NAME}/minibench" + mkdir -p "${MINIBENCH_APP_DIR}" + cp extension/android/benchmark/app/build/outputs/apk/debug/*.apk "${MINIBENCH_APP_DIR}" + cp extension/android/benchmark/app/build/outputs/apk/androidTest/debug/*.apk "${MINIBENCH_APP_DIR}" } BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR -ANDROID_ABIS=("arm64-v8a" "x86_64") +if [ -z "$ANDROID_ABIS" ]; then + ANDROID_ABIS=("arm64-v8a" "x86_64") +fi export ANDROID_ABIS -TOKENIZER="${1:-bpe}" -ARTIFACTS_DIR_NAME="$2" +ARTIFACTS_DIR_NAME="$1" build_jar for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do - build_android_native_library ${ANDROID_ABI} ${TOKENIZER} + build_android_native_library ${ANDROID_ABI} done build_aar -build_android_llm_demo_app -collect_artifacts_to_be_uploaded ${TOKENIZER} ${ARTIFACTS_DIR_NAME} +build_android_demo_apps +collect_artifacts_to_be_uploaded ${ARTIFACTS_DIR_NAME} diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index 73635c3f90..348111e2b4 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -33,6 +33,7 @@ libexecutorch_no_prim_ops.a,\ libextension_apple.a,\ libextension_data_loader.a,\ libextension_module.a,\ +libextension_tensor.a,\ :$HEADERS_PATH" FRAMEWORK_BACKEND_COREML="backend_coreml:\ @@ -76,7 +77,7 @@ usage() { echo echo "Options:" echo " --output=DIR Output directory. Default: 'cmake-out'" - echo " --Debug Use Debug build mode. Default: 'Release'" + echo " --Debug Use Debug build mode. Default: Uses Release build mode." echo " --toolchain=FILE Cmake toolchain file. Default: '\$SOURCE_ROOT_DIR/third-party/ios-cmake/ios.toolchain.cmake'" echo " --buck2=FILE Buck2 executable path. Default: Path of buck2 found in the current \$PATH" echo " --python=FILE Python executable path. Default: Path of python3 found in the current \$PATH" @@ -90,7 +91,7 @@ usage() { echo " --xnnpack Include this flag to build the XNNPACK backend." 
echo echo "Example:" - echo " $0 /path/to/source/root --output=cmake-out --Release --toolchain=/path/to/cmake/toolchain --buck2=/path/to/buck2 --python=/path/to/python3 --coreml --mps --xnnpack" + echo " $0 /path/to/source/root --output=cmake-out --toolchain=/path/to/cmake/toolchain --buck2=/path/to/buck2 --python=/path/to/python3 --coreml --mps --xnnpack" exit 0 } @@ -162,9 +163,11 @@ cmake_build() { -DEXECUTORCH_BUILD_COREML=$COREML \ -DEXECUTORCH_BUILD_MPS=$MPS \ -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ -DEXECUTORCH_BUILD_EXTENSION_APPLE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=$CUSTOM \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=$OPTIMIZED \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=$QUANTIZED \ @@ -188,6 +191,7 @@ mkdir -p "$HEADERS_PATH" "$SOURCE_ROOT_DIR"/build/print_exported_headers.py --buck2="$BUCK2" --targets \ //extension/module: \ + //extension/tensor: \ | rsync -av --files-from=- "$SOURCE_ROOT_DIR" "$HEADERS_PATH/executorch" cp "$SOURCE_ROOT_DIR/extension/apple/ExecuTorch/Exported/"*.h "$HEADERS_PATH/executorch" diff --git a/build/build_apple_llm_demo.sh b/build/build_apple_llm_demo.sh new file mode 100755 index 0000000000..9fe1c1bcd7 --- /dev/null +++ b/build/build_apple_llm_demo.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -euo pipefail + +ARTIFACTS_DIR_NAME="$1" +APP_PATH="extension/apple/Benchmark/Benchmark" + +xcodebuild build-for-testing \ + -project "${APP_PATH}.xcodeproj" \ + -scheme Benchmark \ + -destination "platform=iOS" \ + -sdk iphoneos \ + -allowProvisioningUpdates \ + DEVELOPMENT_TEAM=78E7V7QP35 \ + CODE_SIGN_STYLE=Manual \ + PROVISIONING_PROFILE_SPECIFIER="ExecuTorch Benchmark" \ + CODE_SIGN_IDENTITY="iPhone Distribution" \ + CODE_SIGNING_REQUIRED=No \ + CODE_SIGNING_ALLOWED=No + +# The hack to figure out where the xctest package locates +BUILD_DIR=$(xcodebuild -showBuildSettings -project "$APP_PATH.xcodeproj" -json | jq -r ".[0].buildSettings.BUILD_DIR") + +# Prepare the demo app, debug mode here is the default from xcodebuild and match +# with what we have in the test spec +MODE="Release" +PLATFORM="iphoneos" +pushd "${BUILD_DIR}/${MODE}-${PLATFORM}" + +rm -rf Payload && mkdir Payload +APP_NAME=Benchmark + +ls -lah +cp -r "${APP_NAME}.app" Payload && zip -vr "${APP_NAME}.ipa" Payload + +popd + +# Prepare the test suite +pushd "${BUILD_DIR}" + +ls -lah +zip -vr "${APP_NAME}.xctestrun.zip" *.xctestrun + +popd + +if [[ -n "${ARTIFACTS_DIR_NAME}" ]]; then + mkdir -p "${ARTIFACTS_DIR_NAME}" + # Prepare all the artifacts to upload + cp "${BUILD_DIR}/${MODE}-${PLATFORM}/${APP_NAME}.ipa" "${ARTIFACTS_DIR_NAME}/" + cp "${BUILD_DIR}/${APP_NAME}.xctestrun.zip" "${ARTIFACTS_DIR_NAME}/" + + ls -lah "${ARTIFACTS_DIR_NAME}/" +fi diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 80abd46409..c0011f175e 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -73,6 +73,7 @@ excludes = [ deps = [ "executorch", "executorch_no_prim_ops", + "extension_threadpool", "portable_kernels", ] @@ -116,6 +117,20 @@ deps = [ "executorch", ] +[targets.optimized_native_cpu_ops_oss] +buck_targets = [ + "//configurations:optimized_native_cpu_ops_oss", +] +filters = [ + ".cpp$", +] +excludes = [ +] +deps = [ + 
"executorch_no_prim_ops", + "executorch", + "portable_kernels", +] # ---------------------------------- core end ---------------------------------- # ---------------------------------- extension start ---------------------------------- [targets.extension_data_loader] @@ -158,6 +173,71 @@ deps = [ "executorch_no_prim_ops", ] +[targets.extension_llm_runner] +buck_targets = [ + "//extension/llm/runner:runner_lib", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_no_prim_ops", + "extension_module", + "extension_runner_util", +] + +[targets.extension_tensor] +buck_targets = [ + "//extension/tensor:tensor", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_no_prim_ops", +] + +[targets.extension_threadpool] +buck_targets = [ + "//extension/threadpool:threadpool", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_no_prim_ops", +] + +[targets.extension_training] +buck_targets = [ + "//extension/training/module:training_module", + "//extension/training/optimizer:sgd", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch_no_prim_ops", +] + +[targets.train_xor] +buck_targets = [ + "//extension/training/examples/XOR:train_xor", +] +filters = [ + ".cpp$", +] +excludes = [ + "^codegen", +] +deps = [ + "executorch", + "executorch_no_prim_ops", + "portable_kernels", +] # ---------------------------------- extension end ---------------------------------- # ---------------------------------- binary start ---------------------------------- @@ -282,10 +362,13 @@ filters = [ # ---------------------------------- LLama start ---------------------------------- [targets.custom_ops] buck_targets = [ - "//examples/models/llama2/custom_ops:custom_ops", + "//extension/llm/custom_ops:custom_ops", ] filters = [ - ".cpp$", + # Second clause is to pick up fht_neon.c/fht_avx.c from FFHT. TODO: + # remove filters and patch extract_sources.py's Buck query to fetch + # srcs; presumably filters is here to remove .h files. 
+ "(.cpp$)|(fht.*\\.c$)", ] excludes = [ "^codegen", @@ -294,6 +377,7 @@ deps = [ "executorch", "executorch_no_prim_ops", "optimized_kernels", + "extension_threadpool", "xnnpack_backend", ] @@ -316,5 +400,6 @@ deps = [ "portable_kernels", "quantized_kernels", "xnnpack_backend", + "optimized_native_cpu_ops_oss", ] # ---------------------------------- LLama end ---------------------------------- diff --git a/build/constraints/TARGETS b/build/constraints/TARGETS new file mode 100644 index 0000000000..fd09ad5ebd --- /dev/null +++ b/build/constraints/TARGETS @@ -0,0 +1 @@ +oncall("executorch") diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 962990d7c8..18b6c7801b 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -46,6 +46,9 @@ set(lib_list extension_module extension_module_static extension_runner_util + extension_tensor + extension_threadpool + extension_training xnnpack_backend XNNPACK cpuinfo diff --git a/build/extract_sources.py b/build/extract_sources.py index ce8b3de981..5004fe0c50 100755 --- a/build/extract_sources.py +++ b/build/extract_sources.py @@ -11,7 +11,7 @@ import re from enum import Enum -from typing import Any, Optional, Sequence +from typing import Any, List, Optional, Sequence from buck_util import Buck2Runner @@ -96,7 +96,12 @@ def __init__( else: self._config[k] = v - def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: + def get_sources( + self, graph: "Graph", runner: Buck2Runner, buck_args: Optional[List[str]] + ) -> frozenset[str]: + if buck_args is None: + buck_args = [] + if self._state == Target._InitState.READY: return self._sources # Detect cycles. @@ -113,7 +118,7 @@ def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: ) # Get the complete list of source files that this target depends on. - sources: set[str] = set(runner.run(["cquery", query])) + sources: set[str] = set(runner.run(["cquery", query] + buck_args)) # Keep entries that match all of the filters. filters = [re.compile(p) for p in self._config.get("filters", [])] @@ -128,7 +133,9 @@ def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: # its deps. Remove entries that are already covered by the transitive # set of dependencies. for dep in self._config.get("deps", []): - sources.difference_update(graph.by_name[dep].get_sources(graph, runner)) + sources.difference_update( + graph.by_name[dep].get_sources(graph, runner, buck_args) + ) self._sources = frozenset(sources) self._state = Target._InitState.READY @@ -173,6 +180,9 @@ def parse_args() -> argparse.Namespace: metavar="file", help="Path to the file to generate.", ) + parser.add_argument( + "--target-platforms", help="--target-platforms to pass to buck cquery, if any." + ) return parser.parse_args() @@ -199,8 +209,12 @@ def main(): # Run the queries and get the lists of source files. target_to_srcs: dict[str, list[str]] = {} runner: Buck2Runner = Buck2Runner(args.buck2) + buck_args = [] + if args.target_platforms: + buck_args = ["--target-platforms"] + buck_args.append(args.target_platforms) for name, target in graph.by_name.items(): - target_to_srcs[name] = sorted(target.get_sources(graph, runner)) + target_to_srcs[name] = sorted(target.get_sources(graph, runner, buck_args)) # Generate the requested format. 
output: bytes diff --git a/build/packaging/post_build_script.sh b/build/packaging/post_build_script.sh index fd71b18565..c785139b3d 100644 --- a/build/packaging/post_build_script.sh +++ b/build/packaging/post_build_script.sh @@ -7,4 +7,8 @@ set -eux -echo "This script is run after building ExecuTorch binaries" +# This script is run after building ExecuTorch binaries + +# Rename pip-out directory, to avoid using shared libraries in pip-out during +# smoke test. +mv pip-out BACKUP-pip-out diff --git a/build/pip_data_bin_init.py.in b/build/pip_data_bin_init.py.in index 9644c5621d..0c9d60e049 100644 --- a/build/pip_data_bin_init.py.in +++ b/build/pip_data_bin_init.py.in @@ -21,7 +21,9 @@ def _find_executable_files_under(dir): for filename in os.listdir(dir): filepath = os.path.join(dir, filename) if os.path.isfile(filepath) and os.access(filepath, os.X_OK): - bin_names.append(filename) + # Remove .exe suffix on windows. + filename_without_ext = os.path.splitext(filename)[0] + bin_names.append(filename_without_ext) return bin_names # The list of binaries to create wrapper functions for. diff --git a/build/test_ios_ci.sh b/build/test_ios_ci.sh index 5fa6ef7d24..50c6448d4b 100755 --- a/build/test_ios_ci.sh +++ b/build/test_ios_ci.sh @@ -11,6 +11,9 @@ APP_PATH="examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo" MODEL_NAME="mv3" SIMULATOR_NAME="executorch" +# If this is set, copy the build artifacts to this directory +ARTIFACTS_DIR_NAME="$1" + finish() { EXIT_STATUS=$? if xcrun simctl list | grep -q "$SIMULATOR_NAME"; then @@ -64,3 +67,49 @@ xcodebuild test \ -project "$APP_PATH.xcodeproj" \ -scheme MobileNetClassifierTest \ -destination name="$SIMULATOR_NAME" + +# NB: https://docs.aws.amazon.com/devicefarm/latest/developerguide/test-types-ios-xctest-ui.html +say "Package The Test Suite" + +xcodebuild build-for-testing \ + -project "$APP_PATH.xcodeproj" \ + -scheme MobileNetClassifierTest \ + -destination platform="iOS" \ + -allowProvisioningUpdates \ + DEVELOPMENT_TEAM=78E7V7QP35 \ + CODE_SIGN_STYLE=Manual \ + PROVISIONING_PROFILE_SPECIFIER=ExecuTorchDemo \ + CODE_SIGN_IDENTITY="iPhone Distribution" + +# The hack to figure out where the xctest package locates +BUILD_DIR=$(xcodebuild -showBuildSettings -project "$APP_PATH.xcodeproj" -json | jq -r ".[0].buildSettings.BUILD_DIR") + +# Prepare the demo app +MODE="Debug" +PLATFORM="iphoneos" +pushd "${BUILD_DIR}/${MODE}-${PLATFORM}" + +rm -rf Payload && mkdir Payload +MOCK_APP_NAME=ExecuTorchDemo + +ls -lah +cp -r "${MOCK_APP_NAME}.app" Payload && zip -vr "${MOCK_APP_NAME}.ipa" Payload + +popd + +# Prepare the test suite +pushd "${BUILD_DIR}" + +ls -lah +zip -vr "${MOCK_APP_NAME}.xctestrun.zip" *.xctestrun + +popd + +if [[ -n "${ARTIFACTS_DIR_NAME}" ]]; then + mkdir -p "${ARTIFACTS_DIR_NAME}" + # Prepare all the artifacts to upload + cp "${BUILD_DIR}/${MODE}-${PLATFORM}/${MOCK_APP_NAME}.ipa" "${ARTIFACTS_DIR_NAME}/" + cp "${BUILD_DIR}/${MOCK_APP_NAME}.xctestrun.zip" "${ARTIFACTS_DIR_NAME}/" + + ls -lah "${ARTIFACTS_DIR_NAME}/" +fi diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp index a7790be7fe..3076cde1a9 100644 --- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp +++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include "${fn_header}" // Generated Function import headers @@ -21,7 +22,8 @@ // JIT op registry instead of c10 dispatcher. 
JIT op registry only takes boxed // kernels, so we are calling unboxing functions in UnboxingFunctions.h to cast // arguments into C++ types (instead of IValue) and delegate to unboxed kernels. -using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>; +using KernelSpan = + ::executorch::runtime::Span; namespace torch { namespace executor { namespace function { @@ -31,15 +33,15 @@ static Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; -// Explicitly convert to ArrayRef, so that the API can take an empty C array of +// Explicitly convert to Span, so that the API can take an empty C array of // Kernels. -static KernelArrayRef kernel_array_ref( +static KernelSpan kernel_span( kernels_to_register, kernels_to_register + sizeof(kernels_to_register) / sizeof(Kernel)); // Return value not used. Keep the static variable assignment to register // kernels in static initialization time. -static auto success_with_kernel_reg = register_kernels(kernel_array_ref); +static auto success_with_kernel_reg = register_kernels(kernel_span); } // namespace } // namespace function } // namespace executor diff --git a/codegen/templates/RegisterKernels.cpp b/codegen/templates/RegisterKernels.cpp index 2313a30a30..91eac20022 100644 --- a/codegen/templates/RegisterKernels.cpp +++ b/codegen/templates/RegisterKernels.cpp @@ -19,7 +19,8 @@ Error register_all_kernels() { Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; - Error success_with_kernel_reg = register_kernels(kernels_to_register); + Error success_with_kernel_reg = + ::executorch::runtime::register_kernels({kernels_to_register}); if (success_with_kernel_reg != Error::Ok) { ET_LOG(Error, "Failed register all kernels"); return success_with_kernel_reg; diff --git a/codegen/tools/gen_oplist.py b/codegen/tools/gen_oplist.py index f21fb8dc6b..fbb191a6a8 100644 --- a/codegen/tools/gen_oplist.py +++ b/codegen/tools/gen_oplist.py @@ -230,7 +230,7 @@ def gen_oplist( if model_file_path: assert os.path.isfile( model_file_path - ), "The value for --model_file_path needs to be a valid file." + ), f"The value for --model_file_path needs to be a valid file, got {model_file_path}" op_set.update(_get_operators(model_file_path)) source_name = model_file_path et_kernel_metadata = merge_et_kernel_metadata( @@ -239,7 +239,7 @@ def gen_oplist( if ops_schema_yaml_path: assert os.path.isfile( ops_schema_yaml_path - ), "The value for --ops_schema_yaml_path needs to be a valid file." 
+ ), f"The value for --ops_schema_yaml_path needs to be a valid file, got {ops_schema_yaml_path}" et_kernel_metadata = merge_et_kernel_metadata( et_kernel_metadata, _get_et_kernel_metadata_from_ops_yaml(ops_schema_yaml_path), @@ -300,14 +300,33 @@ def main(args: List[Any]) -> None: ) options = parser.parse_args(args) - gen_oplist( - output_path=options.output_path, - model_file_path=options.model_file_path, - ops_schema_yaml_path=options.ops_schema_yaml_path, - root_ops=options.root_ops, - ops_dict=options.ops_dict, - include_all_operators=options.include_all_operators, - ) + try: + gen_oplist( + output_path=options.output_path, + model_file_path=options.model_file_path, + ops_schema_yaml_path=options.ops_schema_yaml_path, + root_ops=options.root_ops, + ops_dict=options.ops_dict, + include_all_operators=options.include_all_operators, + ) + except Exception as e: + command = ["python codegen/tools/gen_oplist.py"] + if options.model_file_path: + command.append(f"--model_file_path {options.model_file_path}") + if options.ops_schema_yaml_path: + command.append(f"--ops_schema_yaml_path {options.ops_schema_yaml_path}") + if options.root_ops: + command.append(f"--root_ops {options.root_ops}") + if options.ops_dict: + command.append(f"--ops_dict {options.ops_dict}") + if options.include_all_operators: + command.append("--include-all-operators") + repro_command = " ".join(command) + raise RuntimeError( + f"""Failed to generate selected_operators.yaml. Repro command: + {repro_command} + """ + ) from e if __name__ == "__main__": diff --git a/codegen/tools/test/test_gen_oplist.py b/codegen/tools/test/test_gen_oplist.py index d455ddb689..bd1d008248 100644 --- a/codegen/tools/test/test_gen_oplist.py +++ b/codegen/tools/test/test_gen_oplist.py @@ -42,7 +42,7 @@ def test_gen_op_list_with_wrong_path( mock_get_operators: NonCallableMock, ) -> None: args = ["--output_path=wrong_path", "--model_file_path=path2"] - with self.assertRaises(AssertionError): + with self.assertRaises(RuntimeError): gen_oplist.main(args) @patch("executorch.codegen.tools.gen_oplist._get_kernel_metadata_for_model") diff --git a/configurations/targets.bzl b/configurations/targets.bzl index dc88c13744..6a5341c290 100644 --- a/configurations/targets.bzl +++ b/configurations/targets.bzl @@ -20,7 +20,7 @@ def define_common_targets(): runtime.cxx_library( name = "executor_cpu_optimized", exported_deps = [ - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", ] + get_all_cpu_backend_targets(), visibility = [ "//executorch/test/...", @@ -28,7 +28,7 @@ def define_common_targets(): ], ) - # Add a commong configuration of cpu optimized operators. This adds a bit of confusion + # Add a common configuration of cpu optimized operators. This adds a bit of confusion # with the above executorch_cpu_optimized target. 
Generally it would make sense # to just add optimized operators to that target but because executorch_cpu_optimized # might be used elsewhere, I dont want to include ops in that target and find out @@ -50,3 +50,21 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], ) + + # TODO(T183193812): delete this target after optimized-oss.yaml is gone + executorch_generated_lib( + name = "optimized_native_cpu_ops_oss", + deps = [ + "//executorch/kernels/optimized:optimized_operators", + "//executorch/kernels/optimized:optimized_oplist", + "//executorch/kernels/portable:executorch_aten_ops", + "//executorch/kernels/portable:operators", + ], + functions_yaml_target = "//executorch/kernels/optimized:optimized-oss.yaml", + fallback_yaml_target = "//executorch/kernels/portable:functions.yaml", + define_static_targets = True, + visibility = [ + "//executorch/examples/...", + "@EXECUTORCH_CLIENTS", + ], + ) diff --git a/devtools/CMakeLists.txt b/devtools/CMakeLists.txt new file mode 100644 index 0000000000..776d421a8d --- /dev/null +++ b/devtools/CMakeLists.txt @@ -0,0 +1,220 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Please this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ + +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(NOT FLATCC_EXECUTABLE) + set(FLATCC_EXECUTABLE flatcc) +endif() + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) +endif() + +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +if(NOT PYTHON_EXECUTABLE) + resolve_python_executable() +endif() + +if(NOT FLATC_EXECUTABLE) + set(FLATC_EXECUTABLE flatc) +endif() + +# Paths to headers generated from the .fbs files. set(_etdump_schemas +# etdump_schema_flatcc.fbs scalar_type.fbs) + +set(_etdump_schema_names "etdump_schema_flatcc.fbs" "scalar_type.fbs") +set(_bundled_input_schema_names "bundled_program_schema.fbs" "scalar_type.fbs") + +foreach(schema_file ${_etdump_schema_names}) + list(APPEND _etdump_schema__srcs + "${CMAKE_CURRENT_SOURCE_DIR}/etdump/${schema_file}" + ) +endforeach() + +foreach(schema_file ${_bundled_input_schema_names}) + list(APPEND _bundled_program_schema__srcs + "${CMAKE_CURRENT_SOURCE_DIR}/bundled_program/schema/${schema_file}" + ) +endforeach() + +set(FLATCC_TEST + OFF + CACHE BOOL "" +) +set(FLATCC_REFLECTION + OFF + CACHE BOOL "" +) +set(FLATCC_DEBUG_CLANG_SANITIZE + OFF + CACHE BOOL "" +) +set(_flatcc_source_dir ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/flatcc) +add_subdirectory(${_flatcc_source_dir} ${CMAKE_BINARY_DIR}/third-party/flatcc) + +# Fix for "relocation R_X86_64_32 against `.rodata' can not be used when making +# a shared object; recompile with -fPIC" when building on some x86 linux +# systems. +set_property(TARGET flatccrt PROPERTY POSITION_INDEPENDENT_CODE ON) + +# Assume we are cross-compiling and the CMAKE_TOOLCHAIN_FILE is set +include(ExternalProject) + +# The include directory that will contain the generated schema headers. +set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/devtools/include") +set(_bundled_schema__include_dir "${CMAKE_BINARY_DIR}/devtools/bundled_program") + +# TODO(dbort): Only enable this when cross-compiling. It can cause build race +# conditions (libflatcc.a errors) when enabled. 
+option(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT + "Whether to build the flatcc commandline tool as a separate project" ON +) + +if(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT) + # Add the host project. We build this separately so that we can generate + # headers on the host during the build, even if we're cross-compiling the + # flatcc runtime to a different architecture. + execute_process( + COMMAND + ${CMAKE_COMMAND} ${_flatcc_source_dir} -DFLATCC_TEST=OFF + -DFLATCC_REFLECTION=OFF + # See above comment about POSITION_INDEPENDENT_CODE. + -DCMAKE_POSITION_INDEPENDENT_CODE=ON -B${CMAKE_BINARY_DIR}/_host_build + ) + execute_process( + COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR}/_host_build + ) + set(_etdump_schema_gen_dep) + # TODO(dbort): flatcc installs its files directly in its source directory + # instead of under CMAKE_BINARY_DIR, and it has no options to avoid doing + # this. We build flatcc twice in the executorch build: once to get the + # `flatcc` host commandline tool, and once to get the (potentially + # cross-compiled) target runtime library. The host build will put its outputs + # in the source tree, making the cross-compiling target build think that the + # outputs have already been built. It will then try to link against the + # host-architecture libraries, failing when cross-compiling. To work around + # this, delete the host outputs after running this command (which only runs + # when setting up the cmake files, not when actually building). This leaves + # room for the target build to put its own files in the source tree. We should + # try to remove this hack, ideally by submitting an upstream PR that adds an + # option to change the installation location. + set(_etdump_schema_cleanup_paths ${_flatcc_source_dir}/bin/* + ${_flatcc_source_dir}/lib/* + ) +else() + # If we're not cross-compiling, we can just use the plain commandline target. + set(_etdump_schema_gen_dep flatcc_cli) + set(_etdump_schema_cleanup_paths "") +endif() + +set(_etdump_schema__outputs) +foreach(fbs_file ${_etdump_schema_names}) + string(REGEX REPLACE "[.]fbs$" "_reader.h" generated "${fbs_file}") + list(APPEND _etdump_schema__outputs + "${_program_schema__include_dir}/executorch/devtools/etdump/${generated}" + ) + string(REGEX REPLACE "[.]fbs$" "_builder.h" generated "${fbs_file}") + list(APPEND _etdump_schema__outputs + "${_program_schema__include_dir}/executorch/devtools/etdump/${generated}" + ) +endforeach() + +# lint_cmake: -linelength +set(_bundled_program_schema__outputs) +foreach(fbs_file ${_bundled_input_schema_names}) + string(REGEX REPLACE "[.]fbs$" "_generated.h" generated "${fbs_file}") + list( + APPEND + _bundled_program_schema__outputs + "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema/${generated}" + ) +endforeach() + +add_library(etdump_schema INTERFACE ${_etdump_schema__outputs}) +add_library( + bundled_program_schema INTERFACE ${_bundled_program_schema__outputs} +) + +file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/devtools/etdump) +file(MAKE_DIRECTORY + ${_program_schema__include_dir}/executorch/devtools/bundled_program +) + +add_custom_command( + OUTPUT ${_etdump_schema__outputs} + COMMAND + # Note that the flatcc project actually writes its outputs into the source + # tree instead of under the binary directory, and there's no way to change + # that behavior. 
+ ${_flatcc_source_dir}/bin/flatcc -cwr -o + ${_program_schema__include_dir}/executorch/devtools/etdump + ${_etdump_schema__srcs} + COMMAND rm -f ${_etdump_schema_cleanup_paths} + DEPENDS ${_etdump_schema_gen_dep} + COMMENT "Generating etdump headers" +) + +add_library( + etdump ${CMAKE_CURRENT_SOURCE_DIR}/etdump/etdump_flatcc.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/etdump/emitter.cpp +) + +target_link_libraries( + etdump + PUBLIC etdump_schema flatccrt + PRIVATE executorch +) + +add_custom_command( + OUTPUT ${_bundled_program_schema__outputs} + COMMAND + ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o + "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema" + ${_bundled_program_schema__srcs} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/devtools + DEPENDS ${FLATC_EXECUTABLE} ${_bundled_program_schema__srcs} + COMMENT "Generating bundled_program headers" + VERBATIM +) + +# add_library(bundled_program INTERFACE ${_bundled_program_schema__outputs}) +add_library( + bundled_program + ${CMAKE_CURRENT_SOURCE_DIR}/bundled_program/bundled_program.cpp +) +target_link_libraries(bundled_program executorch bundled_program_schema) + +set_target_properties(bundled_program PROPERTIES LINKER_LANGUAGE CXX) +target_include_directories( + bundled_program PUBLIC ${_bundled_schema__include_dir} + ${EXECUTORCH_ROOT}/third-party/flatbuffers/include +) + +target_include_directories( + etdump PUBLIC ${_program_schema__include_dir} ${_flatcc_source_dir}/include +) + +# Install libraries +install( + TARGETS bundled_program etdump flatccrt + DESTINATION ${CMAKE_BINARY_DIR}/lib + INCLUDES + DESTINATION ${_common_include_directories} +) diff --git a/devtools/TARGETS b/devtools/TARGETS new file mode 100644 index 0000000000..06964b8387 --- /dev/null +++ b/devtools/TARGETS @@ -0,0 +1,13 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +oncall("executorch") + +python_library( + name = "lib", + srcs = ["__init__.py"], + deps = [ + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", + "//executorch/devtools/inspector:lib", + ], +) diff --git a/devtools/__init__.py b/devtools/__init__.py new file mode 100644 index 0000000000..821d75901f --- /dev/null +++ b/devtools/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
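The new executorch.devtools package (its __init__.py follows) gathers the profiling and bundled-program entry points under a single namespace. A hedged usage sketch of the re-exported Inspector; the artifact paths are placeholders and assume an ETDump (and optional ETRecord) was produced by an earlier profiled run and export:

from executorch.devtools import Inspector

# Placeholder paths: an ETDump comes from a runtime run with event tracing
# enabled, an ETRecord is optionally generated at export time.
inspector = Inspector(etdump_path="model.etdump", etrecord="model.etrecord")

# Print the per-operator runtime statistics collected in the ETDump.
inspector.print_data_tabular()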
+ +import executorch.devtools.inspector as inspector +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.etrecord import ETRecord, generate_etrecord, parse_etrecord +from executorch.devtools.inspector import Inspector + +__all__ = [ + "ETRecord", + "Inspector", + "generate_etrecord", + "parse_etrecord", + "inspector", + "BundledProgram", +] diff --git a/devtools/backend_debug/TARGETS b/devtools/backend_debug/TARGETS new file mode 100644 index 0000000000..95529192a3 --- /dev/null +++ b/devtools/backend_debug/TARGETS @@ -0,0 +1,23 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.python_library( + name = "delegation_info", + srcs = [ + "__init__.py", + "delegation_info.py", + ], + visibility = [ + "//executorch/...", + "//executorch/exir/backend/...", + "//executorch/test/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "fbsource//third-party/pypi/pandas:pandas", + "//caffe2:torch", + "//executorch/exir:lowered_backend_module", + "//executorch/exir/backend/canonical_partitioners:duplicate_constant_node_pass", + ], +) diff --git a/devtools/backend_debug/__init__.py b/devtools/backend_debug/__init__.py new file mode 100644 index 0000000000..b457b7d11d --- /dev/null +++ b/devtools/backend_debug/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.devtools.backend_debug.delegation_info import ( + DelegationBreakdown, + get_delegation_info, +) + +__all__ = ["DelegationBreakdown", "get_delegation_info"] diff --git a/devtools/backend_debug/delegation_info.py b/devtools/backend_debug/delegation_info.py new file mode 100644 index 0000000000..b237d162f7 --- /dev/null +++ b/devtools/backend_debug/delegation_info.py @@ -0,0 +1,176 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import re +from collections import defaultdict +from dataclasses import asdict, dataclass +from typing import Dict + +import pandas as pd +import torch + + +# Column names of the DataFrame returned by DelegationInfo.get_operator_delegation_dataframe() +# which describes the summarized delegation information grouped by each operator type +_OCCURRENCES_IN_DELEGATED_GRAPHS = "occurrences_in_delegated_graphs" +_OCCURRENCES_IN_NON_DELEGATED_GRAPHS = "occurrences_in_non_delegated_graphs" + + +@dataclass +class DelegationBreakdown: + """ + DelegationBreakdown contains the number of delegated and non-delegated nodes + of the operator type op_type. + + Args: + delegated: The number of delegated nodes. + non_delegated: The number of non-delegated nodes. + """ + + op_type: str = "" + delegated: int = 0 + non_delegated: int = 0 + + +@dataclass +class DelegationInfo: + """ + DelegationInfo contains information of a delegated graph module. + + Args: + num_delegated_subgraphs: The number of delegated subgraphs. + num_delegated_nodes: The number of delegated nodes. + num_non_delegated_nodes: The number of non-delegated nodes. + delegation_by_operator: A dictionary of operator type to DelegationBreakdown. 
+ """ + + num_delegated_subgraphs: int + num_delegated_nodes: int + num_non_delegated_nodes: int + delegation_by_operator: Dict[str, DelegationBreakdown] + + def get_summary(self) -> str: + """ + Get a summary of the delegation information in string format. + + Args: + None + + Returns: + A string containing information of some class attributes for easy print-out. + """ + + # Assemble and return the summary string + summary_str = f"Total delegated subgraphs: {self.num_delegated_subgraphs}\n" + summary_str += f"Number of delegated nodes: {self.num_delegated_nodes}\n" + summary_str += ( + f"Number of non-delegated nodes: {self.num_non_delegated_nodes}\n" + ) + return summary_str + + def get_operator_delegation_dataframe(self) -> pd.DataFrame: + """ + Get the delegation information grouped by operator type in a pandas DataFrame. + + Args: + None + + Returns: + Returns a pandas DataFrame containing the following columns: + - op_type: The operator type, with the last row being "Total". + - occurrences_in_delegated_graphs: The number of occurrences of the op_type in delegated subgraphs. + - occurrences_in_non_delegated_graphs: The number of occurrences of the op_type not in delegated subgraphs. + With the last row being the total number of delegated and non-delegated occurrences of each op_type. + """ + + # Convert the dict to a dataframe + list_of_dicts = [ + asdict(breakdown) for breakdown in self.delegation_by_operator.values() + ] + df = pd.DataFrame(list_of_dicts) + # Rename columns for better understandability + df = df.rename( + columns={ + "delegated": _OCCURRENCES_IN_DELEGATED_GRAPHS, + "non_delegated": _OCCURRENCES_IN_NON_DELEGATED_GRAPHS, + } + ) + df = df.sort_values(by="op_type", ignore_index=True) + + # Add a Total row at the bottom + total_delegated_nodes = df[_OCCURRENCES_IN_DELEGATED_GRAPHS].sum() + total_non_delegated_nodes = df[_OCCURRENCES_IN_NON_DELEGATED_GRAPHS].sum() + df.loc[len(df)] = ["Total", total_delegated_nodes, total_non_delegated_nodes] + + return df + + +def get_delegation_info( + graph_module: torch.fx.GraphModule, +) -> DelegationInfo: + """ + Util function to get the delegation information of the given graph module. + + Args: + graph_module: The lowered graph module to get the delegation information from. + + Returns: + Return a DelegationInfo object containing the delegation information. + """ + + def _get_op_type(node_name: str) -> str: + # node_name is in format or _x in which x is an integer suffix. 
+ return re.sub(r"_[\d]+$", "", node_name) + + op_occurrences_dict = defaultdict(lambda: DelegationBreakdown()) + + def _insert_op_occurrences_dict(node_name: str, delegated: bool) -> None: + op_type = _get_op_type(node_name) + op_occurrences_dict[op_type].op_type = op_type + if delegated: + op_occurrences_dict[op_type].delegated += 1 + else: + op_occurrences_dict[op_type].non_delegated += 1 + + delegated_subgraph_counter = 0 + + lowered_module_dict = { + node.name: getattr(graph_module, node.name) + for node in graph_module.graph.nodes + if node.op == "get_attr" and node.name.startswith("lowered_module_") + } + + for node in graph_module.graph.nodes: + if ( + node.op == "call_function" + and _get_op_type(node.name) != "executorch_call_delegate" + ): + # Non-delegated node + _insert_op_occurrences_dict(node_name=node.name, delegated=False) + # Check if the node is a lowered module + if node.op == "get_attr" and node.name.startswith("lowered_module_"): + lowered_module = lowered_module_dict[node.name] + delegated_subgraph_counter += 1 + for node_in_lowered_module in lowered_module.original_module.graph.nodes: + if node_in_lowered_module.op == "call_function": + # Delegated node + _insert_op_occurrences_dict( + node_name=node_in_lowered_module.name, delegated=True + ) + + # Calculate the total number of delegated and non-delegated nodes + num_delegated_nodes = 0 + num_non_delegated_nodes = 0 + for value in op_occurrences_dict.values(): + num_delegated_nodes += value.delegated + num_non_delegated_nodes += value.non_delegated + + return DelegationInfo( + num_delegated_nodes=num_delegated_nodes, + num_non_delegated_nodes=num_non_delegated_nodes, + num_delegated_subgraphs=delegated_subgraph_counter, + delegation_by_operator=op_occurrences_dict, + ) diff --git a/devtools/backend_debug/tests/TARGETS b/devtools/backend_debug/tests/TARGETS new file mode 100644 index 0000000000..ae234df8ce --- /dev/null +++ b/devtools/backend_debug/tests/TARGETS @@ -0,0 +1,17 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "test_delegation_info", + srcs = [ + "test_delegation_info.py", + ], + deps = [ + "fbsource//third-party/pypi/pandas:pandas", + "//caffe2:torch", + "//executorch/devtools/backend_debug:delegation_info", + "//executorch/exir:lib", + "//executorch/exir/backend/test:op_partitioner_demo", + ], +) diff --git a/devtools/backend_debug/tests/test_delegation_info.py b/devtools/backend_debug/tests/test_delegation_info.py new file mode 100644 index 0000000000..6ff5169094 --- /dev/null +++ b/devtools/backend_debug/tests/test_delegation_info.py @@ -0,0 +1,79 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
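Before the unit test that follows, a short usage sketch of the delegation-info API defined above. It mirrors the test below: the toy module and the AddMulPartitionerDemo partitioner stand in for a real model and backend partitioner:

    import torch
    from executorch.devtools.backend_debug import get_delegation_info
    from executorch.exir import to_edge
    from executorch.exir.backend.test.op_partitioner_demo import AddMulPartitionerDemo

    class TinyModel(torch.nn.Module):
        def forward(self, a, x, b):
            # mm/add are claimed by the demo partitioner; sub stays non-delegated.
            return torch.mm(a, x) + b - a

    m = TinyModel()
    inputs = (torch.randn(2, 2), torch.randn(2, 2), torch.randn(2, 2))
    edge = to_edge(torch.export.export(m, inputs)).to_backend(AddMulPartitionerDemo())

    info = get_delegation_info(edge.exported_program().graph_module)
    print(info.get_summary())                        # subgraph and node counts
    print(info.get_operator_delegation_dataframe())  # per-op_type table, "Total" row last
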
+ +import unittest + +import pandas as pd + +import torch +from executorch.devtools.backend_debug import DelegationBreakdown, get_delegation_info +from executorch.exir import to_edge +from executorch.exir.backend.test.op_partitioner_demo import AddMulPartitionerDemo +from pandas.testing import assert_frame_equal + + +class TestUtils(unittest.TestCase): + def test_get_delegation_info(self): + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, a, x, b): + y = torch.mm(a, x) + z = y + b + a = z - a + y = torch.mm(a, x) + z = y + b + return z + + m = Model() + inputs = (torch.randn(2, 2), torch.randn(2, 2), torch.randn(2, 2)) + edge = to_edge(torch.export.export(m, inputs)).to_backend( + AddMulPartitionerDemo() + ) + delegation_info = get_delegation_info(edge.exported_program().graph_module) + + self.assertEqual(delegation_info.num_delegated_subgraphs, 2) + self.assertEqual(delegation_info.num_delegated_nodes, 4) + self.assertEqual(delegation_info.num_non_delegated_nodes, 3) + expected_delegation_by_op_dict = { + "aten_add_tensor": DelegationBreakdown( + op_type="aten_add_tensor", delegated=2, non_delegated=0 + ), + "aten_mm_default": DelegationBreakdown( + op_type="aten_mm_default", delegated=2, non_delegated=0 + ), + "aten_sub_tensor": DelegationBreakdown( + op_type="aten_sub_tensor", delegated=0, non_delegated=1 + ), + "getitem": DelegationBreakdown( + op_type="getitem", delegated=0, non_delegated=2 + ), + } + self.assertEqual( + delegation_info.delegation_by_operator, expected_delegation_by_op_dict + ) + + self.assertIn( + "Total delegated subgraphs", + delegation_info.get_summary(), + ) + + df = delegation_info.get_operator_delegation_dataframe() + expected_df = pd.DataFrame( + { + "op_type": [ + "aten_add_tensor", + "aten_mm_default", + "aten_sub_tensor", + "getitem", + "Total", + ], + "occurrences_in_delegated_graphs": [2, 2, 0, 0, 4], + "occurrences_in_non_delegated_graphs": [0, 0, 1, 2, 3], + } + ) + assert_frame_equal(expected_df, df) diff --git a/devtools/bundled_program/TARGETS b/devtools/bundled_program/TARGETS new file mode 100644 index 0000000000..27560f7087 --- /dev/null +++ b/devtools/bundled_program/TARGETS @@ -0,0 +1,51 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() + +runtime.python_library( + name = "core", + srcs = [ + "core.py", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + ":config", + ":version", + "//caffe2:torch", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", + "//executorch/exir:schema", + "//executorch/exir:tensor", + "//executorch/exir/_serialize:lib", + ], +) + +runtime.python_library( + name = "config", + srcs = [ + "config.py", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "fbsource//third-party/pypi/typing-extensions:typing-extensions", + "//caffe2:torch", + ], +) + +runtime.python_library( + name = "version", + srcs = [ + "version.py", + ], + visibility = [ + "//executorch/devtools/...", + ], +) diff --git a/sdk/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp similarity index 91% rename from sdk/bundled_program/bundled_program.cpp rename to devtools/bundled_program/bundled_program.cpp index 39e6ea960e..54f84f6fef 100644 --- a/sdk/bundled_program/bundled_program.cpp +++ b/devtools/bundled_program/bundled_program.cpp @@ -6,7 +6,7 @@ * LICENSE file 
in the root directory of this source tree. */ -#include +#include #include #include @@ -16,20 +16,28 @@ #include #endif // USE_ATEN_LIB +#include #include #include #include #include #include -#include -namespace torch { -namespace executor { +using exec_aten::ArrayRef; +using exec_aten::Half; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::Method; +using ::executorch::runtime::Result; + +namespace executorch { namespace bundled_program { namespace { -#define kMaxDim 16 +constexpr size_t kMaxDim = 16; #ifdef USE_ATEN_LIB @@ -53,6 +61,7 @@ at::Tensor tensor_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { } #else // !USE_ATEN_LIB +using torch::executor::TensorImpl; // Create a tensorimpl with same content using bundled tensor TensorImpl impl_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { ScalarType scalar_type = @@ -234,9 +243,9 @@ get_method_test_suite( } // namespace // Load testset_idx-th bundled data into the Method -__ET_NODISCARD Error LoadBundledInput( +ET_NODISCARD Error load_bundled_input( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx) { ET_CHECK_OR_RETURN_ERROR( bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( @@ -319,19 +328,19 @@ __ET_NODISCARD Error LoadBundledInput( ET_CHECK_OR_RETURN_ERROR( status == Error::Ok, NotSupported, - "set_input failed during load bundled inputs with status %" PRIu32, - static_cast(status)); + "set_input failed during load bundled inputs with status 0%" PRIx32, + static_cast(status)); } - internal::event_tracer_set_bundled_input_index( + ::executorch::runtime::internal::event_tracer_set_bundled_input_index( method.get_event_tracer(), testset_idx); return Error::Ok; } -__ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( +ET_NODISCARD Error verify_method_outputs( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol, double atol) { @@ -390,12 +399,12 @@ __ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( return Error::Ok; } -__ET_NODISCARD Error GetProgramData( +ET_NODISCARD Error get_program_data( void* file_data, size_t file_data_len, const void** out_program_data, size_t* out_program_data_len) { - if (IsBundledProgram(file_data)) { + if (is_bundled_program(file_data, file_data_len)) { auto program_bundled = bundled_program_flatbuffer::GetBundledProgram(file_data); *out_program_data = program_bundled->program()->data(); @@ -410,11 +419,13 @@ __ET_NODISCARD Error GetProgramData( return Error::Ok; } -bool IsBundledProgram(void* file_data) { +bool is_bundled_program(void* file_data, ET_UNUSED size_t file_data_len) { + // Even though the flatbuffer API doesn't accept a length, it's important to + // require one so that we could change the internal representation, or use a + // future API that does require a length. return bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( file_data); } } // namespace bundled_program -} // namespace executor -} // namespace torch +} // namespace executorch diff --git a/devtools/bundled_program/bundled_program.h b/devtools/bundled_program/bundled_program.h new file mode 100644 index 0000000000..884ca6f21b --- /dev/null +++ b/devtools/bundled_program/bundled_program.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace executorch { +namespace bundled_program { + +/** + * An opaque pointer to a serialized bundled program. + */ +using SerializedBundledProgram = const void; + +/** + * Load testset_idx-th bundled input of method_idx-th Method test in + * bundled_program_ptr to given Method. + * + * @param[in] method The Method to verify. + * @param[in] bundled_program_ptr The bundled program contains expected output. + * @param[in] testset_idx The index of input needs to be set into given Method. + * + * @returns Return Error::Ok if load successfully, or the error happens during + * execution. + */ +ET_NODISCARD ::executorch::runtime::Error load_bundled_input( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, + size_t testset_idx); + +/** + * Compare the Method's output with testset_idx-th bundled expected + * output in method_idx-th Method test. + * + * @param[in] method The Method to extract outputs from. + * @param[in] bundled_program_ptr The bundled program contains expected output. + * @param[in] testset_idx The index of expected output needs to be compared. + * @param[in] rtol Relative tolerance used for data comparsion. + * @param[in] atol Absolute tolerance used for data comparsion. + * + * @returns Return Error::Ok if two outputs match, or the error happens during + * execution. + */ +ET_NODISCARD ::executorch::runtime::Error verify_method_outputs( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, + size_t testset_idx, + double rtol = 1e-5, + double atol = 1e-8); + +/** + * Finds the serialized ExecuTorch program data in the provided bundled program + * file data. + * + * The returned buffer is appropriate for constructing a + * torch::executor::Program. + * + * @param[in] file_data The contents of an ExecuTorch program or bundled program + * file. + * @param[in] file_data_len The length of file_data, in bytes. + * @param[out] out_program_data The serialized Program data, if found. + * @param[out] out_program_data_len The length of out_program_data, in bytes. + * + * @returns Error::Ok if the given file is bundled program, a program was found + * in it, and out_program_data/out_program_data_len point to the data. Other + * values on failure. + */ +ET_NODISCARD ::executorch::runtime::Error get_program_data( + void* file_data, + size_t file_data_len, + const void** out_program_data, + size_t* out_program_data_len); + +/** + * Checks whether the given file is a bundled program. + * + * @param[in] file_data The contents of the given file. + * @param[in] file_data_len The length of file_data, in bytes. + * + * @returns true if the given file is a bundled program, false otherwise + */ +bool is_bundled_program(void* file_data, size_t file_data_len); + +/// DEPRECATED: Use the version with the file_data_len parameter. +ET_DEPRECATED inline bool is_bundled_program(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. + return is_bundled_program(file_data, 128); +} + +} // namespace bundled_program +} // namespace executorch + +namespace torch { +namespace executor { +namespace bundled_program { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using serialized_bundled_program = + ::executorch::bundled_program::SerializedBundledProgram; + +ET_NODISCARD inline ::executorch::runtime::Error LoadBundledInput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx) { + return ::executorch::bundled_program::load_bundled_input( + method, bundled_program_ptr, testset_idx); +} + +ET_NODISCARD inline ::executorch::runtime::Error +VerifyResultWithBundledExpectedOutput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx, + double rtol = 1e-5, + double atol = 1e-8) { + return ::executorch::bundled_program::verify_method_outputs( + method, bundled_program_ptr, testset_idx, rtol, atol); +} + +ET_NODISCARD inline ::executorch::runtime::Error GetProgramData( + void* file_data, + size_t file_data_len, + const void** out_program_data, + size_t* out_program_data_len) { + return ::executorch::bundled_program::get_program_data( + file_data, file_data_len, out_program_data, out_program_data_len); +} + +inline bool IsBundledProgram(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. + return ::executorch::bundled_program::is_bundled_program(file_data, 128); +} +} // namespace bundled_program +} // namespace executor +} // namespace torch diff --git a/sdk/bundled_program/config.py b/devtools/bundled_program/config.py similarity index 88% rename from sdk/bundled_program/config.py rename to devtools/bundled_program/config.py index 3bfbe7bc69..9756317760 100644 --- a/sdk/bundled_program/config.py +++ b/devtools/bundled_program/config.py @@ -39,7 +39,7 @@ """ All supported types for input/expected output of MethodTestCase. -Namedtuple is also supported and listed implicity since it is a subclass of tuple. +Namedtuple is also supported and listed implicitly since it is a subclass of tuple. """ # pyre-ignore @@ -59,23 +59,23 @@ def __init__( """Single test case for verifying specific method Args: - input: All inputs required by eager_model with specific inference method for one-time execution. + inputs: All inputs required by eager_model with specific inference method for one-time execution. It is worth mentioning that, although both bundled program and ET runtime apis support setting input other than `torch.tensor` type, only the input in `torch.tensor` type will be actually updated in the method, and the rest of the inputs will just do a sanity check if they match the default value in method. - expected_output: Expected output of given input for verification. It can be None if user only wants to use the test case for profiling. + expected_outputs: Expected output of given input for verification. It can be None if user only wants to use the test case for profiling. Returns: self """ # TODO(gasoonjia): Update type check logic. - # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sannity check. + # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sanity check. self.inputs: List[ConfigValue] = self._flatten_and_sanity_check(inputs) self.expected_outputs: List[ConfigValue] = [] if expected_outputs is not None: - # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sannity check. + # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sanity check. 
self.expected_outputs = self._flatten_and_sanity_check(expected_outputs) def _flatten_and_sanity_check( diff --git a/sdk/bundled_program/core.py b/devtools/bundled_program/core.py similarity index 98% rename from sdk/bundled_program/core.py rename to devtools/bundled_program/core.py index 4fede5e595..c775fb1510 100644 --- a/sdk/bundled_program/core.py +++ b/devtools/bundled_program/core.py @@ -8,19 +8,19 @@ import typing from typing import Dict, List, Optional, Sequence, Type, Union -import executorch.exir.schema as core_schema +import executorch.devtools.bundled_program.schema as bp_schema -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.exir.schema as core_schema import torch import torch.fx +from executorch.devtools.bundled_program.config import ConfigValue, MethodTestSuite + +from executorch.devtools.bundled_program.version import BUNDLED_PROGRAM_SCHEMA_VERSION from executorch.exir import ExecutorchProgram, ExecutorchProgramManager from executorch.exir._serialize import _serialize_pte_binary from executorch.exir.tensor import get_scalar_type, scalar_type_enum, TensorSpec -from executorch.sdk.bundled_program.config import ConfigValue, MethodTestSuite - -from executorch.sdk.bundled_program.version import BUNDLED_PROGRAM_SCHEMA_VERSION # pyre-ignore supported_program_type_table: Dict[Type[core_schema.KernelTypes], ConfigValue] = { @@ -230,7 +230,7 @@ def _assert_valid_bundle( Other checks not related to correspondence are done in config.py Args: - program: The program to be bundled. + executorch_program: The program to be bundled. method_test_suites: The testcases for specific methods to be bundled. """ diff --git a/sdk/bundled_program/schema/README.md b/devtools/bundled_program/schema/README.md similarity index 100% rename from sdk/bundled_program/schema/README.md rename to devtools/bundled_program/schema/README.md diff --git a/devtools/bundled_program/schema/TARGETS b/devtools/bundled_program/schema/TARGETS new file mode 100644 index 0000000000..51c004cbec --- /dev/null +++ b/devtools/bundled_program/schema/TARGETS @@ -0,0 +1,24 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. 
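As context for the config.py and core.py changes above, a hedged sketch of how a BundledProgram is assembled from test cases; the MethodTestSuite keyword names and the to_executorch() step are assumptions inferred from the imports shown in core.py, not verified against this diff:

    import torch
    from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
    from executorch.devtools.bundled_program.core import BundledProgram
    from executorch.exir import to_edge

    class AddOne(torch.nn.Module):
        def forward(self, x):
            return x + 1

    model = AddOne()
    sample_inputs = (torch.ones(2, 2),)

    # ExecutorchProgramManager produced through the usual export path.
    program = to_edge(torch.export.export(model, sample_inputs)).to_executorch()

    # One MethodTestSuite per method; each MethodTestCase pairs inputs with the
    # eager model's output, which the runtime later verifies against.
    suites = [
        MethodTestSuite(
            method_name="forward",
            test_cases=[
                MethodTestCase(inputs=sample_inputs, expected_outputs=model(*sample_inputs))
            ],
        )
    ]
    bundled_program = BundledProgram(program, suites)
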
+ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() + +runtime.python_library( + name = "bundled_program_schema_py", + srcs = [ + "__init__.py", + "bundled_program_schema.py", + ], + visibility = [ + "//executorch/devtools/bundled_program/...", + "//executorch/devtools/etrecord/...", + ], + deps = [ + "//executorch/exir:scalar_type", + ], +) diff --git a/sdk/bundled_program/schema/__init__.py b/devtools/bundled_program/schema/__init__.py similarity index 100% rename from sdk/bundled_program/schema/__init__.py rename to devtools/bundled_program/schema/__init__.py diff --git a/sdk/bundled_program/schema/bundled_program_schema.fbs b/devtools/bundled_program/schema/bundled_program_schema.fbs similarity index 100% rename from sdk/bundled_program/schema/bundled_program_schema.fbs rename to devtools/bundled_program/schema/bundled_program_schema.fbs diff --git a/sdk/bundled_program/schema/bundled_program_schema.py b/devtools/bundled_program/schema/bundled_program_schema.py similarity index 100% rename from sdk/bundled_program/schema/bundled_program_schema.py rename to devtools/bundled_program/schema/bundled_program_schema.py diff --git a/sdk/bundled_program/schema/scalar_type.fbs b/devtools/bundled_program/schema/scalar_type.fbs similarity index 100% rename from sdk/bundled_program/schema/scalar_type.fbs rename to devtools/bundled_program/schema/scalar_type.fbs diff --git a/devtools/bundled_program/schema/targets.bzl b/devtools/bundled_program/schema/targets.bzl new file mode 100644 index 0000000000..532a01e039 --- /dev/null +++ b/devtools/bundled_program/schema/targets.bzl @@ -0,0 +1,83 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +BUNLDED_STEM = "bundled_program_schema" +SCALAR_TYPE_STEM = "scalar_type" + +INPUT_BUNDLED = BUNLDED_STEM + ".fbs" +INPUT_SCALAR_TYPE = SCALAR_TYPE_STEM + ".fbs" + +OUTPUT_BUNDLED_HEADER = BUNLDED_STEM + "_generated.h" +OUTPUT_SCALAR_TYPE_HEADER = SCALAR_TYPE_STEM + "_generated.h" + +BUNDLED_GEN_RULE_NAME = "generate_bundled_program" + +BUNDLED_LIBRARY_NAME = BUNLDED_STEM + "_fbs" + +def _generate_schema_header(rule_name, srcs, headers, default_header): + """Generate header file given flatbuffer schema + """ + runtime.genrule( + name = rule_name, + srcs = srcs, + # We're only generating a single file, so it seems like we could use + # `out`, but `flatc` takes a directory as a parameter, not a single + # file. Use `outs` so that `${OUT}` is expanded as the containing + # directory instead of the file itself. + outs = {header: [header] for header in headers}, + default_outs = [default_header], + cmd = " ".join([ + "$(exe {})".format(runtime.external_dep_location("flatc")), + "--cpp", + "--cpp-std c++11", + "--gen-mutable", + "--scoped-enums", + "-o ${OUT}", + "${SRCS}", + # Let our infra know that the file was generated. + " ".join(["&& echo // @" + "generated >> ${OUT}/" + header for header in headers]), + ]), + visibility = [], # Private + ) + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + + runtime.export_file( + name = INPUT_BUNDLED, + visibility = [ + "//executorch/devtools/bundled_program/serialize/...", + ], + ) + + runtime.export_file( + name = INPUT_SCALAR_TYPE, + visibility = [ + "//executorch/devtools/bundled_program/serialize/...", + ], + ) + + _generate_schema_header( + BUNDLED_GEN_RULE_NAME, + [INPUT_BUNDLED, INPUT_SCALAR_TYPE], + [OUTPUT_BUNDLED_HEADER, OUTPUT_SCALAR_TYPE_HEADER], + OUTPUT_BUNDLED_HEADER, + ) + + # Header-only library target with the generate bundled program schema header. + runtime.cxx_library( + name = BUNDLED_LIBRARY_NAME, + srcs = [], + visibility = [ + "//executorch/devtools/bundled_program/...", + "//executorch/extension/pybindings/...", + ], + exported_headers = { + OUTPUT_BUNDLED_HEADER: ":{}[{}]".format(BUNDLED_GEN_RULE_NAME, OUTPUT_BUNDLED_HEADER), + OUTPUT_SCALAR_TYPE_HEADER: ":{}[{}]".format(BUNDLED_GEN_RULE_NAME, OUTPUT_SCALAR_TYPE_HEADER), + }, + exported_external_deps = ["flatbuffers-api"], + ) diff --git a/sdk/bundled_program/schema/test/TARGETS b/devtools/bundled_program/schema/test/TARGETS similarity index 100% rename from sdk/bundled_program/schema/test/TARGETS rename to devtools/bundled_program/schema/test/TARGETS diff --git a/sdk/bundled_program/schema/test/test_schema.py b/devtools/bundled_program/schema/test/test_schema.py similarity index 79% rename from sdk/bundled_program/schema/test/test_schema.py rename to devtools/bundled_program/schema/test/test_schema.py index ab3d2760d2..c2a19adef7 100644 --- a/sdk/bundled_program/schema/test/test_schema.py +++ b/devtools/bundled_program/schema/test/test_schema.py @@ -20,8 +20,8 @@ def test_schema_sync(self) -> None: self.assertTrue( filecmp.cmp( - prefix + "sdk/bundled_program/schema/scalar_type.fbs", + prefix + "devtools/bundled_program/schema/scalar_type.fbs", prefix + "schema/scalar_type.fbs", ), - 'Please run "hg cp fbcode//executorch/schema/scalar_type.fbs fbcode//executorch/sdk/bundled_program/schema/scalar_type.fbs" to sync schema changes.', + 'Please run "hg cp fbcode//executorch/schema/scalar_type.fbs fbcode//executorch/devtools/bundled_program/schema/scalar_type.fbs" to sync schema changes.', ) diff --git a/devtools/bundled_program/serialize/TARGETS b/devtools/bundled_program/serialize/TARGETS new file mode 100644 index 0000000000..11c5839977 --- /dev/null +++ b/devtools/bundled_program/serialize/TARGETS @@ -0,0 +1,37 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +# Use runtime.python_library instead of the one defined in python_library.bzl, +# so we can have access to EXECUTORCH_CLIENTS list. +runtime.python_library( + name = "lib", + srcs = [ + "__init__.py", + ], + resources = { + "//executorch/devtools/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs", + "//executorch/devtools/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs", + }, + # Currently serialization API should only be used in some dedicated targets, + # to avoid ODR violation when linking with another Flatbuffers library. + # Please ask before changing this. 
+ visibility = [ + "//executorch/bacends/...", + "//executorch/backends/xnnpack/test/...", + "//executorch/codegen/...", + "//executorch/devtools/bundled_program/tests/...", + "//executorch/examples/async_exec:emit_program_lib", + "//executorch/exir:lib", + "//executorch/extension/pybindings/test:test", + "//executorch/extension/pybindings/test:test-library", + "//executorch/profiler/...", + "//executorch/test/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "fbsource//third-party/pypi/setuptools:setuptools", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", + "//executorch/exir/_serialize:lib", + ], +) diff --git a/devtools/bundled_program/serialize/__init__.py b/devtools/bundled_program/serialize/__init__.py new file mode 100644 index 0000000000..075436e9c1 --- /dev/null +++ b/devtools/bundled_program/serialize/__init__.py @@ -0,0 +1,122 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +# TODO(T138924864): Refactor to unify the serialization for bundled program and executorch program. + +import json +import os +import tempfile + +import executorch.devtools.bundled_program.schema as bp_schema + +# @manual=fbsource//third-party/pypi/setuptools:setuptools +import pkg_resources +from executorch.devtools.bundled_program.core import BundledProgram + +from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass +from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile + +# The prefix of schema files used for bundled program +BUNDLED_PROGRAM_SCHEMA_NAME = "bundled_program_schema" +SCALAR_TYPE_SCHEMA_NAME = "scalar_type" + + +def write_schema(d: str, schema_name: str) -> None: + schema_path = os.path.join(d, "{}.fbs".format(schema_name)) + with open(schema_path, "wb") as schema_file: + schema_file.write( + pkg_resources.resource_string(__name__, "{}.fbs".format(schema_name)) + ) + + +def serialize_from_bundled_program_to_json( + bundled_program: bp_schema.BundledProgram, +) -> str: + return json.dumps(bundled_program, cls=_DataclassEncoder) + + +def deserialize_from_json_to_bundled_program( + program_json: bytes, +) -> bp_schema.BundledProgram: + program_json = json.loads(program_json) + return _json_to_dataclass(program_json, bp_schema.BundledProgram) + + +def convert_to_flatbuffer(program_json: str) -> bytes: + with tempfile.TemporaryDirectory() as d: + # load given and common schema + write_schema(d, BUNDLED_PROGRAM_SCHEMA_NAME) + write_schema(d, SCALAR_TYPE_SCHEMA_NAME) + + schema_path = os.path.join(d, "{}.fbs".format(BUNDLED_PROGRAM_SCHEMA_NAME)) + json_path = os.path.join(d, "{}.json".format(BUNDLED_PROGRAM_SCHEMA_NAME)) + with open(json_path, "wb") as json_file: + json_file.write(program_json.encode("ascii")) + _flatc_compile(d, schema_path, json_path) + output_path = os.path.join(d, "{}.bpte".format(BUNDLED_PROGRAM_SCHEMA_NAME)) + with open(output_path, "rb") as output_file: + return output_file.read() + + +def convert_from_flatbuffer(program_flatbuffer: bytes) -> bytes: + with tempfile.TemporaryDirectory() as d: + write_schema(d, BUNDLED_PROGRAM_SCHEMA_NAME) + write_schema(d, SCALAR_TYPE_SCHEMA_NAME) + + schema_path = os.path.join(d, "{}.fbs".format(BUNDLED_PROGRAM_SCHEMA_NAME)) + bin_path = os.path.join(d, "schema.bin") + with open(bin_path, "wb") as bin_file: + bin_file.write(program_flatbuffer) + _flatc_decompile(d, 
schema_path, bin_path) + output_path = os.path.join(d, "schema.json") + with open(output_path, "rb") as output_file: + return output_file.read() + + +# from bundled program to flatbuffer +def serialize_from_bundled_program_to_flatbuffer( + bundled_program: BundledProgram, +) -> bytes: + """ + Serialize a BundledProgram into FlatBuffer binary format. + + Args: + bundled_program (BundledProgram): The `BundledProgram` variable to be serialized. + + Returns: + The serialized FlatBuffer binary data in bytes. + """ + + bundled_program_in_schema = bundled_program.serialize_to_schema() + + return convert_to_flatbuffer( + serialize_from_bundled_program_to_json(bundled_program_in_schema) + ) + + +# From flatbuffer to bundled program in schema. +# Please notice here the bundled program is the one in our schema (bp_schema.BundledProgram), +# not the bundled program user interact with (core.bundled_program). +# However there're two concerns for current design: +# 1. the misalignment of serialization input and deserialization out, which may confuse our user. +# 2. the mis-exposion of schema.bundled_program. all classes in schema should not directly +# interact with user, but the deserialization api returns one. +# TODO(T170042248): Solve the above issues. +def deserialize_from_flatbuffer_to_bundled_program( + flatbuffer: bytes, +) -> bp_schema.BundledProgram: + """ + Deserialize a FlatBuffer binary format into a BundledProgram. + + Args: + flatbuffer (bytes): The FlatBuffer binary data in bytes. + + Returns: + A `BundledProgram` instance. + """ + return deserialize_from_json_to_bundled_program(convert_from_flatbuffer(flatbuffer)) diff --git a/devtools/bundled_program/serialize/test/TARGETS b/devtools/bundled_program/serialize/test/TARGETS new file mode 100644 index 0000000000..dd92f63f2d --- /dev/null +++ b/devtools/bundled_program/serialize/test/TARGETS @@ -0,0 +1,17 @@ +# @noautodeps + +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "serialize", + srcs = [ + "test_serialize.py", + ], + deps = [ + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools/bundled_program/util:test_util", + ], +) diff --git a/sdk/bundled_program/serialize/test/test_serialize.py b/devtools/bundled_program/serialize/test/test_serialize.py similarity index 82% rename from sdk/bundled_program/serialize/test/test_serialize.py rename to devtools/bundled_program/serialize/test/test_serialize.py index 1db6871fc0..48a914d144 100644 --- a/sdk/bundled_program/serialize/test/test_serialize.py +++ b/devtools/bundled_program/serialize/test/test_serialize.py @@ -8,13 +8,15 @@ import unittest -from executorch.sdk.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.serialize import ( deserialize_from_flatbuffer_to_bundled_program, serialize_from_bundled_program_to_flatbuffer, ) -from executorch.sdk.bundled_program.util.test_util import get_common_executorch_program +from executorch.devtools.bundled_program.util.test_util import ( + get_common_executorch_program, +) class TestSerialize(unittest.TestCase): diff --git a/devtools/bundled_program/targets.bzl b/devtools/bundled_program/targets.bzl new file mode 100644 index 0000000000..7035b3b31f --- /dev/null +++ b/devtools/bundled_program/targets.bzl @@ -0,0 +1,28 @@ 
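Rounding out the serialize package above: a minimal round-trip helper, sketched under the assumption that `BundledProgram` comes from core.py as shown earlier; the `.bpte` filename is only a convention suggested by the temporary-file name in convert_to_flatbuffer:

    from executorch.devtools.bundled_program.core import BundledProgram
    from executorch.devtools.bundled_program.serialize import (
        deserialize_from_flatbuffer_to_bundled_program,
        serialize_from_bundled_program_to_flatbuffer,
    )

    def write_bundled_program(bundled_program: BundledProgram, path: str = "model.bpte") -> bytes:
        # Serialize to flatbuffer bytes and persist them for the runtime to load.
        flat = serialize_from_bundled_program_to_flatbuffer(bundled_program)
        with open(path, "wb") as f:
            f.write(flat)
        return flat

    # Note: deserialization returns the schema-level bp_schema.BundledProgram,
    # not the user-facing core.BundledProgram (see the TODO(T170042248) comment above).
    # schema_bp = deserialize_from_flatbuffer_to_bundled_program(flat_bytes)
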
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + for aten_mode in (True, False): + aten_suffix = ("_aten" if aten_mode else "") + runtime.cxx_library( + name = "runtime" + aten_suffix, + srcs = ["bundled_program.cpp"], + exported_headers = ["bundled_program.h"], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix, + "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs", + ], + exported_deps = [ + "//executorch/runtime/core:memory_allocator", + "//executorch/runtime/executor:program" + aten_suffix, + ], + ) diff --git a/devtools/bundled_program/test/TARGETS b/devtools/bundled_program/test/TARGETS new file mode 100644 index 0000000000..652c74b8f4 --- /dev/null +++ b/devtools/bundled_program/test/TARGETS @@ -0,0 +1,63 @@ +# @noautodeps + +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "bundle_data", + srcs = [ + "test_bundle_data.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", + "//executorch/devtools/bundled_program/util:test_util", + "//executorch/exir/_serialize:lib", + ], +) + +python_unittest( + name = "config", + srcs = [ + "test_config.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program/util:test_util", + "//executorch/extension/pytree:pylib", + ], +) + +python_unittest( + name = "end2end", + srcs = [ + "test_end2end.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools/bundled_program/util:test_util", + "//executorch/exir:dynamic_shape", + "//executorch/exir:lib", + "//executorch/exir:memory", + "//executorch/exir:pass_manager", + "//executorch/exir:print_program", + "//executorch/exir:tensor", + "//executorch/exir/_serialize:lib", + "//executorch/exir/emit:lib", + "//executorch/exir/passes:lib", + "//executorch/exir/tests:control_flow_models", + "//executorch/exir/tests:dynamic_shape_models", + "//executorch/exir/tests:models", + "//executorch/exir/tests:transformer", + "//executorch/extension/pybindings:portable_lib", + "//executorch/extension/pytree:pybindings", + "//executorch/kernels/portable:custom_ops_generated_lib", + ], +) diff --git a/sdk/bundled_program/test/test_bundle_data.py b/devtools/bundled_program/test/test_bundle_data.py similarity index 93% rename from sdk/bundled_program/test/test_bundle_data.py rename to devtools/bundled_program/test/test_bundle_data.py index a8d9485c5f..565539cbf1 100644 --- a/sdk/bundled_program/test/test_bundle_data.py +++ b/devtools/bundled_program/test/test_bundle_data.py @@ -9,13 +9,15 @@ import unittest from typing import List -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.devtools.bundled_program.schema as bp_schema import torch +from executorch.devtools.bundled_program.config import ConfigValue +from executorch.devtools.bundled_program.core 
import BundledProgram +from executorch.devtools.bundled_program.util.test_util import ( + get_common_executorch_program, +) from executorch.exir._serialize import _serialize_pte_binary -from executorch.sdk.bundled_program.config import ConfigValue -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.util.test_util import get_common_executorch_program class TestBundle(unittest.TestCase): diff --git a/sdk/bundled_program/test/test_config.py b/devtools/bundled_program/test/test_config.py similarity index 97% rename from sdk/bundled_program/test/test_config.py rename to devtools/bundled_program/test/test_config.py index 3183ad907f..21f3d48042 100644 --- a/sdk/bundled_program/test/test_config.py +++ b/devtools/bundled_program/test/test_config.py @@ -10,14 +10,14 @@ from typing import get_args, List, Union import torch -from executorch.extension.pytree import tree_flatten -from executorch.sdk.bundled_program.config import DataContainer +from executorch.devtools.bundled_program.config import DataContainer -from executorch.sdk.bundled_program.util.test_util import ( +from executorch.devtools.bundled_program.util.test_util import ( get_random_test_suites, get_random_test_suites_with_eager_model, SampleModel, ) +from executorch.extension.pytree import tree_flatten class TestConfig(unittest.TestCase): diff --git a/sdk/bundled_program/test/test_end2end.py b/devtools/bundled_program/test/test_end2end.py similarity index 88% rename from sdk/bundled_program/test/test_end2end.py rename to devtools/bundled_program/test/test_end2end.py index 99d58ee15c..7cee073be0 100644 --- a/sdk/bundled_program/test/test_end2end.py +++ b/devtools/bundled_program/test/test_end2end.py @@ -21,12 +21,12 @@ import torch -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) -from executorch.sdk.bundled_program.util.test_util import ( +from executorch.devtools.bundled_program.util.test_util import ( get_common_executorch_program, SampleModel, ) @@ -45,7 +45,7 @@ pass try: - from executorch.extension.pybindings.aten_lib import ( + from executorch.extension.pybindings.aten_lib import ( # @manual=//executorch/extension/pybindings:aten_lib _load_bundled_program_from_buffer, _load_for_executorch_from_buffer, _load_for_executorch_from_bundled_program, diff --git a/devtools/bundled_program/util/TARGETS b/devtools/bundled_program/util/TARGETS new file mode 100644 index 0000000000..7d019ce30f --- /dev/null +++ b/devtools/bundled_program/util/TARGETS @@ -0,0 +1,16 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +oncall("executorch") + +python_library( + name = "test_util", + srcs = [ + "test_util.py", + ], + visibility = ["//executorch/devtools/bundled_program/..."], + deps = [ + "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/exir:lib", + ], +) diff --git a/sdk/bundled_program/util/test_util.py b/devtools/bundled_program/util/test_util.py similarity index 99% rename from sdk/bundled_program/util/test_util.py rename to devtools/bundled_program/util/test_util.py index bfea8158ac..505186f3a0 100644 --- a/sdk/bundled_program/util/test_util.py +++ b/devtools/bundled_program/util/test_util.py @@ -10,14 +10,14 @@ from typing import List, Tuple import torch - -from executorch.exir import 
ExecutorchProgramManager, to_edge -from executorch.sdk.bundled_program.config import ( +from executorch.devtools.bundled_program.config import ( MethodInputType, MethodOutputType, MethodTestCase, MethodTestSuite, ) + +from executorch.exir import ExecutorchProgramManager, to_edge from torch.export import export from torch.export.unflatten import _assign_attr, _AttrKind diff --git a/sdk/bundled_program/version.py b/devtools/bundled_program/version.py similarity index 100% rename from sdk/bundled_program/version.py rename to devtools/bundled_program/version.py diff --git a/sdk/debug_format/TARGETS b/devtools/debug_format/TARGETS similarity index 100% rename from sdk/debug_format/TARGETS rename to devtools/debug_format/TARGETS diff --git a/sdk/debug_format/base_schema.py b/devtools/debug_format/base_schema.py similarity index 94% rename from sdk/debug_format/base_schema.py rename to devtools/debug_format/base_schema.py index b987c28874..9b6247051e 100644 --- a/sdk/debug_format/base_schema.py +++ b/devtools/debug_format/base_schema.py @@ -4,8 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + """ -Base Intermediate Representation for Productivity SDK consumers +Base Intermediate Representation for Developer Tools consumers (e.g. TensorBoard, Terminal Debugger) """ diff --git a/sdk/debug_format/et_schema.py b/devtools/debug_format/et_schema.py similarity index 99% rename from sdk/debug_format/et_schema.py rename to devtools/debug_format/et_schema.py index 9a6af4edba..bb15d70abc 100644 --- a/sdk/debug_format/et_schema.py +++ b/devtools/debug_format/et_schema.py @@ -4,8 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + """ -Intermediate Representation of ExecuTorch Concepts in Productivity SDK +Intermediate Representation of ExecuTorch Concepts in Developer Tools """ from __future__ import annotations @@ -21,7 +23,7 @@ import torch from executorch import exir -from executorch.sdk.debug_format.base_schema import ( +from executorch.devtools.debug_format.base_schema import ( Node, OperatorGraph, OperatorNode, diff --git a/devtools/etdump/TARGETS b/devtools/etdump/TARGETS new file mode 100644 index 0000000000..7dcc4c1e84 --- /dev/null +++ b/devtools/etdump/TARGETS @@ -0,0 +1,38 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() + +runtime.python_library( + name = "schema_flatcc", + srcs = [ + "schema_flatcc.py", + ], + visibility = [ + "//executorch/devtools/...", + ], + deps = [ + "//executorch/exir:scalar_type", + ], +) + +runtime.python_library( + name = "serialize", + srcs = [ + "serialize.py", + ], + resources = { + "//executorch/devtools/etdump:etdump_schema_flatcc.fbs": "etdump_schema_flatcc.fbs", + "//executorch/schema:scalar_type.fbs": "scalar_type.fbs", + }, + visibility = [ + "//executorch/devtools/...", + ], + deps = [ + "fbsource//third-party/pypi/setuptools:setuptools", + ":schema_flatcc", + "//executorch/exir/_serialize:lib", + ], +) diff --git a/devtools/etdump/emitter.cpp b/devtools/etdump/emitter.cpp new file mode 100644 index 0000000000..653c75cb08 --- /dev/null +++ b/devtools/etdump/emitter.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +#include +#include + +#include + +using executorch::etdump::internal::ETDumpStaticAllocator; + +namespace executorch { +namespace etdump { +namespace internal { + +namespace { + +int allocator_fn( + void* alloc_context, + flatcc_iovec_t* b, + size_t request, + int zero_fill, + int hint) { + void* p; + size_t n; + + ETDumpStaticAllocator* state = + reinterpret_cast(alloc_context); + + // This allocator doesn't support freeing memory. + if (request == 0) { + if (b->iov_base) { + b->iov_base = nullptr; + b->iov_len = 0; + } + return 0; + } + + switch (hint) { + case flatcc_builder_alloc_ds: + n = 256; + break; + case flatcc_builder_alloc_ht: + /* Should be exact size, or space size is just wasted. */ + n = request; + break; + case flatcc_builder_alloc_fs: + n = sizeof(__flatcc_builder_frame_t) * 8; + break; + case flatcc_builder_alloc_us: + n = 64; + break; + case flatcc_builder_alloc_vd: + n = 64; + break; + default: + /* + * We have many small structures - vs stack for tables with few + * elements, and few offset fields in patch log. No need to + * overallocate in case of busy small messages. + */ + n = 32; + break; + } + + while (n < request) { + n *= 2; + } + + if (b->iov_base != nullptr) { + if (request > b->iov_len) { + // We don't support reallocating larger buffers. + if (((uintptr_t)b->iov_base + b->iov_len) == + (uintptr_t)&state->data[state->allocated]) { + if ((state->allocated + n - b->iov_len) > state->data_size) { + return -1; + } + state->allocated += n - b->iov_len; + } else { + if ((state->allocated + n) > state->data_size) { + return -1; + } + memcpy((void*)&state->data[state->allocated], b->iov_base, b->iov_len); + b->iov_base = &state->data[state->allocated]; + state->allocated += n; + } + if (zero_fill) { + memset((uint8_t*)b->iov_base + b->iov_len, 0, n - b->iov_len); + } + b->iov_len = n; + } + + // Ignore request to resize buffers down. + return 0; + } + + if ((state->allocated + n) > state->data_size) { + return -1; + } + + p = &state->data[state->allocated]; + state->allocated += n; + + if (zero_fill) { + memset((void*)p, 0, n); + } + + b->iov_base = p; + b->iov_len = n; + + return 0; +} + +// This emitter implementation emits to a fixed size buffer and will fail if it +// runs out of room on either end. +int emitter_fn( + void* emit_context, + const flatcc_iovec_t* iov, + int iov_count, + flatbuffers_soffset_t offset, + size_t len) { + ETDumpStaticAllocator* E = + reinterpret_cast(emit_context); + uint8_t* p; + + if (offset < 0) { + if (len > E->front_left) { + return -1; + } + E->front_cursor -= len; + E->front_left -= len; + p = E->front_cursor; + } else { + ET_CHECK_MSG( + 0, "Moving the back pointer is currently not supported in ETDump."); + } + + while (iov_count--) { + memcpy(p, iov->iov_base, iov->iov_len); + p += iov->iov_len; + ++iov; + } + + return 0; +} + +} // namespace + +int etdump_flatcc_custom_init( + flatcc_builder_t* builder, + struct ETDumpStaticAllocator* alloc) { + return flatcc_builder_custom_init( + builder, emitter_fn, alloc, allocator_fn, alloc); +} + +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/emitter.h b/devtools/etdump/emitter.h new file mode 100644 index 0000000000..09c1b56aa5 --- /dev/null +++ b/devtools/etdump/emitter.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +typedef struct flatcc_builder flatcc_builder_t; + +namespace executorch { +namespace etdump { +namespace internal { + +int etdump_flatcc_custom_init( + flatcc_builder_t* builder, + internal::ETDumpStaticAllocator* alloc); + +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp new file mode 100644 index 0000000000..4c05bb5ace --- /dev/null +++ b/devtools/etdump/etdump_flatcc.cpp @@ -0,0 +1,635 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using ::exec_aten::Tensor; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::ChainID; +using ::executorch::runtime::DebugHandle; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; + +namespace executorch { +namespace etdump { + +namespace { + +executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type( + exec_aten::ScalarType tensor_scalar_type) { + switch (tensor_scalar_type) { + case exec_aten::ScalarType::Byte: + return executorch_flatbuffer_ScalarType_BYTE; + case exec_aten::ScalarType::Char: + return executorch_flatbuffer_ScalarType_CHAR; + case exec_aten::ScalarType::Short: + return executorch_flatbuffer_ScalarType_SHORT; + case exec_aten::ScalarType::Float: + return executorch_flatbuffer_ScalarType_FLOAT; + case exec_aten::ScalarType::Int: + return executorch_flatbuffer_ScalarType_INT; + case exec_aten::ScalarType::Long: + return executorch_flatbuffer_ScalarType_LONG; + case exec_aten::ScalarType::Double: + return executorch_flatbuffer_ScalarType_DOUBLE; + case exec_aten::ScalarType::Bool: + return executorch_flatbuffer_ScalarType_BOOL; + default: + ET_CHECK_MSG( + 0, + "This ScalarType = %hhd is not yet supported in ETDump", + static_cast(tensor_scalar_type)); + } +} + +etdump_Tensor_ref_t add_tensor_entry( + flatcc_builder_t* builder_, + const exec_aten::Tensor& tensor, + long offset) { + etdump_Tensor_start(builder_); + + etdump_Tensor_scalar_type_add( + builder_, get_flatbuffer_scalar_type(tensor.scalar_type())); + etdump_Tensor_sizes_start(builder_); + + for (auto dim : tensor.sizes()) { + int64_t cast_dim = static_cast(dim); + etdump_Tensor_sizes_push(builder_, &cast_dim); + } + etdump_Tensor_sizes_end(builder_); + + etdump_Tensor_strides_start(builder_); + for (auto dim : tensor.strides()) { + int64_t cast_dim = static_cast(dim); + etdump_Tensor_strides_push(builder_, &cast_dim); + } + etdump_Tensor_strides_end(builder_); + etdump_Tensor_offset_add(builder_, offset); + + return etdump_Tensor_end(builder_); +} + +static uint8_t* alignPointer(void* ptr, size_t alignment) { + intptr_t addr = reinterpret_cast(ptr); + if ((addr & (alignment - 1)) == 0) { + // Already aligned. 
+ return reinterpret_cast(ptr); + } + addr = (addr | (alignment - 1)) + 1; + return reinterpret_cast(addr); +} + +} // namespace + +// Constructor implementation +ETDumpGen::ETDumpGen(Span buffer) { + constexpr size_t max_alloc_buf_size = 128 * 1024; + + // Initialize the flatcc builder_ using the buffer and buffer size. + + if (buffer.data() != nullptr) { + builder_ = (struct flatcc_builder*)alignPointer(buffer.data(), 64); + uintptr_t buffer_with_builder = + (uintptr_t)alignPointer(builder_ + sizeof(struct flatcc_builder), 64); + size_t buffer_size = buffer.size() - + (size_t)(buffer_with_builder - (uintptr_t)buffer.data()); + alloc_.set_buffer( + (uint8_t*)buffer_with_builder, + buffer_size, + (size_t)((buffer_size / 4 > max_alloc_buf_size) ? max_alloc_buf_size + : buffer_size / 4)); + internal::etdump_flatcc_custom_init(builder_, &alloc_); + } else { + builder_ = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); + ET_CHECK_MSG( + builder_ != nullptr, "Failed to allocate memory for flatcc builder_."); + flatcc_builder_init(builder_); + } + reset(); +} + +ETDumpGen::~ETDumpGen() { + flatcc_builder_clear(builder_); + if (!is_static_etdump()) { + free(builder_); + } +} + +void ETDumpGen::reset() { + state_ = State::Init; + num_blocks_ = 0; + flatcc_builder_reset(builder_); + flatbuffers_buffer_start(builder_, etdump_ETDump_file_identifier); + etdump_ETDump_start_as_root_with_size(builder_); + etdump_ETDump_version_add(builder_, ETDUMP_VERSION); + etdump_ETDump_run_data_start(builder_); + etdump_ETDump_run_data_push_start(builder_); +} + +void ETDumpGen::create_event_block(const char* name) { + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::Done) { + reset(); + } + if (num_blocks_ > 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_push_start(builder_); + } + ++num_blocks_; + etdump_RunData_name_create_strn(builder_, name, strlen(name)); + if (bundled_input_index_ != -1) { + etdump_RunData_bundled_input_index_add(builder_, bundled_input_index_); + } + state_ = State::BlockCreated; +} + +int64_t ETDumpGen::create_string_entry(const char* name) { + return flatbuffers_string_create_str(builder_, name); +} + +// ETDumpGen has the following possible states, ETDumpGen_Init, +// ETDumpGen_Block_Created, ETDumpGen_Adding_Allocators, +// ETDumpGen_Adding_Events. Right after boot-up the state of ETDump will be +// ETDumpGen_Init. At this point we have an option of adding allocators that +// we want to track. Once we've completed adding the allocators we want to track +// we will close the allocators table and move ETDumpGen to the +// ETDumpGen_Adding_Events state. After this point we can start adding events to +// ETDump as we wish. +// The reason we need to maintain this state machine inside of ETDumpGen is +// because, once a table of one type has been closed and another table of a +// different type is opened after it we cannot open another table of the first +// type again. In this case once we close the allocators table and start pushing +// to the events table we cannot push to the allocators table again. +void ETDumpGen::check_ready_to_add_events() { + if (state_ != State::AddingEvents) { + ET_CHECK_MSG( + (state_ == State::AddingAllocators || state_ == State::BlockCreated), + "ETDumpGen in an invalid state. 
Cannot add new events now."); + if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); + } + etdump_RunData_events_start(builder_); + state_ = State::AddingEvents; + } +} + +EventTracerEntry ETDumpGen::start_profiling( + const char* name, + ChainID chain_id, + DebugHandle debug_handle) { + EventTracerEntry prof_entry; + prof_entry.event_id = name != nullptr ? create_string_entry(name) : -1; + prof_entry.delegate_event_id_type = DelegateDebugIdType::kNone; + + if (chain_id == -1) { + prof_entry.chain_id = chain_id_; + prof_entry.debug_handle = debug_handle_; + } else { + prof_entry.chain_id = chain_id; + prof_entry.debug_handle = debug_handle; + } + prof_entry.start_time = et_pal_current_ticks(); + return prof_entry; +} + +// TODO: Update all occurrences of the ProfileEvent calls once the +// EventTracerEntry struct is updated. +EventTracerEntry ETDumpGen::start_profiling_delegate( + const char* name, + DebugHandle delegate_debug_index) { + ET_CHECK_MSG( + (name == nullptr) ^ (delegate_debug_index == -1), + "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); + check_ready_to_add_events(); + EventTracerEntry prof_entry; + DelegateDebugIdType delegate_event_id_type = + name == nullptr ? DelegateDebugIdType::kInt : DelegateDebugIdType::kStr; + prof_entry.delegate_event_id_type = delegate_event_id_type; + prof_entry.chain_id = chain_id_; + prof_entry.debug_handle = debug_handle_; + prof_entry.event_id = delegate_debug_index == static_cast(-1) + ? create_string_entry(name) + : delegate_debug_index; + prof_entry.start_time = et_pal_current_ticks(); + return prof_entry; +} + +void ETDumpGen::end_profiling_delegate( + EventTracerEntry event_tracer_entry, + const void* metadata, + size_t metadata_len) { + et_timestamp_t end_time = et_pal_current_ticks(); + check_ready_to_add_events(); + + // Start building the ProfileEvent entry. + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, event_tracer_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); + // Delegate debug identifier can either be of a string type or an integer + // type. If it's a string type then it's a value of type + // flatbuffers_string_ref_t type, whereas if it's an integer type then we + // write the integer value directly. + if (event_tracer_entry.delegate_event_id_type == DelegateDebugIdType::kInt) { + etdump_ProfileEvent_delegate_debug_id_int_add( + builder_, event_tracer_entry.event_id); + } else { + etdump_ProfileEvent_delegate_debug_id_str_add( + builder_, event_tracer_entry.event_id); + } + flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); +} + +void ETDumpGen::log_profiling_delegate( + const char* name, + DebugHandle delegate_debug_index, + et_timestamp_t start_time, + et_timestamp_t end_time, + const void* metadata, + size_t metadata_len) { + ET_CHECK_MSG( + (name == nullptr) ^ (delegate_debug_index == -1), + "Only name or delegate_debug_index can be valid. 
Check DelegateMappingBuilder documentation for more details."); + check_ready_to_add_events(); + int64_t string_id = name != nullptr ? create_string_entry(name) : -1; + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); + if (string_id == -1) { + etdump_ProfileEvent_delegate_debug_id_int_add( + builder_, delegate_debug_index); + } else { + etdump_ProfileEvent_delegate_debug_id_str_add(builder_, string_id); + } + flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); +} + +void ETDumpGen::log_intermediate_output_delegate( + const char* name, + DebugHandle delegate_debug_index, + const Tensor& output) { + log_intermediate_output_delegate_helper(name, delegate_debug_index, output); +} + +void ETDumpGen::log_intermediate_output_delegate( + const char* name, + DebugHandle delegate_debug_index, + const ArrayRef output) { + log_intermediate_output_delegate_helper(name, delegate_debug_index, output); +} + +void ETDumpGen::log_intermediate_output_delegate( + const char* name, + DebugHandle delegate_debug_index, + const int& output) { + log_intermediate_output_delegate_helper(name, delegate_debug_index, output); +} + +void ETDumpGen::log_intermediate_output_delegate( + const char* name, + DebugHandle delegate_debug_index, + const bool& output) { + log_intermediate_output_delegate_helper(name, delegate_debug_index, output); +} + +void ETDumpGen::log_intermediate_output_delegate( + const char* name, + DebugHandle delegate_debug_index, + const double& output) { + log_intermediate_output_delegate_helper(name, delegate_debug_index, output); +} + +template +void ETDumpGen::log_intermediate_output_delegate_helper( + const char* name, + DebugHandle delegate_debug_index, + const T& output) { + ET_CHECK_MSG( + (name == nullptr) ^ (delegate_debug_index == -1), + "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); + if (debug_buffer_.empty()) { + ET_CHECK_MSG(0, "Must pre-set debug buffer with set_debug_buffer()\n"); + return; + } + + check_ready_to_add_events(); + int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; + + etdump_DebugEvent_start(builder_); + + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); + if (string_id == -1) { + etdump_DebugEvent_delegate_debug_id_int_add(builder_, delegate_debug_index); + } else { + etdump_DebugEvent_delegate_debug_id_str_add(builder_, string_id); + } + + // Check the type of `output` then call the corresponding logging functions + if constexpr (std::is_same<T, Tensor>::value) { + long offset = copy_tensor_to_debug_buffer(output); + etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder_, output, offset); + + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); + + } else if constexpr (std::is_same<T, ArrayRef<Tensor>>::value) { + etdump_Tensor_vec_start(builder_); + for (size_t i = 0; i < output.size(); ++i) { + long offset = copy_tensor_to_debug_buffer(output[i]); + etdump_Tensor_vec_push( + builder_, add_tensor_entry(builder_, output[i], offset)); + } + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); + etdump_TensorList_ref_t tensor_list_ref = + etdump_TensorList_create(builder_, tensor_vec_ref); + + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); + } else if constexpr (std::is_same<T, int>::value) { + auto int_ref = etdump_Int_create(builder_, output); + + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); + } else if constexpr (std::is_same<T, double>::value) { + auto double_ref = etdump_Double_create(builder_, output); + + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); + } else if constexpr (std::is_same<T, bool>::value) { + flatbuffers_bool_t flatbuffer_bool_val = + output ?
FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); + + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); + } else { + ET_CHECK_MSG(0, "Unsupported output type for intermediate logging\n"); + } + + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); + + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); + + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); +} + +void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { + et_timestamp_t end_time = et_pal_current_ticks(); + ET_CHECK_MSG( + prof_entry.delegate_event_id_type == DelegateDebugIdType::kNone, + "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); + check_ready_to_add_events(); + + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, prof_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, prof_entry.chain_id); + etdump_ProfileEvent_instruction_id_add(builder_, prof_entry.debug_handle); + if (prof_entry.event_id != -1) { + etdump_ProfileEvent_name_add(builder_, prof_entry.event_id); + } + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); +} + +AllocatorID ETDumpGen::track_allocator(const char* name) { + ET_CHECK_MSG( + (state_ == State::BlockCreated || state_ == State::AddingAllocators), + "Allocators can only be added immediately after a new block is created and before any events are added."); + if (state_ != State::AddingAllocators) { + etdump_RunData_allocators_start(builder_); + state_ = State::AddingAllocators; + } + flatbuffers_string_ref_t ref = create_string_entry(name); + etdump_RunData_allocators_push_create(builder_, ref); + return etdump_RunData_allocators_reserved_len(builder_); +} + +void ETDumpGen::track_allocation( + AllocatorID allocator_id, + size_t allocation_size) { + check_ready_to_add_events(); + + etdump_RunData_events_push_start(builder_); + etdump_Event_allocation_event_create(builder_, allocator_id, allocation_size); + etdump_RunData_events_push_end(builder_); +} + +ETDumpResult ETDumpGen::get_etdump_data() { + ETDumpResult result; + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); + } else if (state_ == State::Init) { + result.buf = nullptr; + result.size = 0; + return result; + } + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_end(builder_); + etdump_ETDump_ref_t root = etdump_ETDump_end(builder_); + flatbuffers_buffer_end(builder_, root); + if (num_blocks_ == 0) { + result = {nullptr, 0}; + } else { + if (alloc_.data) { + result.buf = alloc_.front_cursor; + result.size = alloc_.out_size - alloc_.front_left; + } else { + result.buf = + flatcc_builder_finalize_aligned_buffer(builder_, &result.size); + } + } + state_ = State::Done; + return result; +} + +void ETDumpGen::set_debug_buffer(Span buffer) { + debug_buffer_ = buffer; +} + +size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { + if (tensor.nbytes() == 0) { + return static_cast(-1); + } + 
uint8_t* offset_ptr = + alignPointer(debug_buffer_.data() + debug_buffer_offset_, 64); + debug_buffer_offset_ = (offset_ptr - debug_buffer_.data()) + tensor.nbytes(); + ET_CHECK_MSG( + debug_buffer_offset_ <= debug_buffer_.size(), + "Ran out of space to store intermediate outputs."); + memcpy(offset_ptr, tensor.const_data_ptr(), tensor.nbytes()); + return (size_t)(offset_ptr - debug_buffer_.data()); +} + +void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { + if (debug_buffer_.empty()) { + return; + } + + check_ready_to_add_events(); + + etdump_DebugEvent_start(builder_); + + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); + + switch (evalue.tag) { + case Tag::Tensor: { + exec_aten::Tensor tensor = evalue.toTensor(); + long offset = copy_tensor_to_debug_buffer(tensor); + etdump_Tensor_ref_t tensor_ref = + add_tensor_entry(builder_, tensor, offset); + + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); + if (evalue_type == LoggedEValueType::kProgramOutput) { + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); + } + auto value_ref = etdump_Value_end(builder_); + + etdump_DebugEvent_debug_entry_add(builder_, value_ref); + break; + } + + case Tag::ListTensor: { + exec_aten::ArrayRef tensors = evalue.toTensorList(); + etdump_Tensor_vec_start(builder_); + for (size_t i = 0; i < tensors.size(); ++i) { + long offset = copy_tensor_to_debug_buffer(tensors[i]); + etdump_Tensor_vec_push( + builder_, add_tensor_entry(builder_, tensors[i], offset)); + } + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); + etdump_TensorList_ref_t tensor_list_ref = + etdump_TensorList_create(builder_, tensor_vec_ref); + + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); + if (evalue_type == LoggedEValueType::kProgramOutput) { + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); + } + auto value_ref = etdump_Value_end(builder_); + + etdump_DebugEvent_debug_entry_add(builder_, value_ref); + break; + } + + case Tag::Int: { + int64_t val = evalue.toInt(); + auto int_ref = etdump_Int_create(builder_, val); + + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); + + break; + } + + case Tag::Double: { + double val = evalue.toDouble(); + auto double_ref = etdump_Double_create(builder_, val); + + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); + + break; + } + + case Tag::Bool: { + flatbuffers_bool_t flatbuffer_bool_val = + evalue.toBool() ? 
FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); + + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); + + break; + } + + default: + ET_CHECK_MSG( + 0, + "This EValue type = %d is not yet supported for logging\n", + static_cast(evalue.tag)); + break; + } + + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); + + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); +} + +size_t ETDumpGen::get_num_blocks() { + return num_blocks_; +} + +bool ETDumpGen::is_static_etdump() { + return alloc_.data != nullptr; +} + +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h new file mode 100644 index 0000000000..0bd891a097 --- /dev/null +++ b/devtools/etdump/etdump_flatcc.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include + +#define ETDUMP_VERSION 0 + +struct flatcc_builder; + +namespace executorch { +namespace etdump { + +namespace internal { +struct ETDumpStaticAllocator { + ETDumpStaticAllocator() = default; + + void + set_buffer(uint8_t* buffer, size_t total_buf_size, size_t alloc_buf_size) { + data = buffer; + data_size = alloc_buf_size; + allocated = 0; + out_size = total_buf_size - alloc_buf_size; + front_cursor = &buffer[alloc_buf_size]; + front_left = out_size / 2; + } + + // Pointer to backing buffer to allocate from. + uint8_t* data{nullptr}; + + // Size of backing buffer. + size_t data_size{0}; + + // Current allocation offset. + size_t allocated{0}; + + // Size of build buffer. + size_t out_size{0}; + + // Pointer to front of build buffer. + uint8_t* front_cursor{nullptr}; + + // Bytes left in front of front_cursor. 
+ size_t front_left{0}; +}; +} // namespace internal + +struct ETDumpResult { + void* buf; + size_t size; +}; + +class ETDumpGen : public ::executorch::runtime::EventTracer { + public: + ETDumpGen(::executorch::runtime::Span buffer = {nullptr, (size_t)0}); + ~ETDumpGen() override; + void clear_builder(); + + void create_event_block(const char* name) override; + virtual ::executorch::runtime::EventTracerEntry start_profiling( + const char* name, + ::executorch::runtime::ChainID chain_id = -1, + ::executorch::runtime::DebugHandle debug_handle = 0) override; + virtual void end_profiling( + ::executorch::runtime::EventTracerEntry prof_entry) override; + virtual ::executorch::runtime::EventTracerEntry start_profiling_delegate( + const char* name, + ::executorch::runtime::DebugHandle delegate_debug_index) override; + virtual void end_profiling_delegate( + ::executorch::runtime::EventTracerEntry prof_entry, + const void* metadata, + size_t metadata_len) override; + virtual void log_profiling_delegate( + const char* name, + ::executorch::runtime::DebugHandle delegate_debug_index, + et_timestamp_t start_time, + et_timestamp_t end_time, + const void* metadata, + size_t metadata_len) override; + virtual void track_allocation( + ::executorch::runtime::AllocatorID id, + size_t size) override; + virtual ::executorch::runtime::AllocatorID track_allocator( + const char* name) override; + virtual void log_evalue( + const ::executorch::runtime::EValue& evalue, + ::executorch::runtime::LoggedEValueType evalue_type = + ::executorch::runtime::LoggedEValueType::kIntermediateOutput) + override; + /** + * Log an intermediate tensor output from a delegate. + */ + virtual void log_intermediate_output_delegate( + const char* name, + ::executorch::runtime::DebugHandle delegate_debug_index, + const exec_aten::Tensor& output) override; + + /** + * Log an intermediate tensor array output from a delegate. + */ + virtual void log_intermediate_output_delegate( + const char* name, + ::executorch::runtime::DebugHandle delegate_debug_index, + const ::executorch::runtime::ArrayRef output) override; + + /** + * Log an intermediate int output from a delegate. + */ + virtual void log_intermediate_output_delegate( + const char* name, + ::executorch::runtime::DebugHandle delegate_debug_index, + const int& output) override; + + /** + * Log an intermediate bool output from a delegate. + */ + virtual void log_intermediate_output_delegate( + const char* name, + ::executorch::runtime::DebugHandle delegate_debug_index, + const bool& output) override; + + /** + * Log an intermediate double output from a delegate. + */ + virtual void log_intermediate_output_delegate( + const char* name, + ::executorch::runtime::DebugHandle delegate_debug_index, + const double& output) override; + void set_debug_buffer(::executorch::runtime::Span buffer); + ETDumpResult get_etdump_data(); + size_t get_num_blocks(); + bool is_static_etdump(); + void reset(); + + private: + enum class State { + Init, + BlockCreated, + AddingAllocators, + AddingEvents, + Done, + }; + + void check_ready_to_add_events(); + int64_t create_string_entry(const char* name); + size_t copy_tensor_to_debug_buffer(exec_aten::Tensor tensor); + + /** + * Templated helper function used to log various types of intermediate output. + * Supported types include tensor, tensor array, int, bool and double. 
+ */ + template + void log_intermediate_output_delegate_helper( + const char* name, + ::executorch::runtime::DebugHandle delegate_debug_index, + const T& output); + + struct flatcc_builder* builder_; + size_t num_blocks_ = 0; + ::executorch::runtime::Span debug_buffer_; + size_t debug_buffer_offset_ = 0; + int bundled_input_index_ = -1; + State state_ = State::Init; + struct internal::ETDumpStaticAllocator alloc_; +}; + +} // namespace etdump +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using etdump_result = ::executorch::etdump::ETDumpResult; +using ::executorch::etdump::ETDumpGen; +} // namespace executor +} // namespace torch diff --git a/sdk/etdump/etdump_schema_flatcc.fbs b/devtools/etdump/etdump_schema_flatcc.fbs similarity index 96% rename from sdk/etdump/etdump_schema_flatcc.fbs rename to devtools/etdump/etdump_schema_flatcc.fbs index d90d278f5f..1244ebd4ae 100644 --- a/sdk/etdump/etdump_schema_flatcc.fbs +++ b/devtools/etdump/etdump_schema_flatcc.fbs @@ -76,6 +76,10 @@ table DebugEvent { // String based delegate debug identifier. delegate_debug_id_str:string; + + // Name assigned to this debug event by the runtime. If it is an operator + // call this will just be the name of the operator that was executed. + name:string; } // All the details pertaining to an allocation done in the runtime. The main diff --git a/sdk/etdump/scalar_type.fbs b/devtools/etdump/scalar_type.fbs similarity index 97% rename from sdk/etdump/scalar_type.fbs rename to devtools/etdump/scalar_type.fbs index fdfe550e9e..a8da080c67 100644 --- a/sdk/etdump/scalar_type.fbs +++ b/devtools/etdump/scalar_type.fbs @@ -14,6 +14,7 @@ enum ScalarType : byte { SHORT = 2, INT = 3, LONG = 4, + HALF = 5, FLOAT = 6, DOUBLE = 7, BOOL = 11, @@ -24,7 +25,6 @@ enum ScalarType : byte { QUINT4X2 = 16, QUINT2X4 = 17, // Types currently not implemented. - // Half = 5, // COMPLEXHALF = 8, // COMPLEXFLOAT = 9, // COMPLEXDOUBLE = 10, diff --git a/sdk/etdump/schema_flatcc.py b/devtools/etdump/schema_flatcc.py similarity index 96% rename from sdk/etdump/schema_flatcc.py rename to devtools/etdump/schema_flatcc.py index eaad876a53..404fa1c975 100644 --- a/sdk/etdump/schema_flatcc.py +++ b/devtools/etdump/schema_flatcc.py @@ -7,7 +7,7 @@ # pyre-strict """ This file is the python representation of the schema contained in -executorch/sdk/etdump/etdump_schema.fbs. Any changes made to that +executorch/devtools/etdump/etdump_schema.fbs. Any changes made to that flatbuffer schema should accordingly be reflected here also. 
""" @@ -93,6 +93,7 @@ class Value: @dataclass class DebugEvent: + name: Optional[str] chain_index: int instruction_id: int delegate_debug_id_int: Optional[int] diff --git a/sdk/etdump/serialize.py b/devtools/etdump/serialize.py similarity index 98% rename from sdk/etdump/serialize.py rename to devtools/etdump/serialize.py index 0cc6682bfc..4ed63bc385 100644 --- a/sdk/etdump/serialize.py +++ b/devtools/etdump/serialize.py @@ -11,11 +11,11 @@ import tempfile import pkg_resources +from executorch.devtools.etdump.schema_flatcc import ETDumpFlatCC from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile -from executorch.sdk.etdump.schema_flatcc import ETDumpFlatCC # The prefix of schema files used for etdump ETDUMP_FLATCC_SCHEMA_NAME = "etdump_schema_flatcc" diff --git a/devtools/etdump/targets.bzl b/devtools/etdump/targets.bzl new file mode 100644 index 0000000000..ddbb35eab7 --- /dev/null +++ b/devtools/etdump/targets.bzl @@ -0,0 +1,116 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +SCALAR_TYPE_STEM = "scalar_type" +SCALAR_TYPE = SCALAR_TYPE_STEM + ".fbs" + +# flatcc +ETDUMP_STEM_FLATCC = "etdump_schema_flatcc" +ETDUMP_SCHEMA_FLATCC = ETDUMP_STEM_FLATCC + ".fbs" +ETDUMP_GEN_RULE_NAME_FLATCC = "generate_etdump" + "_flatcc" + +ETDUMP_SCHEMA_FLATCC_BUILDER = ETDUMP_STEM_FLATCC + "_builder.h" +ETDUMP_SCHEMA_FLATCC_READER = ETDUMP_STEM_FLATCC + "_reader.h" +ETDUMP_SCHEMA_FLATCC_VERIFIER = ETDUMP_STEM_FLATCC + "_verifier.h" + +SCALAR_TYPE_BUILDER = SCALAR_TYPE_STEM + "_builder.h" +SCALAR_TYPE_READER = SCALAR_TYPE_STEM + "_reader.h" +SCALAR_TYPE_VERIFIER = SCALAR_TYPE_STEM + "_verifier.h" + +FLATBUFFERS_COMMON_STEM = "flatbuffers_common" +FLATBUFFERS_COMMON_BUILDER = FLATBUFFERS_COMMON_STEM + "_builder.h" +FLATBUFFERS_COMMON_READER = FLATBUFFERS_COMMON_STEM + "_reader.h" + +def generate_schema_header_flatcc(rule_name, srcs, headers, default_headers): + """ + Generate header files for ETDump schema + """ + runtime.genrule( + name = rule_name, + srcs = srcs, + outs = {header: [header] for header in headers}, + default_outs = default_headers, + cmd = " ".join([ + "$(exe {})".format(runtime.external_dep_location("flatcc-cli")), + "-cwr", + "-o ${OUT}", + "${SRCS}", + # Let our infra know that the file was generated. + " ".join(["&& echo // @" + "generated >> ${OUT}/" + header for header in headers]), + ]), + ) + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + runtime.export_file( + name = ETDUMP_SCHEMA_FLATCC, + visibility = ["@EXECUTORCH_CLIENTS"], + ) + + generate_schema_header_flatcc( + ETDUMP_GEN_RULE_NAME_FLATCC, + [ETDUMP_SCHEMA_FLATCC, SCALAR_TYPE], + [ + ETDUMP_SCHEMA_FLATCC_BUILDER, + ETDUMP_SCHEMA_FLATCC_READER, + ETDUMP_SCHEMA_FLATCC_VERIFIER, + SCALAR_TYPE_BUILDER, + SCALAR_TYPE_READER, + SCALAR_TYPE_VERIFIER, + FLATBUFFERS_COMMON_BUILDER, + FLATBUFFERS_COMMON_READER, + ], + [ + ETDUMP_SCHEMA_FLATCC_BUILDER, + ETDUMP_SCHEMA_FLATCC_READER, + ETDUMP_SCHEMA_FLATCC_VERIFIER, + ], + ) + + runtime.cxx_library( + name = ETDUMP_STEM_FLATCC, + srcs = [], + visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"], + exported_headers = { + ETDUMP_SCHEMA_FLATCC_BUILDER: ":{}[{}]".format(ETDUMP_GEN_RULE_NAME_FLATCC, ETDUMP_SCHEMA_FLATCC_BUILDER), + ETDUMP_SCHEMA_FLATCC_READER: ":{}[{}]".format(ETDUMP_GEN_RULE_NAME_FLATCC, ETDUMP_SCHEMA_FLATCC_READER), + ETDUMP_SCHEMA_FLATCC_VERIFIER: ":{}[{}]".format(ETDUMP_GEN_RULE_NAME_FLATCC, ETDUMP_SCHEMA_FLATCC_VERIFIER), + SCALAR_TYPE_BUILDER: ":{}[{}]".format(ETDUMP_GEN_RULE_NAME_FLATCC, SCALAR_TYPE_BUILDER), + SCALAR_TYPE_READER: ":{}[{}]".format(ETDUMP_GEN_RULE_NAME_FLATCC, SCALAR_TYPE_READER), + SCALAR_TYPE_VERIFIER: ":{}[{}]".format(ETDUMP_GEN_RULE_NAME_FLATCC, SCALAR_TYPE_VERIFIER), + FLATBUFFERS_COMMON_BUILDER: ":{}[{}]".format(ETDUMP_GEN_RULE_NAME_FLATCC, FLATBUFFERS_COMMON_BUILDER), + FLATBUFFERS_COMMON_READER: ":{}[{}]".format(ETDUMP_GEN_RULE_NAME_FLATCC, FLATBUFFERS_COMMON_READER), + }, + exported_external_deps = ["flatccrt"], + ) + + for aten_mode in (True, False): + aten_suffix = "_aten" if aten_mode else "" + runtime.cxx_library( + name = "etdump_flatcc" + aten_suffix, + srcs = [ + "etdump_flatcc.cpp", + "emitter.cpp", + ], + headers = [ + "emitter.h", + ], + exported_headers = [ + "etdump_flatcc.h", + ], + deps = [ + "//executorch/runtime/platform:platform", + ], + exported_deps = [ + ":etdump_schema_flatcc", + "//executorch/runtime/core:event_tracer" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + ) diff --git a/sdk/etdump/tests/CMakeLists.txt b/devtools/etdump/tests/CMakeLists.txt similarity index 100% rename from sdk/etdump/tests/CMakeLists.txt rename to devtools/etdump/tests/CMakeLists.txt diff --git a/devtools/etdump/tests/TARGETS b/devtools/etdump/tests/TARGETS new file mode 100644 index 0000000000..51e807891d --- /dev/null +++ b/devtools/etdump/tests/TARGETS @@ -0,0 +1,18 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() + +python_unittest( + name = "serialize_test", + srcs = [ + "serialize_test.py", + ], + deps = [ + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etdump:serialize", + "//executorch/exir/_serialize:lib", + ], +) diff --git a/sdk/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp similarity index 94% rename from sdk/etdump/tests/etdump_test.cpp rename to devtools/etdump/tests/etdump_test.cpp index d30bd9a303..b750e21eb0 100644 --- a/sdk/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -9,19 +9,31 @@ #include #include +#include +#include +#include #include #include #include -#include -#include -#include #include #include #include #include -namespace torch { -namespace executor { +using ::exec_aten::ScalarType; +using ::exec_aten::Tensor; +using 
::executorch::etdump::ETDumpGen; +using ::executorch::etdump::ETDumpResult; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::BoxedEvalueList; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; +using ::executorch::runtime::testing::TensorFactory; class ProfilerETDumpTest : public ::testing::Test { protected: @@ -49,7 +61,7 @@ TEST_F(ProfilerETDumpTest, SingleProfileEvent) { EventTracerEntry entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -105,7 +117,7 @@ TEST_F(ProfilerETDumpTest, EmptyBlocks) { etdump_gen[i]->start_profiling("test_event_1", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -160,7 +172,7 @@ TEST_F(ProfilerETDumpTest, AllocationEvents) { TEST_F(ProfilerETDumpTest, DebugEvent) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); etdump_gen[i]->create_event_block("test_block"); @@ -189,7 +201,7 @@ TEST_F(ProfilerETDumpTest, DebugEvent) { TEST_F(ProfilerETDumpTest, DebugEventTensorList) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; exec_aten::Tensor storage[2] = {tf.ones({3, 2}), tf.ones({3, 2})}; EValue evalue_1(storage[0]); EValue evalue_2(storage[1]); @@ -212,7 +224,7 @@ TEST_F(ProfilerETDumpTest, DebugEventTensorList) { } TEST_F(ProfilerETDumpTest, VerifyLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -225,7 +237,7 @@ TEST_F(ProfilerETDumpTest, VerifyLogging) { etdump_gen[i]->log_evalue(evalue); etdump_gen[i]->log_evalue(evalue, LoggedEValueType::kProgramOutput); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -297,7 +309,7 @@ TEST_F(ProfilerETDumpTest, MultipleBlocksWithEvents) { entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -363,7 +375,7 @@ TEST_F(ProfilerETDumpTest, VerifyData) { entry = etdump_gen[i]->start_profiling("test_event2", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -421,7 +433,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { Span buffer((uint8_t*)ptr, 2048); etdump_gen[i]->create_event_block("test_block"); - testing::TensorFactory tf; + TensorFactory tf; ET_EXPECT_DEATH( etdump_gen[i]->log_intermediate_output_delegate( @@ -462,7 +474,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { static_cast(-1), true); - etdump_result 
result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -474,7 +486,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { } TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -492,7 +504,7 @@ TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { etdump_gen[i]->log_intermediate_output_delegate( nullptr, 258, tf.ones({5, 6})); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -603,7 +615,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateEvents) { etdump_gen[i]->end_profiling(entry), "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -681,7 +693,7 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -712,6 +724,3 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { } } } - -} // namespace executor -} // namespace torch diff --git a/sdk/etdump/tests/serialize_test.py b/devtools/etdump/tests/serialize_test.py similarity index 96% rename from sdk/etdump/tests/serialize_test.py rename to devtools/etdump/tests/serialize_test.py index 2b1497f597..5cab3e5b2b 100644 --- a/sdk/etdump/tests/serialize_test.py +++ b/devtools/etdump/tests/serialize_test.py @@ -12,13 +12,13 @@ from pprint import pformat from typing import List -import executorch.sdk.etdump.schema_flatcc as flatcc -from executorch.exir._serialize._dataclass import _DataclassEncoder +import executorch.devtools.etdump.schema_flatcc as flatcc -from executorch.sdk.etdump.serialize import ( +from executorch.devtools.etdump.serialize import ( deserialize_from_etdump_flatcc, serialize_to_etdump_flatcc, ) +from executorch.exir._serialize._dataclass import _DataclassEncoder def diff_jsons(a: str, b: str) -> List[str]: @@ -83,6 +83,7 @@ def get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC: profile_event=None, allocation_event=None, debug_event=flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", diff --git a/devtools/etdump/tests/targets.bzl b/devtools/etdump/tests/targets.bzl new file mode 100644 index 0000000000..5299b7c1cb --- /dev/null +++ b/devtools/etdump/tests/targets.bzl @@ -0,0 +1,21 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + + runtime.cxx_test( + name = "etdump_test", + srcs = [ + "etdump_test.cpp", + ], + deps = [ + "//executorch/devtools/etdump:etdump_flatcc", + "//executorch/devtools/etdump:etdump_schema_flatcc", + "//executorch/runtime/platform:platform", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + ], + ) diff --git a/devtools/etrecord/TARGETS b/devtools/etrecord/TARGETS new file mode 100644 index 0000000000..09fc3212bf --- /dev/null +++ b/devtools/etrecord/TARGETS @@ -0,0 +1,18 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +oncall("executorch") + +python_library( + name = "etrecord", + srcs = [ + "__init__.py", + "_etrecord.py", + ], + deps = [ + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", + "//executorch/exir:lib", + "//executorch/exir/emit:emit", + "//executorch/exir/serde:serialize", + ], +) diff --git a/devtools/etrecord/__init__.py b/devtools/etrecord/__init__.py new file mode 100644 index 0000000000..59ff4e44c2 --- /dev/null +++ b/devtools/etrecord/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.devtools.etrecord._etrecord import ( + ETRecord, + generate_etrecord, + parse_etrecord, +) + +__all__ = ["ETRecord", "generate_etrecord", "parse_etrecord"] diff --git a/sdk/etrecord/_etrecord.py b/devtools/etrecord/_etrecord.py similarity index 96% rename from sdk/etrecord/_etrecord.py rename to devtools/etrecord/_etrecord.py index 55e231f216..de7cf93990 100644 --- a/sdk/etrecord/_etrecord.py +++ b/devtools/etrecord/_etrecord.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import json import os import pickle @@ -12,6 +14,9 @@ from zipfile import BadZipFile, ZipFile from executorch import exir +from executorch.devtools.bundled_program.core import BundledProgram + +from executorch.devtools.bundled_program.schema.bundled_program_schema import Value from executorch.exir import ( EdgeProgramManager, ExecutorchProgram, @@ -23,9 +28,6 @@ from executorch.exir.serde.export_serialize import SerializedArtifact from executorch.exir.serde.serialize import deserialize, serialize -from executorch.sdk.bundled_program.core import BundledProgram - -from executorch.sdk.bundled_program.schema.bundled_program_schema import Value ProgramOutput = List[Value] @@ -182,13 +184,13 @@ def generate_etrecord( is the closest graph module representation of what is eventually run on the device. In addition to all the graph modules, we also serialize the program buffer, which the users can provide to the ExecuTorch runtime to run the model, and the debug handle map - for SDK tooling usage. + for Developer Tools usage. Args: - etrecord_path: Path to where the `ETRecord` file will be saved to. + et_record: Path to where the `ETRecord` file will be saved to. edge_dialect_program: `EdgeProgramManager` for this model returned by the call to to_edge() executorch_program: The ExecuTorch program for this model returned by the call to `to_executorch()` or the `BundledProgram` of this model - export_modules[Optional]: **Should be ignored by OSS users**. 
A dictionary of graph modules with the key being the user provided name and the + export_modules [Optional]: **Should be ignored by OSS users**. A dictionary of graph modules with the key being the user provided name and the value being the corresponding exported module. The exported graph modules can be either the output of `torch.export()` or `exir.to_edge()`. @@ -201,7 +203,7 @@ def generate_etrecord( etrecord_zip = ZipFile(et_record, "w") # Write the magic file identifier that will be used to verify that this file - # is an etrecord when it's used later in the SDK tooling. + # is an etrecord when it's used later in the Developer Tools. etrecord_zip.writestr(ETRecordReservedFileNames.ETRECORD_IDENTIFIER, "") if export_modules is not None: diff --git a/devtools/etrecord/tests/TARGETS b/devtools/etrecord/tests/TARGETS new file mode 100644 index 0000000000..fffa7f1834 --- /dev/null +++ b/devtools/etrecord/tests/TARGETS @@ -0,0 +1,30 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "etrecord_test", + srcs = ["etrecord_test.py"], + deps = [ + "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", + "//executorch/exir:lib", + "//executorch/exir/tests:models", + ], +) + +python_library( + name = "etrecord_test_library", + srcs = ["etrecord_test.py"], + deps = [ + "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", + "//executorch/exir:lib", + "//executorch/exir/tests:models", + ], +) diff --git a/sdk/etrecord/tests/etrecord_test.py b/devtools/etrecord/tests/etrecord_test.py similarity index 95% rename from sdk/etrecord/tests/etrecord_test.py rename to devtools/etrecord/tests/etrecord_test.py index bc534fd487..daef7c3e1e 100644 --- a/sdk/etrecord/tests/etrecord_test.py +++ b/devtools/etrecord/tests/etrecord_test.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import copy import json import tempfile @@ -12,14 +14,14 @@ import executorch.exir.tests.models as models import torch from executorch import exir -from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.etrecord import generate_etrecord, parse_etrecord -from executorch.sdk.etrecord._etrecord import ( +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.etrecord import generate_etrecord, parse_etrecord +from executorch.devtools.etrecord._etrecord import ( _get_reference_outputs, ETRecordReservedFileNames, ) +from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge from torch.export import export @@ -75,7 +77,7 @@ def get_test_model_with_manager(self): return (aten_dialect, edge_program_copy, edge_program.to_executorch()) # Serialized and deserialized graph modules are not completely the same, so we check - # that they are close enough and match especially on the parameters we care about in the SDK. 
+ # that they are close enough and match especially on the parameters we care about in the Developer Tools. def check_graph_closeness(self, graph_a, graph_b): self.assertEqual(len(graph_a.graph.nodes), len(graph_b.graph.nodes)) for node_a, node_b in zip(graph_a.graph.nodes, graph_b.graph.nodes): diff --git a/devtools/inspector/TARGETS b/devtools/inspector/TARGETS new file mode 100644 index 0000000000..bba5f7f895 --- /dev/null +++ b/devtools/inspector/TARGETS @@ -0,0 +1,58 @@ +load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +oncall("executorch") + +python_library( + name = "inspector", + srcs = [ + "_inspector.py", + ], + deps = [ + "fbsource//third-party/pypi/ipython:ipython", + "fbsource//third-party/pypi/numpy:numpy", + "fbsource//third-party/pypi/pandas:pandas", + "fbsource//third-party/pypi/tabulate:tabulate", + ":inspector_utils", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord:etrecord", + "//executorch/exir:lib", + ], +) + +python_binary( + name = "inspector_cli", + main_function = ".inspector_cli.main", + main_src = "inspector_cli.py", + deps = [ + "//executorch/devtools:lib", + "//executorch/devtools/inspector:lib", + ], +) + +python_library( + name = "inspector_utils", + srcs = [ + "_inspector_utils.py", + ], + deps = [ + "fbsource//third-party/pypi/matplotlib:matplotlib", + "fbsource//third-party/pypi/numpy:numpy", + "//caffe2:torch", + "//executorch/devtools/debug_format:base_schema", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etdump:serialize", + "//executorch/devtools/etrecord:etrecord", + ], +) + +python_library( + name = "lib", + srcs = ["__init__.py"], + deps = [ + ":inspector", + ":inspector_utils", + ], +) diff --git a/devtools/inspector/__init__.py b/devtools/inspector/__init__.py new file mode 100644 index 0000000000..375123a0a5 --- /dev/null +++ b/devtools/inspector/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from executorch.devtools.inspector._inspector import ( + Event, + EventBlock, + Inspector, + PerfData, +) +from executorch.devtools.inspector._inspector_utils import compare_results, TimeScale + +__all__ = [ + "Event", + "EventBlock", + "Inspector", + "PerfData", + "compare_results", + "TimeScale", +] diff --git a/sdk/inspector/_inspector.py b/devtools/inspector/_inspector.py similarity index 94% rename from sdk/inspector/_inspector.py rename to devtools/inspector/_inspector.py index 5f9bfafee7..0539d4f5e4 100644 --- a/sdk/inspector/_inspector.py +++ b/devtools/inspector/_inspector.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
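For readers following the executorch.sdk to executorch.devtools rename in the hunks above, the relocated ETRecord entry points are exercised end to end by these tests. The sketch below restates that flow outside the test harness; the toy module, example inputs, and output path are placeholders rather than anything from this change, and the deep copy of the edge program mirrors what get_test_model_with_manager() does before calling to_executorch().

# Illustrative sketch only: AddOne, the example inputs, and /tmp/etrecord.bin
# are placeholders; the entry points come from executorch.devtools.etrecord.
import copy

import torch
from executorch.devtools.etrecord import generate_etrecord, parse_etrecord
from executorch.exir import to_edge
from torch.export import export


class AddOne(torch.nn.Module):
    def forward(self, x):
        return x + 1


aten_program = export(AddOne(), (torch.ones(2, 2),))
edge_program = to_edge(aten_program)

# Keep a copy of the edge dialect program, as the tests do, before lowering.
edge_program_copy = copy.deepcopy(edge_program)
executorch_program = edge_program.to_executorch()

# Write the ETRecord out and read it back for Developer Tools consumption.
generate_etrecord("/tmp/etrecord.bin", edge_program_copy, executorch_program)
etrecord = parse_etrecord("/tmp/etrecord.bin")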
+# pyre-unsafe + import dataclasses import logging import sys @@ -26,16 +28,20 @@ Union, ) -import executorch.sdk.etdump.schema_flatcc as flatcc +import executorch.devtools.etdump.schema_flatcc as flatcc import numpy as np import pandas as pd -from executorch.exir import ExportedProgram -from executorch.sdk.debug_format.et_schema import OperatorGraph, OperatorNode -from executorch.sdk.etdump.schema_flatcc import DebugEvent, ETDumpFlatCC, ProfileEvent -from executorch.sdk.etrecord import ETRecord, parse_etrecord -from executorch.sdk.inspector._inspector_utils import ( +from executorch.devtools.debug_format.et_schema import OperatorGraph, OperatorNode +from executorch.devtools.etdump.schema_flatcc import ( + DebugEvent, + ETDumpFlatCC, + ProfileEvent, +) +from executorch.devtools.etrecord import ETRecord, parse_etrecord +from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, EXCLUDED_COLUMNS_WHEN_PRINTING, @@ -49,10 +55,10 @@ is_inference_output_equal, ProgramOutput, RESERVED_FRAMEWORK_EVENT_NAMES, - TIME_SCALE_DICT, TimeScale, verify_debug_data_equivalence, ) +from executorch.exir import ExportedProgram from tabulate import tabulate @@ -146,6 +152,7 @@ def _gen_from_event(event: ProfileEvent) -> "ProfileEventSignature": # Signature of a DebugEvent @dataclass(frozen=True, order=True) class DebugEventSignature: + name: str = "" instruction_id: Optional[int] = -1 delegate_id: Optional[int] = None delegate_id_str: Optional[str] = None @@ -159,6 +166,7 @@ def _gen_from_event(event: DebugEvent) -> "DebugEventSignature": The Signature will convert these back to the intended None value """ return DebugEventSignature( + event.name or "", event.instruction_id if event.instruction_id != -1 else None, event.delegate_debug_id_int if event.delegate_debug_id_int != -1 else None, event.delegate_debug_id_str if event.delegate_debug_id_str != "" else None, @@ -464,46 +472,63 @@ def _calculate_elapsed_time(start_time, end_time): return elapsed_time @staticmethod - def _populate_profiling_related_fields( + def _populate_event_signature_fields( ret_event: "Event", - profile_event_signature: Optional[ProfileEventSignature], - events: List[InstructionEvent], - scale_factor: float, + event_signature: Optional[Union[ProfileEventSignature, DebugEventSignature]], ) -> None: """ Given a partially constructed Event, populate the fields related to - the profile events + the profile event signature or debug event signature Fields Updated: name delegate_debug_identifier is_delegated_op - perf_data - delegate_debug_metadatas """ - - # Fill out fields from profile event signature - if profile_event_signature is not None: - if profile_event_signature.delegate_id is not None: # 0 is a valid value - delegate_debug_identifier = profile_event_signature.delegate_id + # TODO: T201347372 Push the None check to ealier in the stack. 
+ if event_signature is not None: + if event_signature.delegate_id is not None: # 0 is a valid value + delegate_debug_identifier = event_signature.delegate_id else: - delegate_debug_identifier = ( - profile_event_signature.delegate_id_str or None - ) + delegate_debug_identifier = event_signature.delegate_id_str or None # Use the delegate identifier as the event name if delegated is_delegated_op = delegate_debug_identifier is not None name = ( - profile_event_signature.name + event_signature.name if not is_delegated_op else str(delegate_debug_identifier) ) # Update fields - ret_event.name = name + # This is for older version of etdump that doesn't have the name field for debug events, we don't update the name field + if name: + ret_event.name = name ret_event.delegate_debug_identifier = delegate_debug_identifier ret_event.is_delegated_op = is_delegated_op + @staticmethod + def _populate_profiling_related_fields( + ret_event: "Event", + profile_event_signature: Optional[ProfileEventSignature], + events: List[InstructionEvent], + scale_factor: float, + ) -> None: + """ + Given a partially constructed Event, populate the fields related to + the profile events + + Fields Updated: + name + delegate_debug_identifier + is_delegated_op + perf_data + delegate_debug_metadatas + """ + + # Fill out fields from profile event signature + Event._populate_event_signature_fields(ret_event, profile_event_signature) + # Fill out fields from profile event data = [] delegate_debug_metadatas = [] @@ -571,9 +596,15 @@ def _populate_debugging_related_fields( the debug events Fields Updated: + name + delegate_debug_identifier + is_delegated_op debug_data """ + # Fill out fields from debug event signature + Event._populate_event_signature_fields(ret_event, debug_event_signature) + debug_data: List[flatcc.Value] = [] for event in events: if (debug_events := event.debug_events) is None: @@ -795,9 +826,7 @@ class GroupedRunInstances: # Construct the EventBlocks event_blocks = [] - scale_factor = ( - TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] - ) + scale_factor = calculate_time_scale_factor(source_time_scale, target_time_scale) for run_signature, grouped_run_instance in run_groups.items(): run_group: OrderedDict[EventSignature, List[InstructionEvent]] = ( grouped_run_instance.events @@ -962,6 +991,9 @@ def __init__( debug_buffer_path: Debug buffer file path that contains the debug data referenced by ETDump for intermediate and program outputs. delegate_metadata_parser: Optional function to parse delegate metadata from an Profiling Event. Expected signature of the function is: (delegate_metadata_list: List[bytes]) -> Union[List[str], Dict[str, Any]] + delegate_time_scale_converter: Optional function to convert the time scale of delegate profiling data. If not given, use the conversion ratio of + target_time_scale/source_time_scale. + enable_module_hierarchy: Enable submodules in the operator graph. Defaults to False. 
Returns: None @@ -976,6 +1008,14 @@ def __init__( self._source_time_scale = source_time_scale self._target_time_scale = target_time_scale + if delegate_time_scale_converter is None: + scale_factor = calculate_time_scale_factor( + source_time_scale, target_time_scale + ) + delegate_time_scale_converter = ( + lambda event_name, input_time: input_time / scale_factor + ) + if etrecord is None: self._etrecord = None elif isinstance(etrecord, ETRecord): @@ -998,10 +1038,10 @@ def __init__( ) self.event_blocks = EventBlock._gen_from_etdump( - etdump, - self._source_time_scale, - self._target_time_scale, - output_buffer, + etdump=etdump, + source_time_scale=self._source_time_scale, + target_time_scale=self._target_time_scale, + output_buffer=output_buffer, delegate_metadata_parser=delegate_metadata_parser, delegate_time_scale_converter=delegate_time_scale_converter, ) diff --git a/sdk/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py similarity index 95% rename from sdk/inspector/_inspector_utils.py rename to devtools/inspector/_inspector_utils.py index 6879e85505..5f04e2d041 100644 --- a/sdk/inspector/_inspector_utils.py +++ b/devtools/inspector/_inspector_utils.py @@ -4,18 +4,20 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import math from enum import Enum from typing import Dict, List, Mapping, Optional, Tuple, TypeAlias, Union -import executorch.sdk.etdump.schema_flatcc as flatcc +import executorch.devtools.etdump.schema_flatcc as flatcc import torch -from executorch.sdk.debug_format.base_schema import OperatorNode +from executorch.devtools.debug_format.base_schema import OperatorNode -from executorch.sdk.debug_format.et_schema import FXOperatorGraph, OperatorGraph -from executorch.sdk.etdump.schema_flatcc import ( +from executorch.devtools.debug_format.et_schema import FXOperatorGraph, OperatorGraph +from executorch.devtools.etdump.schema_flatcc import ( DebugEvent, ETDumpFlatCC, ProfileEvent, @@ -25,8 +27,8 @@ ValueType, ) -from executorch.sdk.etdump.serialize import deserialize_from_etdump_flatcc -from executorch.sdk.etrecord import ETRecord +from executorch.devtools.etdump.serialize import deserialize_from_etdump_flatcc +from executorch.devtools.etrecord import ETRecord FORWARD = "forward" EDGE_DIALECT_GRAPH_KEY = "edge_dialect_graph_module" @@ -63,6 +65,15 @@ class TimeScale(Enum): } +def calculate_time_scale_factor( + source_time_scale: TimeScale, target_time_scale: TimeScale +) -> float: + """ + Calculate the factor (source divided by target) between two time scales + """ + return TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] + + # Model Debug Output InferenceOutput: TypeAlias = Union[ torch.Tensor, List[torch.Tensor], int, float, str, bool, None diff --git a/sdk/inspector/inspector_cli.py b/devtools/inspector/inspector_cli.py similarity index 93% rename from sdk/inspector/inspector_cli.py rename to devtools/inspector/inspector_cli.py index d6c8d5442f..db3536a84b 100644 --- a/sdk/inspector/inspector_cli.py +++ b/devtools/inspector/inspector_cli.py @@ -4,10 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
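The time-scale plumbing added to Inspector.__init__ above reduces to one ratio: calculate_time_scale_factor(source, target) divides the source scale by the target scale, and when no delegate_time_scale_converter is passed, the constructor installs a converter that divides delegate timestamps by that same factor. A short sketch follows, pinned to the NS-to-MS value that the unit test added further below asserts; the Inspector call at the end uses placeholder file paths and assumes an ETDump/ETRecord pair produced elsewhere.

# Sketch of the new helper and the default delegate time converter.
from executorch.devtools import Inspector
from executorch.devtools.inspector._inspector_utils import (
    calculate_time_scale_factor,
    TimeScale,
)

# Source divided by target: 1 ms == 1_000_000 ns, as the new test asserts.
scale_factor = calculate_time_scale_factor(TimeScale.NS, TimeScale.MS)


def default_converter(event_name: str, input_time: float) -> float:
    # Mirrors the lambda installed by Inspector.__init__ when none is given.
    return input_time / scale_factor


print(default_converter("linear", 3_000_000.0))  # a 3e6 ns reading becomes 3.0 ms

inspector = Inspector(
    etdump_path="/tmp/etdump.etdp",  # placeholder path
    etrecord="/tmp/etrecord.bin",    # placeholder path
    source_time_scale=TimeScale.NS,
    target_time_scale=TimeScale.MS,
)
inspector.print_data_tabular()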
+# pyre-unsafe + import argparse -from executorch.sdk import Inspector -from executorch.sdk.inspector._inspector_utils import compare_results, TimeScale +from executorch.devtools import Inspector +from executorch.devtools.inspector import compare_results, TimeScale def main() -> None: diff --git a/devtools/inspector/tests/TARGETS b/devtools/inspector/tests/TARGETS new file mode 100644 index 0000000000..eada6817bc --- /dev/null +++ b/devtools/inspector/tests/TARGETS @@ -0,0 +1,41 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "inspector_test", + srcs = ["inspector_test.py"], + deps = [ + "//executorch/devtools:lib", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord/tests:etrecord_test_library", + "//executorch/devtools/inspector:inspector", + "//executorch/devtools/inspector:lib", + "//executorch/exir:lib", + ], +) + +python_unittest( + name = "event_blocks_test", + srcs = ["event_blocks_test.py"], + deps = [ + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/inspector:inspector", + "//executorch/devtools/inspector:lib", + ], +) + +python_unittest( + name = "inspector_utils_test", + srcs = ["inspector_utils_test.py"], + deps = [ + "//caffe2:torch", + "//executorch/devtools:lib", + "//executorch/devtools/debug_format:base_schema", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord/tests:etrecord_test_library", + "//executorch/devtools/inspector:inspector_utils", + ], +) diff --git a/sdk/inspector/tests/event_blocks_test.py b/devtools/inspector/tests/event_blocks_test.py similarity index 89% rename from sdk/inspector/tests/event_blocks_test.py rename to devtools/inspector/tests/event_blocks_test.py index 7c7da00186..85b65aa5f3 100644 --- a/sdk/inspector/tests/event_blocks_test.py +++ b/devtools/inspector/tests/event_blocks_test.py @@ -8,10 +8,10 @@ import unittest from typing import List, Optional, Tuple, Union -import executorch.sdk.etdump.schema_flatcc as flatcc -from executorch.sdk.etdump.schema_flatcc import ETDumpFlatCC, ProfileEvent -from executorch.sdk.inspector import Event, EventBlock, PerfData -from executorch.sdk.inspector._inspector import ( +import executorch.devtools.etdump.schema_flatcc as flatcc +from executorch.devtools.etdump.schema_flatcc import ETDumpFlatCC, ProfileEvent +from executorch.devtools.inspector import Event, EventBlock, PerfData +from executorch.devtools.inspector._inspector import ( DelegateMetadata, EventSignature, InstructionEvent, @@ -62,6 +62,7 @@ def _gen_sample_profile_event( def _gen_sample_debug_event( instruction_id: int, delegate_debug_id: Optional[Union[int, str]] = None, + name: str = "test_debug_event", ) -> flatcc.DebugEvent: """ Helper for generating test DebugEvents @@ -77,6 +78,7 @@ def _gen_sample_debug_event( ) return flatcc.DebugEvent( + name=name, chain_index=0, instruction_id=instruction_id, delegate_debug_id_int=delegate_debug_id_int, @@ -299,6 +301,42 @@ def _get_sample_etdump_flatcc_profiling_and_debugging() -> flatcc.ETDumpFlatCC: return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2, run_data_3]) + @staticmethod + def _get_sample_etdump_flatcc_debug_events_only( + event_name: str, + delegate_debug_id: str, + ) -> flatcc.ETDumpFlatCC: + """ + Helper for getting a sample ETDumpFlatCC object with RunData signature_a + and (debug_event_delegated, 
debug_event_non_delegated, no profile event) + """ + + debug_event_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, delegate_debug_id=delegate_debug_id, name=event_name + ) + debug_event_non_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, name=event_name + ) + run_data_1 = flatcc.RunData( + name="signature_a", + bundled_input_index=-1, + allocators=[], + events=[ + flatcc.Event( + allocation_event=None, + debug_event=debug_event_delegated, + profile_event=None, + ), + flatcc.Event( + allocation_event=None, + debug_event=debug_event_non_delegated, + profile_event=None, + ), + ], + ) + + return ETDumpFlatCC(version=0, run_data=[run_data_1]) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def test_gen_from_etdump(self) -> None: @@ -370,6 +408,30 @@ def test_gen_from_etdump_inconsistent_debug_data(self) -> None: with self.assertRaises(AssertionError): EventBlock._gen_from_etdump(etdump) + def test_gen_from_etdump_debug_events_only(self) -> None: + """ + Test generation of EventBlocks given an ETDump with only debugging events + + Specifically it tests: + - Correct number of EventBlocks and Events + - Correct name of each Event + """ + event_name = "test_debug_event_only" + delegate_debug_id = "debug_id" + etdump: ETDumpFlatCC = ( + TestEventBlock._get_sample_etdump_flatcc_debug_events_only( + event_name=event_name, + delegate_debug_id=delegate_debug_id, + ) + ) + event_blocks = EventBlock._gen_from_etdump(etdump) + self.assertEqual(len(event_blocks), 1) + self.assertEqual(len(event_blocks[0].events), 2) + # Delegated event uses delegate_debug_id as event name + self.assertEqual(event_blocks[0].events[0].name, delegate_debug_id) + # Non delegated event uses event_name as event name + self.assertEqual(event_blocks[0].events[1].name, event_name) + def test_inspector_event_generation(self) -> None: """ Test Inspector.Event derivation from various ProfileEvent cases diff --git a/sdk/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py similarity index 89% rename from sdk/inspector/tests/inspector_test.py rename to devtools/inspector/tests/inspector_test.py index a372c7c569..34c96eef53 100644 --- a/sdk/inspector/tests/inspector_test.py +++ b/devtools/inspector/tests/inspector_test.py @@ -4,31 +4,41 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
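The two expectations in test_gen_from_etdump_debug_events_only above follow from the naming rule that _populate_event_signature_fields now applies to profile and debug signatures alike: a delegate debug id (an integer, where 0 is valid, or a non-empty string) becomes the event name, otherwise the signature name is used, and an empty name from an older etdump leaves the event's existing name alone. The snippet below is a standalone paraphrase of that rule for reference, not the Inspector implementation itself.

# Paraphrase of the naming decision in Event._populate_event_signature_fields.
from typing import Optional, Union


def resolve_event_name(
    signature_name: str,
    delegate_id: Optional[int],
    delegate_id_str: Optional[str],
    current_name: str = "",
) -> str:
    # 0 is a valid integer delegate id, so compare against None explicitly.
    delegate_debug_identifier: Optional[Union[int, str]] = (
        delegate_id if delegate_id is not None else (delegate_id_str or None)
    )
    is_delegated_op = delegate_debug_identifier is not None
    name = str(delegate_debug_identifier) if is_delegated_op else signature_name
    # Older etdumps carry no name on debug events; keep the existing name then.
    return name if name else current_name


# Matches the assertions in test_gen_from_etdump_debug_events_only:
assert resolve_event_name("test_debug_event_only", None, "debug_id") == "debug_id"
assert resolve_event_name("test_debug_event_only", None, None) == "test_debug_event_only"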
+# pyre-unsafe + import random import statistics import tempfile import unittest from contextlib import redirect_stdout -from typing import List +from typing import Callable, List from unittest.mock import patch -from executorch.exir import ExportedProgram -from executorch.sdk import generate_etrecord, parse_etrecord -from executorch.sdk.debug_format.et_schema import OperatorNode -from executorch.sdk.etdump.schema_flatcc import ProfileEvent -from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord - -from executorch.sdk.inspector import _inspector, Event, EventBlock, Inspector, PerfData -from executorch.sdk.inspector._inspector import ( +from executorch.devtools import generate_etrecord, parse_etrecord +from executorch.devtools.debug_format.et_schema import OperatorNode +from executorch.devtools.etdump.schema_flatcc import ProfileEvent +from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord + +from executorch.devtools.inspector import ( + _inspector, + Event, + EventBlock, + Inspector, + PerfData, +) +from executorch.devtools.inspector._inspector import ( DebugEventSignature, flatcc, InstructionEvent, InstructionEventSignature, ProfileEventSignature, + TimeScale, ) +from executorch.exir import ExportedProgram + OP_TYPE = "aten::add" EVENT_BLOCK_NAME = "block_0" @@ -81,6 +91,33 @@ def test_inspector_constructor(self): # Because we mocked parse_etrecord() to return None, this method shouldn't be called mock_gen_graphs_from_etrecord.assert_not_called() + def test_default_delegate_time_scale_converter(self): + # Create a context manager to patch functions called by Inspector.__init__ + with patch.object( + _inspector, "parse_etrecord", return_value=None + ), patch.object( + _inspector, "gen_etdump_object", return_value=None + ), patch.object( + EventBlock, "_gen_from_etdump" + ) as mock_gen_from_etdump, patch.object( + _inspector, "gen_graphs_from_etrecord" + ), patch.object( + _inspector, "create_debug_handle_to_op_node_mapping" + ): + # Call the constructor of Inspector + Inspector( + etdump_path=ETDUMP_PATH, + etrecord=ETRECORD_PATH, + source_time_scale=TimeScale.US, + target_time_scale=TimeScale.S, + ) + + # Verify delegate_time_scale_converter is set to be a callable + self.assertIsInstance( + mock_gen_from_etdump.call_args.get("delegate_time_scale_converter"), + Callable, + ) + def test_inspector_print_data_tabular(self): # Create a context manager to patch functions called by Inspector.__init__ with patch.object( @@ -281,6 +318,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -304,6 +342,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): # Note the sizes of this tensor are different from the previous one debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -348,6 +387,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -371,6 +411,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): # Same as the event above except for offset debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, diff --git a/sdk/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py 
similarity index 88% rename from sdk/inspector/tests/inspector_utils_test.py rename to devtools/inspector/tests/inspector_utils_test.py index b5b9b54d6c..73511f5fcd 100644 --- a/sdk/inspector/tests/inspector_utils_test.py +++ b/devtools/inspector/tests/inspector_utils_test.py @@ -4,30 +4,34 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import tempfile import unittest from typing import Dict, Tuple import torch -from executorch.sdk import generate_etrecord, parse_etrecord +from executorch.devtools import generate_etrecord, parse_etrecord -from executorch.sdk.debug_format.base_schema import ( +from executorch.devtools.debug_format.base_schema import ( OperatorGraph, OperatorNode, ValueNode, ) -from executorch.sdk.debug_format.et_schema import FXOperatorGraph -from executorch.sdk.etdump import schema_flatcc as flatcc +from executorch.devtools.debug_format.et_schema import FXOperatorGraph +from executorch.devtools.etdump import schema_flatcc as flatcc -from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord -from executorch.sdk.inspector._inspector_utils import ( +from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord +from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, find_populated_event, gen_graphs_from_etrecord, is_inference_output_equal, + TimeScale, ) @@ -74,6 +78,7 @@ def test_find_populated_event(self): end_time=2002, ) debug_event = flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", @@ -170,6 +175,19 @@ def test_is_inference_output_equal_returns_true_for_same_strs(self): ) ) + def test_calculate_time_scale_factor_second_based(self): + self.assertEqual( + calculate_time_scale_factor(TimeScale.NS, TimeScale.MS), 1000000 + ) + self.assertEqual( + calculate_time_scale_factor(TimeScale.MS, TimeScale.NS), 1 / 1000000 + ) + + def test_calculate_time_scale_factor_cycles(self): + self.assertEqual( + calculate_time_scale_factor(TimeScale.CYCLES, TimeScale.CYCLES), 1 + ) + def gen_mock_operator_graph_with_expected_map() -> ( Tuple[OperatorGraph, Dict[int, OperatorNode]] diff --git a/devtools/size_analysis_tool/TARGETS b/devtools/size_analysis_tool/TARGETS new file mode 100644 index 0000000000..c365ba152d --- /dev/null +++ b/devtools/size_analysis_tool/TARGETS @@ -0,0 +1,51 @@ +load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_library( + name = "size_analysis_tool_lib", + srcs = [ + "size_analysis_tool.py", + ], + visibility = ["PUBLIC"], + deps = [ + "//caffe2:torch", + "//executorch/devtools:lib", + "//executorch/exir:lib", + "//executorch/exir/backend:backend_api", + ], +) + +python_binary( + name = "size_analysis_tool", + srcs = [ + "size_analysis_tool.py", + ], + main_function = "executorch.devtools.size_analysis_tool.size_analysis_tool.main", + visibility = ["PUBLIC"], + deps = [ + "//caffe2:torch", + "//executorch/devtools:lib", + "//executorch/exir:lib", + "//executorch/exir/backend:backend_api", + ], +) + +python_unittest( + name = "size_analysis_tool_test", + srcs = [ + "size_analysis_tool.py", + "size_analysis_tool_test.py", + ], + deps = [ + "//caffe2:torch", + 
"//executorch/backends/xnnpack/partition:xnnpack_partitioner", + "//executorch/backends/xnnpack/utils:xnnpack_utils", + "//executorch/devtools:lib", + "//executorch/exir:lib", + "//executorch/exir/backend:backend_api", + "//executorch/exir/passes:spec_prop_pass", + ], +) diff --git a/sdk/size_analysis_tool/size_analysis_tool.py b/devtools/size_analysis_tool/size_analysis_tool.py similarity index 99% rename from sdk/size_analysis_tool/size_analysis_tool.py rename to devtools/size_analysis_tool/size_analysis_tool.py index d17ec5ac47..8ea8ddbbf4 100644 --- a/sdk/size_analysis_tool/size_analysis_tool.py +++ b/devtools/size_analysis_tool/size_analysis_tool.py @@ -9,10 +9,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple import torch +from executorch.devtools import parse_etrecord from executorch.exir import ExportedProgram from executorch.exir.backend.backend_api import LoweredBackendModule -from executorch.sdk import parse_etrecord def _get_tensor_data(node: torch.fx.Node, tensor: torch.Tensor) -> Dict[str, Any]: diff --git a/sdk/size_analysis_tool/size_analysis_tool_test.py b/devtools/size_analysis_tool/size_analysis_tool_test.py similarity index 98% rename from sdk/size_analysis_tool/size_analysis_tool_test.py rename to devtools/size_analysis_tool/size_analysis_tool_test.py index 3e1efec77b..96feae7e42 100644 --- a/sdk/size_analysis_tool/size_analysis_tool_test.py +++ b/devtools/size_analysis_tool/size_analysis_tool_test.py @@ -14,12 +14,12 @@ get_xnnpack_executorch_backend_config, ) from executorch.backends.xnnpack.utils.utils import capture_graph_for_xnnpack -from executorch.exir.backend.backend_api import to_backend, validation_disabled -from executorch.exir.passes.spec_prop_pass import SpecPropPass -from executorch.sdk.size_analysis_tool.size_analysis_tool import ( +from executorch.devtools.size_analysis_tool.size_analysis_tool import ( generate_model_size_information, ) +from executorch.exir.backend.backend_api import to_backend, validation_disabled +from executorch.exir.passes.spec_prop_pass import SpecPropPass class SizeAnalysisToolTest(unittest.TestCase): diff --git a/devtools/targets.bzl b/devtools/targets.bzl new file mode 100644 index 0000000000..17d9e89cad --- /dev/null +++ b/devtools/targets.bzl @@ -0,0 +1,8 @@ +def build_sdk(): + return native.read_config("executorch", "build_sdk", "false") == "true" + +def get_sdk_flags(): + sdk_flags = [] + if build_sdk(): + sdk_flags += ["-DEXECUTORCH_BUILD_DEVTOOLS"] + return sdk_flags diff --git a/docs/Makefile b/docs/Makefile index ae8470b58c..219998d4b4 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -2,7 +2,7 @@ # # You can set these variables from the command line. 
-# SPHINXOPTS = -WT --keep-going TODO(T165752164) fix sphinx warnings around preprocess macros in cpp like __ET_DEPRECATED +# SPHINXOPTS = -WT --keep-going TODO(T165752164) fix sphinx warnings around preprocess macros in cpp like ET_DEPRECATED SPHINXBUILD = sphinx-build SPHINXPROJ = ExecuTorch SOURCEDIR = source diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile index b741509197..e662105b83 100644 --- a/docs/source/Doxyfile +++ b/docs/source/Doxyfile @@ -964,8 +964,7 @@ INPUT = ../runtime/executor/memory_manager.h \ ../runtime/core/tensor_shape_dynamism.h \ ../runtime/platform/compiler.h \ ../runtime/executor/ \ - ../runtime/platform/ \ - ../util/ + ../runtime/platform/ diff --git a/docs/source/_static/img/api_life_cycle.png b/docs/source/_static/img/api_life_cycle.png new file mode 100644 index 0000000000..47b8e7c318 Binary files /dev/null and b/docs/source/_static/img/api_life_cycle.png differ diff --git a/docs/source/_static/img/benchmark-infra.png b/docs/source/_static/img/benchmark-infra.png new file mode 100644 index 0000000000..a5d3077425 Binary files /dev/null and b/docs/source/_static/img/benchmark-infra.png differ diff --git a/docs/source/_static/img/chat.png b/docs/source/_static/img/chat.png new file mode 100644 index 0000000000..e7ed934519 Binary files /dev/null and b/docs/source/_static/img/chat.png differ diff --git a/docs/source/_static/img/chat_response.png b/docs/source/_static/img/chat_response.png new file mode 100644 index 0000000000..714265276f Binary files /dev/null and b/docs/source/_static/img/chat_response.png differ diff --git a/docs/source/_static/img/llava_example.png b/docs/source/_static/img/llava_example.png new file mode 100644 index 0000000000..ccac335ee6 Binary files /dev/null and b/docs/source/_static/img/llava_example.png differ diff --git a/docs/source/_static/img/load_complete_and_start_prompt.png b/docs/source/_static/img/load_complete_and_start_prompt.png new file mode 100644 index 0000000000..43d81f10d0 Binary files /dev/null and b/docs/source/_static/img/load_complete_and_start_prompt.png differ diff --git a/docs/source/_static/img/logs.png b/docs/source/_static/img/logs.png new file mode 100644 index 0000000000..e35227a1c0 Binary files /dev/null and b/docs/source/_static/img/logs.png differ diff --git a/docs/source/_static/img/mtk_changes_to_shell_file.png b/docs/source/_static/img/mtk_changes_to_shell_file.png new file mode 100644 index 0000000000..7fa4e46186 Binary files /dev/null and b/docs/source/_static/img/mtk_changes_to_shell_file.png differ diff --git a/docs/source/_static/img/mtk_output.png b/docs/source/_static/img/mtk_output.png new file mode 100644 index 0000000000..e41d54c356 Binary files /dev/null and b/docs/source/_static/img/mtk_output.png differ diff --git a/docs/source/_static/img/opening_the_app_details.png b/docs/source/_static/img/opening_the_app_details.png new file mode 100644 index 0000000000..60494ecc69 Binary files /dev/null and b/docs/source/_static/img/opening_the_app_details.png differ diff --git a/docs/source/_static/img/settings_menu.png b/docs/source/_static/img/settings_menu.png new file mode 100644 index 0000000000..028e6b55cd Binary files /dev/null and b/docs/source/_static/img/settings_menu.png differ diff --git a/docs/source/api-life-cycle.md b/docs/source/api-life-cycle.md new file mode 100644 index 0000000000..1836ba77d7 --- /dev/null +++ b/docs/source/api-life-cycle.md @@ -0,0 +1,217 @@ +# ExecuTorch API Life Cycle and Deprecation Policy + +## API Life Cycle + +![name](_static/img/api_life_cycle.png) 
+ +Each API of ExecuTorch falls into one of the following life cycle states: + +_Experimental_ + +- APIs in this stage are under active development and may change or be removed + at any time. That said, the expectation is that we will eventually promote it + to _Stable_, unless sufficient negative signals have been collected from the + community or better alternatives have been found. +- _Experimental_ APIs will be clearly marked (see the “How to Mark API State” + section below). +- _Experimental_ APIs may be changed or removed without notice, and developers + should expect no stability guarantees. + +_Stable_ + +- APIs are considered to be _Stable_ if they are not marked as _Experimental_ or + _Deprecated._ +- APIs in this stage have been thoroughly tested and are considered ready for + production use. +- The recommended best practice is to not deprecate stable APIs. When writing an + API, write it in such a way that it doesn’t need to be deprecated in the + future. +- _Stable_ APIs can be changed, but not in a breaking way. If breaking changes + have to be made, _Stable_ APIs will always transition to _Deprecated_ before + being broken/removed from the library. + +_Deprecated_ + +- APIs in this stage are no longer recommended for use and will be removed in a + future version of ExecuTorch. +- _Deprecated_ APIs will be clearly marked (see the “How to Mark API State” + section below). +- _Deprecated_ APIs will remain functional for at least the _deprecation period_ + (see the “Deprecation Period” section below) to allow developers time to + migrate to alternative APIs. + +_Deleted_ + +- APIs whose removal are made permanent. Cleaned up from both code and + documentation. + +## Deprecation Policy + +Follow these steps to deprecate and remove an API: + +1. Discuss the change and collect initial feedback. +2. Clearly mark the API deprecated in code and documentation (See “How to Mark + API State” below). +3. Listen to user feedback after the first release that deprecates the API. + Users who weren't involved in the original discussion may have good arguments + for not deprecating or removing the API. +4. Once the deprecation period has passed, the API may be removed (See + “Deprecation Period” below). Be sure to also remove references from the + documentation. + + +We also use deprecation as a way to make breaking changes to an existing +interface: for example, if adding a non-optional parameter to a method. To do +this without breaking existing users: + +1. In a single commit: + - Create a new API that meets the new requirements. + - Deprecate the old API and recommend that users move to the new API. +2. Migrate use cases from the old API to the new API. +3. Delete the old API after the deprecation period. + +## How to Mark API State + +When possible, the ExecuTorch code uses language-standard ways to annotate API +lifecycle state in the code. This makes it easier for IDEs and other tools to +communicate state to developers. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Language | Code | Documentation |
+| --- | --- | --- |
+| Python | Deprecated: use the `executorch.exir._warnings.deprecated` decorator. Experimental: use the `executorch.exir._warnings.experimental` decorator. | Use `.. warning::` in the docstrings of deprecated and experimental APIs (see example usage). |
+| C++ | Deprecated: use the `ET_DEPRECATED` annotation macro (see example usage). Experimental: use the `ET_EXPERIMENTAL` annotation macro. | Deprecated: start Doxygen comments with `DEPRECATED:` (see example usage). Experimental: start Doxygen comments with `EXPERIMENTAL:`. |
+| Java | Deprecated: use `java.lang.Deprecated`. Experimental: use `androidx.annotation.RequiresOptIn`. | Deprecated: `/** @deprecated Use {@link #newMethod()} instead. */` Experimental: `/** Warning: This API is experimental. */` |
+| Objective-C | Deprecated: `__attribute__((deprecated("Use newMethod instead")));` Experimental: `__attribute__((deprecated("This API is experimental and may change without notice.")));` | Deprecated: `/** @deprecated Use newMethod instead. */` Experimental: `/** @experimental This API is experimental. */` |
+| Swift | Deprecated: `@available(*, deprecated, message: "Use newMethod instead")` Experimental: `@available(*, message: "This API is experimental")` | Deprecated: `/// - Warning: Deprecated. Use newMethod() instead.` Experimental: `/// - Warning: This API is experimental.` |
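+
+For illustration, marking a Python API as deprecated with the decorator named in the table might look like the minimal sketch below. The decorator import path comes from the table above; the function name, message, and exact decorator signature are assumptions for illustration only, not taken from the ExecuTorch source.
+
+```python
+# Hypothetical example; assumes the decorator accepts a human-readable message.
+from executorch.exir._warnings import deprecated
+
+
+@deprecated("Use new_load_api() instead; this API may be removed in a future release.")
+def old_load_api(path: str) -> None:
+    """DEPRECATED: Use new_load_api() instead."""
+    ...
+```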

+ +The annotations would trigger static and/or runtime warning that contains at +least these information: + +1. Clearly point to the non-deprecated alternative to migrate to, or be clear if + there is no alternative; +2. Specify the earliest version in which the API may actually be removed (See + “Deprecation Period” below). + +## Deprecation Period + +Here we recommend waiting for at least 2 minor releases before the removal. For +example, if a function is marked as _deprecated_ in release 1.3.x, then it can +be _deleted_ in 1.5.x or later. diff --git a/docs/source/build-run-coreml.md b/docs/source/build-run-coreml.md index 39794ac06c..52755773ee 100644 --- a/docs/source/build-run-coreml.md +++ b/docs/source/build-run-coreml.md @@ -127,7 +127,7 @@ python examples/apple/coreml/scripts/inspector_cli.py --etdump_path etdump.etdp 1. Build frameworks, running the following will create a `executorch.xcframework` and `coreml_backend.xcframework` in the `cmake-out` directory. ```bash cd executorch -./build/build_apple_frameworks.sh --Release --coreml +./build/build_apple_frameworks.sh --coreml ``` 2. Create a new [Xcode project](https://developer.apple.com/documentation/xcode/creating-an-xcode-project-for-an-app#) or open an existing project. diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index ff5cb51595..94a936b2e7 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -5,6 +5,7 @@ build ExecuTorch for Qualcomm AI Engine Direct and running a model on it. Qualcomm AI Engine Direct is also referred to as QNN in the source and documentation. + ::::{grid} 2 :::{grid-item-card} What you will learn in this tutorial: @@ -35,11 +36,10 @@ Currently, this ExecuTorch Backend can delegate AI computations to Hexagon proce ### Host OS -The Linux host operating system that QNN Backend is verified with is Ubuntu 20.04 LTS x64. - -However, because Qualcomm Package Manager(QPM) used to download necessary SDK (see below) -only support Ubuntu, we recommend users to exercise this tutorial exacly -on Ubuntu 20.04. +The Linux host operating system that QNN Backend is verified with is Ubuntu 22.04 LTS x64 +at the moment of updating this tutorial. +Usually, we verified the backend on the same OS version which QNN is verified with. +The version is documented in QNN SDK. ### Hardware: You will need an Android smartphone with adb-connected running on one of below Qualcomm SoCs: @@ -53,20 +53,16 @@ This example is verified with SM8550 and SM8450. ### Software: - Follow ExecuTorch recommended Python version. - - A compiler to compile AOT parts. GCC 9.4 come with Ubuntu20.04 is verified. - - [Android NDK](https://developer.android.com/ndk). This example is verified with NDK 25c. + - A compiler to compile AOT parts, e.g., the GCC compiler comes with Ubuntu LTS. + - [Android NDK](https://developer.android.com/ndk). This example is verified with NDK 26c. - [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) - - Follow the download button. After logging in, search Qualcomm AI Stack at the *Tool* panel. - - You can find Qualcomm AI Engine Direct SDK under the AI Stack group. - - Please download the Linux version, and follow instructions on the page to extract the file. - - The SDK should be installed to somewhere `/opt/qcom/aistack/qnn` by default. - - It's also OK to place it somewhere else. 
We don't have assumption about the absolute path of the SDK. - - This example is verified with version 2.12.0. + - Click the "Get Software" button to download a version of QNN SDK. + - However, at the moment of updating this tutorial, the above website doesn't provide QNN SDK newer than 2.22.6. + - The below is public links to download various QNN versions. Hope they can be publicly discoverable soon. + - [QNN 2.26.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.26.0.240828.zip) The directory with installed Qualcomm AI Engine Direct SDK looks like: ``` -$ tree -L 1 /opt/qcom/aistack/qnn// -/opt/qcom/aistack/qnn// ├── benchmarks ├── bin ├── docs @@ -74,11 +70,15 @@ $ tree -L 1 /opt/qcom/aistack/qnn// ├── include ├── lib ├── LICENSE.pdf +├── NOTICE.txt +├── NOTICE_WINDOWS.txt ├── QNN_NOTICE.txt ├── QNN_README.txt ├── QNN_ReleaseNotes.txt -├── share -└── Uninstall +├── ReleaseNotes.txt +├── ReleaseNotesWindows.txt +├── sdk.yaml +└── share ``` @@ -89,7 +89,7 @@ $ tree -L 1 /opt/qcom/aistack/qnn// `$QNN_SDK_ROOT` refers to the root of Qualcomm AI Engine Direct SDK, i.e., the directory containing `QNN_README.txt`. -`$ANDROID_NDK` refers to the root of Android NDK. +`$ANDROID_NDK_ROOT` refers to the root of Android NDK. `$EXECUTORCH_ROOT` refers to the root of executorch git repository. @@ -107,7 +107,16 @@ export PYTHONPATH=$EXECUTORCH_ROOT/.. ## Build -An example script for below building instructions is [here](https://github.com/pytorch/executorch/blob/main/backends/qualcomm/scripts/build.sh). +An example script for the below building instructions is [here](https://github.com/pytorch/executorch/blob/main/backends/qualcomm/scripts/build.sh). +We recommend to use the script because the ExecuTorch build-command can change from time to time. +The above script is actively used. It is updated more frquently than this tutorial. +An example usage is +```bash +cd $EXECUTORCH_ROOT +./backends/qualcomm/scripts/build.sh +# or +./backends/qualcomm/scripts/build.sh --release +``` ### AOT (Ahead-of-time) components: @@ -115,14 +124,24 @@ Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct b ```bash cd $EXECUTORCH_ROOT -# Workaround for fbs files in exir/_serialize -cp schema/program.fbs exir/_serialize/program.fbs -cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs - -mkdir build_x86_64 -cd build_x86_64 -cmake .. -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=${QNN_SDK_ROOT} -cmake --build . -t "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j8 +mkdir build-x86 +cd build-x86 +# Note that the below command might change. +# Please refer to the above build.sh for latest workable commands. +cmake .. \ + -DCMAKE_INSTALL_PREFIX=$PWD \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF + +# nproc is used to detect the number of available CPU. +# If it is not applicable, please feel free to use the number you want. +cmake --build $PWD --target "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j$(nproc) # install Python APIs to correct import path # The filename might vary depending on your Python and host version. 
@@ -138,49 +157,59 @@ Commands to build `qnn_executor_runner` for Android: ```bash cd $EXECUTORCH_ROOT -mkdir build_android -cd build_android +mkdir build-android +cd build-android # build executorch & qnn_executorch_backend cmake .. \ -DCMAKE_INSTALL_PREFIX=$PWD \ - -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DPYTHON_EXECUTABLE=python3 \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ -DANDROID_ABI='arm64-v8a' \ - -DANDROID_NATIVE_API_LEVEL=23 \ - -B$PWD + -DANDROID_NATIVE_API_LEVEL=23 -cmake --build $PWD -j16 --target install +# nproc is used to detect the number of available CPU. +# If it is not applicable, please feel free to use the number you want. +cmake --build $PWD --target install -j$(nproc) cmake ../examples/qualcomm \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ -DANDROID_ABI='arm64-v8a' \ -DANDROID_NATIVE_API_LEVEL=23 \ -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ + -DPYTHON_EXECUTABLE=python3 \ -Bexamples/qualcomm -cmake --build examples/qualcomm -j16 +cmake --build examples/qualcomm -j$(nproc) + +# qnn_executor_runner can be found under examples/qualcomm +# The full path is $EXECUTORCH_ROOT/build-android/examples/qualcomm/qnn_executor_runner +ls examples/qualcomm ``` **Note:** If you want to build for release, add `-DCMAKE_BUILD_TYPE=Release` to the `cmake` command options. -You can find `qnn_executor_runner` under `build_android/examples/qualcomm/`. - -The build script is also available [here](https://github.com/pytorch/executorch/blob/main/backends/qualcomm/scripts/build.sh). ## Deploying and running on device ### AOT compile a model -You can refer to [this script](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/deeplab_v3.py) for the exact flow. +Refer to [this script](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/deeplab_v3.py) for the exact flow. We use deeplab-v3-resnet101 as an example in this tutorial. Run below commands to compile: -``` +```bash cd $EXECUTORCH_ROOT -python -m examples.qualcomm.scripts.deeplab_v3 -b build_android -m SM8550 --compile_only --download +# Workaround for fbs files in exir/_serialize +cp schema/program.fbs exir/_serialize/program.fbs +cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs + +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --compile_only --download ``` You might see something like below: @@ -203,6 +232,58 @@ output output output ([getitem_ The compiled model is `./deeplab_v3/dlv3_qnn.pte`. +### Test model inference on QNN HTP emulator + +We can test model inferences before deploying it to a device by HTP emulator. + +Let's build `qnn_executor_runner` for a x64 host: +```bash +# assuming the AOT component is built. 
+cd $EXECUTORCH_ROOT/build-x86 +cmake ../examples/qualcomm \ + -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ + -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ + -DPYTHON_EXECUTABLE=python3 \ + -Bexamples/qualcomm + +cmake --build examples/qualcomm -j$(nproc) + +# qnn_executor_runner can be found under examples/qualcomm +# The full path is $EXECUTORCH_ROOT/build-x86/examples/qualcomm/qnn_executor_runner +ls examples/qualcomm/ +``` + +To run the HTP emulator, the dynamic linker need to access QNN libraries and `libqnn_executorch_backend.so`. +We set the below two paths to `LD_LIBRARY_PATH` environment variable: + 1. `$QNN_SDK_ROOT/lib/x86_64-linux-clang/` + 2. `$EXECUTORCH_ROOT/build-x86/lib/` + +The first path is for QNN libraries including HTP emulator. It has been configured in the AOT compilation section. + +The second path is for `libqnn_executorch_backend.so`. + +So, we can run `./deeplab_v3/dlv3_qnn.pte` by: +```bash +cd $EXECUTORCH_ROOT/build-x86 +export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/build-x86/lib/:$LD_LIBRARY_PATH +examples/qualcomm/qnn_executor_runner --model_path ../deeplab_v3/dlv3_qnn.pte +``` + +We should see some outputs like the below. Note that the emulator can take some time to finish. +```bash +I 00:00:00.354662 executorch:qnn_executor_runner.cpp:213] Method loaded. +I 00:00:00.356460 executorch:qnn_executor_runner.cpp:261] ignoring error from set_output_data_ptr(): 0x2 +I 00:00:00.357991 executorch:qnn_executor_runner.cpp:261] ignoring error from set_output_data_ptr(): 0x2 +I 00:00:00.357996 executorch:qnn_executor_runner.cpp:265] Inputs prepared. + +I 00:01:09.328144 executorch:qnn_executor_runner.cpp:414] Model executed successfully. +I 00:01:09.328159 executorch:qnn_executor_runner.cpp:421] Write etdump to etdump.etdp, Size = 424 +[INFO] [Qnn ExecuTorch]: Destroy Qnn backend parameters +[INFO] [Qnn ExecuTorch]: Destroy Qnn context +[INFO] [Qnn ExecuTorch]: Destroy Qnn device +[INFO] [Qnn ExecuTorch]: Destroy Qnn backend +``` + ### Run model inference on an Android smartphone with Qualcomm SoCs ***Step 1***. We need to push required QNN libraries to the device. @@ -212,11 +293,13 @@ The compiled model is `./deeplab_v3/dlv3_qnn.pte`. DEVICE_DIR=/data/local/tmp/executorch_qualcomm_tutorial/ adb shell "mkdir -p ${DEVICE_DIR}" adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} ``` ***Step 2***. 
We also need to indicate dynamic linkers on Android and Hexagon @@ -225,8 +308,8 @@ So, we can run `qnn_executor_runner` like ```bash adb push ./deeplab_v3/dlv3_qnn.pte ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/build_android/examples/qualcomm/qnn_executor_runner ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/build_android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} adb shell "cd ${DEVICE_DIR} \ && export LD_LIBRARY_PATH=${DEVICE_DIR} \ && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \ @@ -236,12 +319,28 @@ adb shell "cd ${DEVICE_DIR} \ You should see something like below: ``` -I 00:00:01.835706 executorch:qnn_executor_runner.cpp:298] 100 inference took 1096.626000 ms, avg 10.966260 ms -[INFO][Qnn ExecuTorch] Destroy Qnn backend parameters -[INFO][Qnn ExecuTorch] Destroy Qnn context -[INFO][Qnn ExecuTorch] Destroy Qnn device -[INFO][Qnn ExecuTorch] Destroy Qnn backend +I 00:00:00.257354 executorch:qnn_executor_runner.cpp:213] Method loaded. +I 00:00:00.323502 executorch:qnn_executor_runner.cpp:262] ignoring error from set_output_data_ptr(): 0x2 +I 00:00:00.357496 executorch:qnn_executor_runner.cpp:262] ignoring error from set_output_data_ptr(): 0x2 +I 00:00:00.357555 executorch:qnn_executor_runner.cpp:265] Inputs prepared. +I 00:00:00.364824 executorch:qnn_executor_runner.cpp:414] Model executed successfully. +I 00:00:00.364875 executorch:qnn_executor_runner.cpp:425] Write etdump to etdump.etdp, Size = 424 +[INFO] [Qnn ExecuTorch]: Destroy Qnn backend parameters +[INFO] [Qnn ExecuTorch]: Destroy Qnn context +[INFO] [Qnn ExecuTorch]: Destroy Qnn backend +``` + +The model is merely executed. If we want to feed real inputs and get model outputs, we can use +```bash +cd $EXECUTORCH_ROOT +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --download -s ``` +The `` can be found by `adb devices` command. + +After the above command, pre-processed inputs and outputs are put in `$EXECUTORCH_ROOT/deeplab_v3` and `$EXECUTORCH_ROOT/deeplab_v3/outputs` folder. + +The command-line arguents are written in [utils.py](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/utils.py#L127). +The model, inputs, and output location are passed to `qnn_executorch_runner` by `--model_path`, `--input_list_path`, and `--output_folder_path`. ### Running a model via ExecuTorch's android demo-app @@ -249,11 +348,14 @@ I 00:00:01.835706 executorch:qnn_executor_runner.cpp:298] 100 inference took 109 An Android demo-app using Qualcomm AI Engine Direct Backend can be found in `examples`. Please refer to android demo app [tutorial](https://pytorch.org/executorch/stable/demo-apps-android.html). +## Supported model list + +Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `EXECUTORCH_ROOT/examples/qualcomm/oss_scripts/` to the list of supported models. ## What is coming? - - [An example using quantized mobilebert](https://github.com/pytorch/executorch/pull/1043) to solve multi-class text classification. - - More Qualcomm AI Engine Direct accelerators, e.g., GPU. + - Improve the performance for llama3-8B-Instruct and support batch prefill. + - We will support pre-compiled binaries from [Qualcomm AI Hub](https://aihub.qualcomm.com/). 
## FAQ diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md index 52a515aa7f..c82af7d98f 100644 --- a/docs/source/compiler-delegate-and-partitioner.md +++ b/docs/source/compiler-delegate-and-partitioner.md @@ -87,22 +87,22 @@ function which will be called when the program is out of its lifespan. ```cpp // Runtime check -__ET_NODISCARD bool is_available(); +ET_NODISCARD bool is_available(); // Runtime initialization -__ET_NODISCARD virtual Result init( +ET_NODISCARD virtual Result init( BackendInitContext& context, FreeableBuffer* processed, ArrayRef compile_specs); // Runtime execution -__ET_NODISCARD virtual Error execute( +ET_NODISCARD virtual Error execute( BackendExecutionContext& context, DelegateHandle* handle, EValue** args); // [optional] Runtime destroy. Destroy the resource held by the backend -virtual void destroy(__ET_UNUSED DelegateHandle* handle); +virtual void destroy(ET_UNUSED DelegateHandle* handle); ``` The diagram looks like following @@ -114,7 +114,7 @@ The diagram looks like following In order to make backend available to ExecuTorch runtime, it must be registered via the `register_backend` API: ```cpp -__ET_NODISCARD Error register_backend(const Backend& backend); +ET_NODISCARD Error register_backend(const Backend& backend); ``` Static registeration, i.e., at libraray init or load time, of a backend can be achieved as follows: @@ -127,13 +127,13 @@ static auto success_with_compiler = register_backend(backend); ``` -## SDK Integration: Debuggability +## Developer Tools Integration: Debuggability -Providing consistent debugging experience, be it for runtime failures or performance profiling, is important. ExecuTorch employs native SDK (Software Development Kit) for this purpose, which enables correlating program instructions to original PyTorch code, via debug handles. You can read more about it [here](./sdk-etrecord). +Providing consistent debugging experience, be it for runtime failures or performance profiling, is important. ExecuTorch employs native Developer Tools for this purpose, which enables correlating program instructions to original PyTorch code, via debug handles. You can read more about it [here](./sdk-etrecord). -Delegated program or subgraphs are opaque to ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delgates, native SDK does not have visibility into delegated program. Thus the debugging, functional or performance, experiences of delegated execution suffers significantly as compared to it's non-delegated counterpart. +Delegated program or subgraphs are opaque to ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delgates, native Developer Tools does not have visibility into delegated program. Thus the debugging, functional or performance, experiences of delegated execution suffers significantly as compared to it's non-delegated counterpart. -In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, SDK provides an interface to correlate delegated (sub)graph to original (sub)graph. The SDK does so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. 
Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [SDK delegate integration](./sdk-delegate-integration). +In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, Developer Tools provide an interface to correlate delegated (sub)graph to original (sub)graph. The Developer Tools do so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [Developer Tools Delegate Integration](./sdk-delegate-integration). By leveraging the debug identifier, backend developer can embed the debug as part of the delegated blob diff --git a/docs/source/compiler-memory-planning.md b/docs/source/compiler-memory-planning.md index 1dad3b032f..fcad2eca58 100644 --- a/docs/source/compiler-memory-planning.md +++ b/docs/source/compiler-memory-planning.md @@ -32,7 +32,6 @@ The `MemoryPlanningPass` exposes the option to not memory plan program inputs an program = edge_program.to_executorch( exir.ExecutorchBackendConfig( memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=False, # Inputs will not be memory planned, the data_ptr for input tensors after model load will be nullptr alloc_graph_output=True, # Outputs will be memory planned, the data_ptr for input tensors after model load will be in the `planned_memory`. ) @@ -77,7 +76,7 @@ Then later when lowering to ExecuTorch you can use your custom plan in the follo program = edge_program.to_executorch( exir.ExecutorchBackendConfig( memory_planning_pass=CustomPoolMemoryPlanningPass( - memory_planning_algo="greedy", + memory_planning_algo=greedy, ) ) ) diff --git a/docs/source/concepts.md b/docs/source/concepts.md index 33d944c376..c085505b61 100644 --- a/docs/source/concepts.md +++ b/docs/source/concepts.md @@ -283,9 +283,9 @@ Techniques for performing computations and memory accesses on tensors with lower The ExecuTorch runtime executes models on edge devices. It is responsible for program initialization, program execution and, optionally, destruction (releasing backend owned resources). -## [SDK](./sdk-overview.md) +## [Developer Tools](./devtools-overview.md) -Software Development Kit. The tooling users need to profile, debug and visualize programs that are running with ExecuTorch. +A collection of tools users need to profile, debug and visualize programs that are running with ExecuTorch. ## [Selective build](./kernel-library-selective-build.md) diff --git a/docs/source/devtools-overview.md b/docs/source/devtools-overview.md new file mode 100644 index 0000000000..13fd8e0059 --- /dev/null +++ b/docs/source/devtools-overview.md @@ -0,0 +1,44 @@ +# Introduction to the ExecuTorch Developer Tools + +ExecuTorch has been designed with [productivity](./intro-overview.md) as one of its core objectives and the ExecuTorch Developer Tools enable this through the comprehensive suite of tools it provides users to help them profile, debug, and visualize models that they have onboarded onto ExecuTorch. 
+ +All the components of the Developer Tools have been designed from the ground up with deep integration in both the export process and the runtime. This enables us to provide unique features such as linking back operator execution in the runtime to the line of code in the original eager model that this operator originated from. + +## Developer Tools Features + +The ExecuTorch Developer Tools support the following features: + +- **BundledProgram** is a utility tool for exporting the model bundled with a sample set of (representative) inputs and expected outputs, so that during runtime users can validate that the actual output is in fact the same as the expected output. +- **Profiling** models with operator level breakdown of performance stats + - Linking back operator performance stats to source code and module hierarchy + - Model loading and execution time +- **Delegate Integration** - Surfacing performance details from delegate backends + - Link back delegate operator execution to the nodes they represent in the edge dialect graph (and subsequently linking back to source code and module hierarchy) +- **Debugging** - Intermediate outputs and output quality analysis +- **Visualization** - Coming soon + +## Fundamental components of the Developer Tools + +In order to fully understand and leverage the power of the Developer Tools in this section, the fundamental components that power the Developer Tools will be detailed. + +### ETRecord +ETRecord (ExecuTorch Record) is an artifact generated during the export process that stores the graphs and other metadata that is critical for the Developer Tools to be able to link back the performance/debug data sourced from the runtime to the source code of the eager model. + +To draw a rough equivalence to conventional software development ETRecord can be considered as the binary built with debug symbols that is used for debugging in GNU Project debugger (gdb). + +More details are available in the [ETRecord documentation](sdk-etrecord.rst) on how to generate and store an ETRecord. + +### ETDump +ETDump (ExecuTorch Dump) is the binary blob that is generated by the runtime after running a model. Similarly as above, to draw a rough equivalence to conventional software development, ETDump can be considered as the coredump of ExecuTorch, but in this case within ETDump we store all the performance and debug data that was generated by the runtime during model execution. + +```{note} +If you only care about looking at the raw performance data without linking back to source code and other extensive features, an ETDump alone will be enough to leverage the basic features of the Developer Tools. For the full experience, it is recommended that the users also generate an ETRecord. +``` + +More details are available in the [ETDump documentation](sdk-etdump.md) on how to generate and store an ETDump from the runtime. + + +### Inspector APIs +The Inspector Python APIs are the main user enrty point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. + +More details are available in the [Inspector API documentation](sdk-inspector.rst) on how to use the Inspector APIs. 
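+
+As a rough sketch of how these pieces come together (the file paths below are placeholders; refer to the Inspector documentation linked above for the authoritative API):
+
+```python
+from executorch.devtools import Inspector
+
+# Placeholder paths: the ETDump written by the runtime and the ETRecord
+# saved during export.
+inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")
+
+# Print a tabular, per-operator view of the collected performance data.
+inspector.print_data_tabular()
+```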
diff --git a/docs/source/devtools-tutorial.md b/docs/source/devtools-tutorial.md new file mode 100644 index 0000000000..33d78cf58d --- /dev/null +++ b/docs/source/devtools-tutorial.md @@ -0,0 +1,3 @@ +## Developer Tools Usage Tutorial + +Please refer to the [Developer Tools tutorial](./tutorials/devtools-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools. diff --git a/docs/source/executorch-runtime-api-reference.rst b/docs/source/executorch-runtime-api-reference.rst index 20dbc631f2..008030b84d 100644 --- a/docs/source/executorch-runtime-api-reference.rst +++ b/docs/source/executorch-runtime-api-reference.rst @@ -6,6 +6,8 @@ The ExecuTorch C++ API provides an on-device execution framework for exported Py For a tutorial style introduction to the runtime API, check out the `runtime tutorial `__ and its `simplified `__ version. +For detailed information on how APIs evolve and the deprecation process, please refer to the `ExecuTorch API Life Cycle and Deprecation Policy `__. + Model Loading and Execution --------------------------- diff --git a/docs/source/export-to-executorch-api-reference.rst b/docs/source/export-to-executorch-api-reference.rst index 2150ac7f8c..5560e75e21 100644 --- a/docs/source/export-to-executorch-api-reference.rst +++ b/docs/source/export-to-executorch-api-reference.rst @@ -1,6 +1,8 @@ Export to ExecuTorch API Reference ---------------------------------- +For detailed information on how APIs evolve and the deprecation process, please refer to the `ExecuTorch API Life Cycle and Deprecation Policy `__. + .. automodule:: executorch.exir .. autofunction:: to_edge diff --git a/docs/source/extension-module.md b/docs/source/extension-module.md index 9e236e8e48..7516184d1c 100644 --- a/docs/source/extension-module.md +++ b/docs/source/extension-module.md @@ -22,7 +22,7 @@ Tensor::SizesType sizes[] = {1, 3, 256, 256}; TensorImpl tensor(ScalarType::Float, std::size(sizes), sizes, input); // Perform an inference. -const auto result = module.forward({EValue(Tensor(&tensor))}); +const auto result = module.forward(Tensor(&tensor)); // Check for success or failure. if (result.ok()) { @@ -105,13 +105,13 @@ Note: `method_meta()` will try to force-load the `Method` when called for the fi Assuming that the `Program`'s method names and their input format is known ahead of time, we rarely need to query for those and can run the methods directly by name using the `execute()` function: ```cpp -const auto result = module.execute("forward", {EValue(Tensor(&tensor))}); +const auto result = module.execute("forward", Tensor(&tensor)); ``` Which can also be simplified for the standard `forward()` method name as: ```cpp -const auto result = module.forward({EValue(Tensor(&tensor))}); +const auto result = module.forward(Tensor(&tensor)); ``` Note: `execute()` or `forward()` will try to force load the `Program` and the `Method` when called for the first time. Therefore, the first inference will take more time than subsequent ones as it loads the model lazily and prepares it for execution unless the `Program` or `Method` was loaded explicitly earlier using the corresponding functions. @@ -132,7 +132,7 @@ Use [ExecuTorch Dump](sdk-etdump.md) to trace model execution. 
Create an instanc #include #include #include -#include +#include using namespace ::torch::executor; diff --git a/docs/source/getting-started-architecture.md b/docs/source/getting-started-architecture.md index 2c3f85aff1..937b5b389f 100644 --- a/docs/source/getting-started-architecture.md +++ b/docs/source/getting-started-architecture.md @@ -87,8 +87,8 @@ The ExecuTorch runtime is written in C++ with minimal dependencies for portabili _Executor_ is the entry point to load the program and execute it. The execution triggers corresponding operator kernels or backend execution from this very minimal runtime. -## SDK +## Developer Tools -It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch SDK](./sdk-overview.md) to improve productivity. The SDK is not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. +It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](./devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. -During the program preparation and execution, users can use the ExecuTorch SDK to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. +During the program preparation and execution, users can use the ExecuTorch Developer Tools to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md index d610f020ef..15fa084e33 100644 --- a/docs/source/getting-started-setup.md +++ b/docs/source/getting-started-setup.md @@ -59,13 +59,11 @@ also work in similar environments. - We recommend `conda` as it provides cross-language support and integrates smoothly with `pip` (Python's built-in package manager) - Otherwise, Python's built-in virtual environment manager `python venv` is a good alternative. -* `g++` version 8 or higher, `clang++` version 8 or higher, or another - C++17-compatible toolchain that supports GNU C-style [statement - expressions](https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html) (`({ ... - })` syntax). +* `g++` version 7 or higher, `clang++` version 5 or higher, or another + C++17-compatible toolchain. Note that the cross-compilable core runtime code supports a wider range of -toolchains, down to C++11. See the [Runtime Overview](./runtime-overview.md) for +toolchains, down to C++17. See the [Runtime Overview](./runtime-overview.md) for portability details. 
## Quick Setup: Colab/Jupyter Notebook Prototype @@ -112,6 +110,23 @@ Alternatively, if you would like to experiment with ExecuTorch quickly and easil ``` After setting up your environment, you are ready to convert your PyTorch programs to ExecuTorch. + +> **_NOTE:_** Cleaning the build system +> +> When fetching a new version of the upstream repo (via `git fetch` or `git +> pull`) it is a good idea to clean the old build artifacts. The build system +> does not currently adapt well to changes in build dependencies. +> +> You should also update and pull the submodules again, in case their versions +> have changed. +> +> ```bash +> # From the root of the executorch repo: +> rm -rf cmake-out pip-out +> git submodule sync +> git submodule update --init +> ``` + ## Create an ExecuTorch program After setting up your environment, you are ready to convert your PyTorch programs @@ -171,13 +186,30 @@ For now, let's use [`executor_runner`](https://github.com/pytorch/executorch/blo ### Build Tooling Setup The ExecuTorch repo uses CMake to build its C++ code. Here, we'll configure it to build the `executor_runner` tool to run it on our desktop OS. ```bash - # Clean and configure the CMake build system. Compiled programs will appear in the executorch/cmake-out directory we create here. + # Clean and configure the CMake build system. Compiled programs will + # appear in the executorch/cmake-out directory we create here. (rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..) # Build the executor_runner target cmake --build cmake-out --target executor_runner -j9 ``` +> **_NOTE:_** Cleaning the build system +> +> When fetching a new version of the upstream repo (via `git fetch` or `git +> pull`) it is a good idea to clean the old build artifacts. The build system +> does not currently adapt well to changes in build dependencies. +> +> You should also update and pull the submodules again, in case their versions +> have changed. +> +> ```bash +> # From the root of the executorch repo: +> rm -rf cmake-out pip-out +> git submodule sync +> git submodule update --init +> ``` + ### Run Your Program Now that we've exported a program and built the runtime, let's execute it! diff --git a/docs/source/index.rst b/docs/source/index.rst index 57dc2c4bcc..7379c35bd2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -94,7 +94,7 @@ Topics in this section will help you get started with ExecuTorch. tutorials/export-to-executorch-tutorial running-a-model-cpp-tutorial extension-module - tutorials/sdk-integration-tutorial + tutorials/devtools-integration-tutorial apple-runtime demo-apps-ios demo-apps-android @@ -126,6 +126,7 @@ Topics in this section will help you get started with ExecuTorch. export-to-executorch-api-reference executorch-runtime-api-reference + api-life-cycle .. toctree:: :glob: @@ -192,10 +193,10 @@ Topics in this section will help you get started with ExecuTorch. .. toctree:: :glob: :maxdepth: 1 - :caption: SDK + :caption: Developer Tools :hidden: - sdk-overview + devtools-overview sdk-bundled-io sdk-etrecord sdk-etdump @@ -203,7 +204,7 @@ Topics in this section will help you get started with ExecuTorch. sdk-debugging sdk-inspector sdk-delegate-integration - sdk-tutorial + devtools-tutorial .. toctree:: :glob: @@ -243,11 +244,11 @@ ExecuTorch tutorials. :tags: .. customcarditem:: - :header: Using the ExecuTorch SDK to Profile a Model - :card_description: A tutorial for using the ExecuTorch SDK to profile and analyze a model with linkage back to source code. 
+ :header: Using the ExecuTorch Developer Tools to Profile a Model + :card_description: A tutorial for using the ExecuTorch Developer Tools to profile and analyze a model with linkage back to source code. :image: _static/img/generic-pytorch-logo.png - :link: tutorials/sdk-integration-tutorial.html - :tags: SDK + :link: tutorials/devtools-integration-tutorial.html + :tags: devtools .. customcarditem:: :header: Integrating and Running ExecuTorch on Apple Platforms diff --git a/docs/source/intro-overview.md b/docs/source/intro-overview.md index f80caff467..96c7982b8f 100644 --- a/docs/source/intro-overview.md +++ b/docs/source/intro-overview.md @@ -10,9 +10,9 @@ Key value propositions of ExecuTorch are: - **Portability:** Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and SDK from - PyTorch model authoring and conversion, to debugging and deployment to a wide - variety of platforms. +- **Productivity:** Enabling developers to use the same toolchains and Developer + Tools from PyTorch model authoring and conversion, to debugging and deployment + to a wide variety of platforms. - **Performance:** Providing end users with a seamless and high-performance experience due to a lightweight runtime and utilizing full hardware capabilities such as CPUs, NPUs, and DSPs. diff --git a/docs/source/kernel-library-custom-aten-kernel.md b/docs/source/kernel-library-custom-aten-kernel.md index 8fb4ed96cd..0f060d1c5e 100644 --- a/docs/source/kernel-library-custom-aten-kernel.md +++ b/docs/source/kernel-library-custom-aten-kernel.md @@ -3,23 +3,49 @@ At the last stage of [ExecuTorch model exporting](./export-overview.md), we lower the operators in the dialect to the _out variants_ of the [core ATen operators](./ir-ops-set-definition.md). Then we serialize these operator names into the model artifact. During runtime execution, for each operator name we will need to find the actual _kernels_, i.e., the C++ functions that do the heavy-lifting calculations and return results. -Portable kernel library is the in-house default kernel library, it’s easy to use and portable for most of the target backends. However it’s not optimized for performance, because it’s not specialized for any certain target. Therefore we provide kernel registration APIs for ExecuTorch users to easily register their own optimized kernels. +## Kernel Libraries +### First-party kernel libraries: +**[Portable kernel library](https://github.com/pytorch/executorch/tree/main/kernels/portable)** is the in-house default kernel library that covers most of the core ATen operators. It’s easy to use/read and is written in portable C++17. However it’s not optimized for performance, because it’s not specialized for any certain target. Therefore we provide kernel registration APIs for ExecuTorch users to easily register their own optimized kernels. -## Design Principles +**[Optimized kernel library](https://github.com/pytorch/executorch/tree/main/kernels/optimized)** specializes on performance for some of the operators, leveraging existing third party libraries such as [EigenBLAS](https://gitlab.com/libeigen/eigen). This works best along with the portable kernel library, with a good balance on portability and performance. One example of combining these two libraries can be found [here](https://github.com/pytorch/executorch/blob/main/configurations/CMakeLists.txt). 
-**What do we support?** On the operator coverage side, the kernel registration APIs allow users to register kernels for all core ATen ops as well as custom ops, as long as the custom ops schemas are specified. +**[Quantized kernel library](https://github.com/pytorch/executorch/tree/main/kernels/quantized)** implements operators for quantization and dequantization. These are out of core ATen operators but are vital to most of the production use cases. -Notice that we also support _partial kernels, _for example the kernel only supports a subset of tensor dtypes and/or dim orders. +### Custom kernel libraries: -**Kernel contract**: kernels need to comply with the following requirements: +**Custom kernels implementing core ATen ops**. Even though we don't have an internal example for custom kernels for core ATen ops, the optimized kernel library can be viewed as a good example. We have optimized [`add.out`](https://github.com/pytorch/executorch/blob/main/kernels/optimized/cpu/op_add.cpp) and a portable [`add.out`](https://github.com/pytorch/executorch/blob/main/kernels/portable/cpu/op_add.cpp). When user is combining these two libraries, we provide APIs to choose which kernel to use for `add.out`. In order to author and use custom kernels implementing core ATen ops, using the [YAML based approach](#yaml-entry-for-core-aten-op-out-variant) is recommended, because it provides full fledged support on + 1. combining kernel libraries and define fallback kernels; + 2. using selective build to minimize the kernel size. + +A **[Custom operator](https://github.com/pytorch/executorch/tree/main/extension/llm/custom_ops)** is any operator that an ExecuTorch user defines outside of PyTorch's [`native_functions.yaml`](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml). + +## Operator & Kernel Contract + +All the kernels mentioned above, whether they are in-house or customized, should comply with the following requirements: * Match the calling convention derived from operator schema. The kernel registration API will generate headers for the custom kernels as references. -* Satisfy the dtype constraints defined in edge dialect. For tensors with certain dtypes as arguments, the result of a custom kernel needs to match the expected dtypes. The constraints are available in edge dialect ops. -* Gives correct result. We will provide a testing framework to automatically test the custom kernels. +* Satisfy the dtype constraints defined in edge dialect. For tensors with certain dtypes as arguments, the result of a custom kernel needs to match the expected dtypes. The constraints are available in edge dialect ops. +* Give correct result. We will provide a testing framework to automatically test the custom kernels. + + +## APIs + +These are the APIs available to register kernels/custom kernels/custom ops into ExecuTorch: + +* [YAML Entry API](#yaml-entry-api-high-level-architecture) + - [for core ATen op with custom kernels](#yaml-entry-api-for-core-aten-op-out-variant) + - [for custom ops](#yaml-entry-api-for-custom-ops) + - [CMake Macros](#cmake-macros) +* C++ API + - [for custom ops](#c-api-for-custom-ops) + - [CMake Example](#compile-and-link-the-custom-kernel) + +If it's not clear which API to use, please see [Best Practices](#custom-ops-api-best-practices). + -## High Level Architecture +### YAML Entry API High Level Architecture ![](./_static/img/kernel-library-custom-aten-kernel.png) @@ -27,10 +53,10 @@ ExecuTorch users are asked to provide: 1. 
the custom kernel library with C++ implementations -2. a yaml file associated with the library that describes what operators are being implemented by this library. For partial kernels, the yaml file also contains information on the dtypes and dim orders supported by the kernel. More details in the API section. +2. a YAML file associated with the library that describes what operators are being implemented by this library. For partial kernels, the yaml file also contains information on the dtypes and dim orders supported by the kernel. More details in the API section. -### Workflow +### YAML Entry API Workflow At build time, the yaml files associated with kernel libraries will be passed to the _kernel resolver_ along with the model op info (see selective build doc) and the outcome is a mapping between a combination of operator names and tensor metadata, to kernel symbols. Then codegen tools will use this mapping to generate C++ bindings that connect the kernels to ExecuTorch runtime. ExecuTorch users need to link this generated library into their application to use these kernels. @@ -38,18 +64,10 @@ At static object initialization time, kernels will be registered into the ExecuT At runtime initialization stage, ExecuTorch will use the operator name and argument metadata as a key to lookup for the kernels. For example, with “aten::add.out” and inputs being float tensors with dim order (0, 1, 2, 3), ExecuTorch will go into the kernel registry and lookup for a kernel that matches the name and the input metadata. - -## APIs - -There are two sets of APIs: yaml files that describe kernel - operator mappings and codegen tools to consume these mappings. - - -### Yaml Entry for Core ATen Op Out Variant +### YAML Entry API for Core ATen Op Out Variant Top level attributes: - - * `op` (if the operator appears in `native_functions.yaml`) or `func` for custom operator. The value for this key needs to be the full operator name (including overload name) for `op` key, or a full operator schema (namespace, operator name, operator overload name and schema string), if we are describing a custom operator. For schema syntax please refer to this [instruction](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md). * `kernels`: defines kernel information. It consists of `arg_meta` and `kernel_name`, which are bound together to describe "for input tensors with these metadata, use this kernel". * `type_alias`(optional): we are giving aliases to possible dtype options. `T0: [Double, Float]` means `T0` can be one of `Double` or `Float`. @@ -86,86 +104,9 @@ ATen operator with a dtype/dim order specialized kernel (works for `Double` dtyp kernel_name: torch::executor::add_out ``` -### Custom Ops C++ API - -For a custom kernel that implements a custom operator, we provides 2 ways to register it into ExecuTorch runtime: -1. Using `EXECUTORCH_LIBRARY` and `WRAP_TO_ATEN` C++ macros, covered by this section. -2. Using `functions.yaml` and codegen'd C++ libraries, covered by [next section](#custom-ops-yaml-entry). - -Please refer to [Custom Ops Best Practices](#custom-ops-api-best-practices) on which API to use. - -The first option requires C++17 and doesn't have selective build support yet, but it's faster than the second option where we have to go through yaml authoring and build system tweaking. - -The first option is particularly suitable for fast prototyping but can also be used in production. 
- -Similar to `TORCH_LIBRARY`, `EXECUTORCH_LIBRARY` takes the operator name and the C++ function name and register them into ExecuTorch runtime. - -#### Prepare custom kernel implementation - -Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see native_functions.yaml). For example: - -```yaml -custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor -custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) -``` - -Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: - - -```c++ -// custom_linear.h/custom_linear.cpp -#include -Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { - // calculation - return out; -} -``` -#### Use a C++ macro to register it into PyTorch & ExecuTorch - -Append the following line in the example above: -```c++ -// custom_linear.h/custom_linear.cpp -// opset namespace myop -EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); -``` - -Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. Create a separate .cpp for this purpose: - -```c++ -// custom_linear_pytorch.cpp -#include "custom_linear.h" -#include - -at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { - // initialize out - at::Tensor out = at::empty({weight.size(1), input.size(1)}); - // wrap kernel in custom_linear.cpp into ATen kernel - WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); - return out; -} -// standard API to register ops into PyTorch -TORCH_LIBRARY(myop, m) { - m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); - m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); -} -``` - -#### Compile and link the custom kernel - -Link it into ExecuTorch runtime: In our `CMakeLists.txt`` that builds the binary/application, we just need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. - -Link it into PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: - -```python -import torch -torch.ops.load_library("libcustom_linear.so/dylib") - -# Now we have access to the custom op, backed by kernel implemented in custom_linear.cpp. -op = torch.ops.myop.custom_linear.default -``` -### Custom Ops Yaml Entry +### YAML Entry API for Custom Ops As mentioned above, this option provides more support in terms of selective build and features such as merging operator libraries. @@ -215,14 +156,11 @@ ExecuTorch does not support all of the argument types that core PyTorch supports * List> * Optional> - -### Build Tool Macros +#### CMake Macros We provide build time macros to help users to build their kernel registration library. The macro takes the yaml file describing the kernel library as well as model operator metadata, and packages the generated C++ bindings into a C++ library. The macro is available on CMake. 
-#### CMake - `generate_bindings_for_kernels(FUNCTIONS_YAML functions_yaml CUSTOM_OPS_YAML custom_ops_yaml)` takes a yaml file for core ATen op out variants and also a yaml file for custom ops, generate C++ bindings for kernel registration. It also depends on the selective build artifact generated by `gen_selected_ops()`, see selective build doc for more information. Then `gen_operators_lib` will package those bindings to be a C++ library. As an example: ```cmake # SELECT_OPS_LIST: aten::add.out,aten::mm.out @@ -263,6 +201,103 @@ And out fallback: The merged yaml will have the entry in functions.yaml. +### C++ API for Custom Ops + +Unlike the YAML entry API, the C++ API only uses C++ macros `EXECUTORCH_LIBRARY` and `WRAP_TO_ATEN` for kernel registration, also without selective build support. It makes this API faster in terms of development speed, since users don't have to do YAML authoring and build system tweaking. + +Please refer to [Custom Ops Best Practices](#custom-ops-api-best-practices) on which API to use. + +Similar to [`TORCH_LIBRARY`](https://pytorch.org/cppdocs/library.html#library_8h_1a0bd5fb09d25dfb58e750d712fc5afb84) in PyTorch, `EXECUTORCH_LIBRARY` takes the operator name and the C++ function name and register them into ExecuTorch runtime. + +#### Prepare custom kernel implementation + +Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see `native_functions.yaml`). For example: + +```yaml +custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor +custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) +``` + +Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: + + +```c++ +// custom_linear.h/custom_linear.cpp +#include +Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { + // calculation + return out; +} +``` +#### Use a C++ macro to register it into ExecuTorch + +Append the following line in the example above: +```c++ +// custom_linear.h/custom_linear.cpp +// opset namespace myop +EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); +``` + +Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. Create a separate .cpp for this purpose: + +```c++ +// custom_linear_pytorch.cpp +#include "custom_linear.h" +#include + +at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { + // initialize out + at::Tensor out = at::empty({weight.size(1), input.size(1)}); + // wrap kernel in custom_linear.cpp into ATen kernel + WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); + return out; +} +// standard API to register ops into PyTorch +TORCH_LIBRARY(myop, m) { + m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); + m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); +} +``` + +#### Compile and link the custom kernel + +Link it into ExecuTorch runtime: In our `CMakeLists.txt` that builds the binary/application, we need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. 
+ +Here's an example to do it: + +```cmake +# For target_link_options_shared_lib +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +# Add a custom op library +add_library(custom_op_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/custom_op.cpp) + +# Include the header +target_include_directory(custom_op_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + +# Link ExecuTorch library +target_link_libraries(custom_op_lib PUBLIC executorch) + +# Define a binary target +add_executable(custom_op_runner PUBLIC main.cpp) + +# Link this library with --whole-archive !! IMPORTANT !! this is to avoid the operators being stripped by linker +target_link_options_shared_lib(custom_op_lib) + +# Link custom op lib +target_link_libraries(custom_op_runner PUBLIC custom_op_lib) + +``` + +Link it into the PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: + +```python +import torch +torch.ops.load_library("libcustom_linear.so/dylib") + +# Now we have access to the custom op, backed by kernel implemented in custom_linear.cpp. +op = torch.ops.myop.custom_linear.default +``` ### Custom Ops API Best Practices diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md new file mode 100644 index 0000000000..ac95fb21bd --- /dev/null +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -0,0 +1,128 @@ +# Building and Running Llama 3 8B Instruct with Qualcomm AI Engine Direct Backend + +This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Engine Direct Backend and running the model on a Qualcomm device. + +## Prerequisites + +- Set up your ExecuTorch repo and environment if you haven’t done so by following [the Setting up ExecuTorch](../getting-started-setup.md) to set up the repo and dev environment. +- Read [the Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend page](../build-run-qualcomm-ai-engine-direct-backend.md) to understand how to export and run a model with Qualcomm AI Engine Direct Backend on Qualcomm device. +- Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama2) to know how to run a llama model on mobile via ExecuTorch. +- A Qualcomm device with 16GB RAM + - We are continuing to optimize our memory usage to ensure compatibility with lower memory devices. +- The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.26.0 or above. + +## Instructions + +### Step1: Prepare the checkpoint of the model and optimized matrix from [Spin Quant](https://github.com/facebookresearch/SpinQuant) + +1. For Llama 3 tokenizer and checkpoint, please refer to https://github.com/meta-llama/llama-models/blob/main/README.md for further instructions on how to download `tokenizer.model`, `consolidated.00.pth` and `params.json`. +2. To get the optimized matrix, please refer to [SpinQuant on GitHub](https://github.com/facebookresearch/SpinQuant). You can download the optimized rotation matrices in the Quantized Models section. Please choose **LLaMA-3-8B/8B_W4A16KV16_lr_1.5_seed_0**. + +### Step2: Export to ExecuTorch with Qualcomm AI Engine Direct Backend +Deploying large language models like Llama 3 on-device presents the following challenges: + +1. 
The model size is too large to fit in device memory for inference. +2. High model loading and inference time. +3. Difficulty in quantization. + +To address these challenges, we have implemented the following solutions: +1. Using `--pt2e_quantize qnn_16a4w` to quantize activations and weights, thereby reducing the on-disk model size and alleviating memory pressure during inference. +2. Using `--num_sharding 8` to shard the model into sub-parts. +3. Performing graph transformations to convert or decompose operations into more accelerator-friendly operations. +4. Using `--optimized_rotation_path ` to apply R1 and R2 of [Spin Quant](https://github.com/facebookresearch/SpinQuant) to improve accuracy. +5. Using `--calibration_data "<|start_header_id|>system<|end_header_id|..."` to ensure that during the quantization of Llama 3 8B instruct, the calibration includes special tokens in the prompt template. For more details on the prompt template, refer to [the model card of meta llama3 instruct](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/). + +To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure the following: + +1. The host machine has more than 100GB of memory (RAM + swap space). +2. The entire process takes a few hours. + +```bash +# Please note that calibration_data must include the prompt template for special tokens. +python -m examples.models.llama2.export_llama -t +llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +``` + +### Step3: Invoke the Runtime on an Android smartphone with Qualcomm SoCs +1. Build executorch with Qualcomm AI Engine Direct Backend for android + ```bash + cmake \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake" \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-android-out . + + cmake --build cmake-android-out -j16 --target install --config Release + ``` +2. Build llama runner for android +```bash + cmake \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}"/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=python \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-android-out/examples/models/llama2 examples/models/llama2 + + cmake --build cmake-android-out/examples/models/llama2 -j16 --config Release +``` +3. 
Run on Android via adb shell +*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone + +**3.1 Connect your android phone** + +**3.2 We need to push required QNN libraries to the device.** +```bash +# make sure you have write-permission on below path. +DEVICE_DIR=/data/local/tmp/llama +adb shell mkdir -p ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} +``` + +**3.3 Upload model, tokenizer and llama runner binary to phone** +```bash +adb push ${DEVICE_DIR} +adb push ${DEVICE_DIR} +adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR} +adb push cmake-out-android/examples/models/llama2/llama_main ${DEVICE_DIR} +``` + +**3.4 Run model** +```bash +adb shell "cd ${DEVICE_DIR} && ./llama_main --model_path --tokenizer_path --prompt \"<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n\" --seq_len 128" +``` +You should see the message: +``` +<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello! I'd be delighted to chat with you about Facebook. Facebook is a social media platform that was created in 2004 by Mark Zuckerberg and his colleagues while he was a student at Harvard University. It was initially called "Facemaker" but later changed to Facebook, which is a combination of the words "face" and "book". The platform was initially intended for people to share their thoughts and share information with their friends, but it quickly grew to become one of the +``` + +## What is coming? 
+- Improve the performance for Llama 3 Instruct +- Reduce the memory pressure during inference to support 12GB Qualcomm devices +- Support more LLMs + +## FAQ + +If you encounter any issues while reproducing the tutorial, please file a github +issue on ExecuTorch repo and tag use `#qcom_aisw` tag diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 5fffb7e8ca..771bf489a9 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -198,25 +198,21 @@ Create a file called main.cpp with the following contents: // main.cpp #include -#include -#include -#include -#include "basic_tokenizer.h" #include "basic_sampler.h" -#include "managed_tensor.h" +#include "basic_tokenizer.h" #include -#include +#include +#include #include -#include -#include - -using namespace torch::executor; +#include -using SizesType = exec_aten::SizesType; -using DimOrderType = exec_aten::DimOrderType; -using StridesType = exec_aten::StridesType; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::extension::Module; +using executorch::runtime::EValue; +using executorch::runtime::Result; ``` The model inputs and outputs take the form of tensors. A tensor can be thought of as an multi-dimensional array. @@ -248,14 +244,13 @@ std::string generate( for (auto i = 0u; i < max_output_length; i++) { // Convert the input_tokens from a vector of int64_t to EValue. // EValue is a unified data type in the ExecuTorch runtime. - ManagedTensor tensor_tokens( + auto inputs = from_blob( input_tokens.data(), {1, static_cast(input_tokens.size())}, ScalarType::Long); - std::vector inputs = {tensor_tokens.get_tensor()}; // Run the model. It will return a tensor of logits (log-probabilities). - Result> logits_evalue = llm_model.forward(inputs); + auto logits_evalue = llm_model.forward(inputs); // Convert the output logits from EValue to std::vector, which is what // the sampler expects. @@ -343,7 +338,6 @@ Finally, download the following files into the same directory as main.h: ``` curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_tokenizer.h -curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/managed_tensor.h ``` To learn more, see the [Runtime APIs Tutorial](../extension-module.md). @@ -368,6 +362,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) # Include the executorch subdirectory. @@ -381,6 +376,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels ``` @@ -390,7 +386,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json @@ -522,6 +517,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. 
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend @@ -538,6 +534,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` @@ -552,7 +549,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json @@ -591,8 +587,8 @@ I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a ver The delegated model should be noticeably faster compared to the non-delegated model. For more information regarding backend delegateion, see the ExecuTorch guides -for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md) and [Core ML -Backend](../build-run-coreml.md). +for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md), [Core ML +Backend](../build-run-coreml.md) and [Qualcomm AI Engine Direct Backend](build-run-llama3-qualcomm-ai-engine-direct-backend.md). ## Quantization @@ -750,7 +746,7 @@ In the fragment of the output for nanoGPT below, observe that embedding and add ### Performance Analysis -Through the ExecuTorch SDK, users are able to profile model execution, giving timing information for each operator in the model. +Through the ExecuTorch Developer Tools, users are able to profile model execution, giving timing information for each operator in the model. #### Prerequisites @@ -763,7 +759,7 @@ In your export script, after calling `to_edge()` and `to_executorch()`, call `ge ``` import copy -from executorch.sdk import generate_etrecord +from executorch.devtools import generate_etrecord # Make the deep copy immediately after to to_edge() edge_manager_copy = copy.deepcopy(edge_manager) @@ -784,7 +780,7 @@ Include the ETDump header in your code. ```cpp // main.cpp -#include +#include ``` Create an Instance of the ETDumpGen class and pass it to the Module constructor. @@ -809,10 +805,10 @@ if (result.buf != nullptr && result.size > 0) { } ``` -Additionally, update CMakeLists.txt to build with SDK and enable events to be traced and logged into ETDump: +Additionally, update CMakeLists.txt to build with Developer Tools and enable events to be traced and logged into ETDump: ``` -option(EXECUTORCH_BUILD_SDK "" ON) +option(EXECUTORCH_BUILD_DEVTOOLS "" ON) # ... @@ -835,7 +831,7 @@ Run the runner, you will see “etdump.etdp” generated. Once you’ve collected debug artifacts ETDump (and optionally an ETRecord), you can use the Inspector API to view performance information. 
```python -from executorch.sdk import Inspector +from executorch.devtools import Inspector inspector = Inspector(etdump_path="etdump.etdp") # If you also generated an ETRecord, then pass that in as well: `inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")` diff --git a/docs/source/llm/llama-demo-android.md b/docs/source/llm/llama-demo-android.md index 023f82baf3..ce2d25a4a8 100644 --- a/docs/source/llm/llama-demo-android.md +++ b/docs/source/llm/llama-demo-android.md @@ -1,2 +1,141 @@ -```{include} ../../../examples/demo-apps/android/LlamaDemo/README.md +# ExecuTorch Llama Android Demo App + +We’re excited to share that the newly revamped Android demo app is live and includes many new updates to provide a more intuitive and smoother user experience with a chat use case! The primary goal of this app is to showcase how easily ExecuTorch can be integrated into an Android demo app and how to exercise the many features ExecuTorch and Llama models have to offer. + +This app serves as a valuable resource to inspire your creativity and provide foundational code that you can customize and adapt for your particular use case. + +Please dive in and start exploring our demo app today! We look forward to any feedback and are excited to see your innovative ideas. + + +## Key Concepts +From this demo app, you will learn many key concepts such as: +* How to prepare Llama models, build the ExecuTorch library, and model inferencing across delegates +* Expose the ExecuTorch library via JNI layer +* Familiarity with current ExecuTorch app-facing capabilities + +The goal is for you to see the type of support ExecuTorch provides and feel comfortable with leveraging it for your use cases. + +## Supporting Models +As a whole, the models that this app supports are (varies by delegate): +* Llama 3.1 8B +* Llama 3 8B +* Llama 2 7B +* LLaVA-1.5 vision model (only XNNPACK) + + +## Building the APK +First it’s important to note that currently ExecuTorch provides support across 3 delegates. Once you identify the delegate of your choice, select the README link to get a complete end-to-end instructions for environment set-up to exporting the models to build ExecuTorch libraries and apps to run on device: + +| Delegate | Resource | +| ------------- | ------------- | +| XNNPACK (CPU-based library) | [link](docs/delegates/xnnpack_README.md) | +| QNN (Qualcomm AI Accelerators) | [link](docs/delegates/qualcomm_README.md) | +| MediaTek (MediaTek AI Accelerators) | [link](docs/delegates/mediatek_README.md) | + +## How to Use the App + +This section will provide the main steps to use the app, along with a code snippet of the ExecuTorch API. + +For loading the app, development, and running on device we recommend Android Studio: +1. Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo. +2. Run the app (^R). This builds and launches the app on the phone. + +### Opening the App + +Below are the UI features for the app. + +Select the settings widget to get started with picking a model, its parameters and any prompts. +

+### Select Models and Parameters
+
+Once you've selected the model, tokenizer, and model type you are ready to click on "Load Model" to have the app load the model and go back to the main Chat activity.
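To make the load step concrete, here is a minimal Java sketch of what happens when "Load Model" is tapped. It condenses the same `LlamaModule` constructor and `load()` call shown in the snippet below; the package name, the treatment of a non-zero return value as a failure, and any paths are assumptions for illustration only, not the demo app's exact code.

```java
import org.pytorch.executorch.LlamaModule; // assumed package name

public class ModelLoader {
  // Builds and loads a LlamaModule from the settings chosen in the app.
  // In the demo app, modelCategory comes from ModelUtils.getModelCategory(...)
  // and the paths come from the Settings screen.
  public static LlamaModule loadFromSettings(
      int modelCategory, String modelPath, String tokenizerPath, float temperature) {
    LlamaModule module = new LlamaModule(modelCategory, modelPath, tokenizerPath, temperature);
    int loadResult = module.load();
    if (loadResult != 0) { // assumption: a non-zero status indicates a load failure
      throw new RuntimeException("Failed to load model, status: " + loadResult);
    }
    return module;
  }
}
```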

+Optional Parameters:
+* Temperature: Defaulted to 0, you can adjust the temperature for the model as well. The model will reload upon any adjustments.
+* System Prompt: Without any formatting, you can enter a system prompt. For example, "you are a travel assistant" or "give me a response in a few sentences".
+* User Prompt: More for the advanced user, if you would like to manually input a prompt then you can do so by modifying `{{user prompt}}`. You can also modify the special tokens. Once changed, go back to the main Chat activity to send.
+
+> [!TIP]
+> Helpful ExecuTorch API in app
+
+```java
+// Upon returning to the Main Chat Activity
+mModule = new LlamaModule(
+            ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()),
+            modelPath,
+            tokenizerPath,
+            temperature);
+int loadResult = mModule.load();
+```
+
+* `modelCategory`: Indicates whether it's a text-only or vision model
+* `modelPath`: path to the .pte file
+* `tokenizerPath`: path to the tokenizer .bin file
+* `temperature`: model parameter to adjust the randomness of the model's output
+
+
+### User Prompt
+Once the model is successfully loaded, enter any prompt and click the send (i.e. generate) button to send it to the model.

+You can ask it follow-up questions as well.

+> [!TIP]
+> Helpful ExecuTorch API in app
+```java
+mModule.generate(prompt, sequence_length, MainActivity.this);
+```
+* `prompt`: User formatted prompt
+* `sequence_length`: Number of tokens to generate in response to a prompt
+* `MainActivity.this`: Indicates that the callback functions (onResult(), onStats()) are present in this class.
+
+[*LLaVA-1.5: Only for XNNPACK delegate*]
+
+For the LLaVA-1.5 implementation, select the exported LLaVA .pte and tokenizer file in the Settings menu and load the model. After this you can send an image from your gallery or take a live picture along with a text prompt to the model.
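Putting the generate call and its callbacks together, here is an illustrative Java sketch of a class that collects the streamed tokens. The `LlamaCallback` interface name and the `org.pytorch.executorch` package are assumptions; `generate()`, `onResult()`, and `onStats()` follow the usage described in this guide.

```java
import org.pytorch.executorch.LlamaCallback; // assumed interface and package names
import org.pytorch.executorch.LlamaModule;

public class SimpleChat implements LlamaCallback {
  private final StringBuilder response = new StringBuilder();

  @Override
  public void onResult(String token) {
    // Invoked repeatedly with partial results until the response is complete.
    response.append(token);
  }

  @Override
  public void onStats(float tps) {
    // Tokens-per-second statistics reported by the framework.
    System.out.println("tps: " + tps);
  }

  public String ask(LlamaModule module, String formattedPrompt, int seqLen) {
    response.setLength(0);
    module.generate(formattedPrompt, seqLen, this); // streams tokens to onResult() as they are produced
    return response.toString();
  }
}
```

In the demo app, `MainActivity` itself implements these callback functions.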

+### Output Generated
+To show completion of the follow-up question, here is the complete detailed response from the model.
+ +> [!TIP] +> Helpful ExecuTorch API in app + +Ensure you have the following functions in your callback class that you provided in the `mModule.generate()`. For this example, it is `MainActivity.this`. +```java + @Override + public void onResult(String result) { + //...result contains token from response + //.. onResult will continue to be invoked until response is complete + } + + @Override + public void onStats(float tps) { + //...tps (tokens per second) stats is provided by framework + } + +``` + +## Reporting Issues +If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/native-delegates-executorch-xnnpack-delegate.md index 1d12daef9d..41de29687e 100644 --- a/docs/source/native-delegates-executorch-xnnpack-delegate.md +++ b/docs/source/native-delegates-executorch-xnnpack-delegate.md @@ -74,7 +74,7 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors. #### **Profiling** -We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's SDK integration, you can also now use the SDK tools to profile the model. You can follow the steps in [Using the ExecuTorch SDK to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use SDK's Inspector API to view XNNPACK's internal profiling information. +We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](./tutorials/devtools-integration-tutorial) on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. [comment]: <> (TODO: Refactor quantizer to a more official quantization doc) @@ -110,9 +110,9 @@ quantizer.set_global(quantization_config) ### Quantizing your model with the XNNPACKQuantizer After configuring our quantizer, we are now ready to quantize our model ```python -from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training -exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) +exported_model = export_for_training(model_to_quantize, example_inputs).module() prepared_model = prepare_pt2e(exported_model, quantizer) print(prepared_model.graph) ``` diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md index 7bc8b4dd8b..6766e678e0 100644 --- a/docs/source/runtime-overview.md +++ b/docs/source/runtime-overview.md @@ -96,7 +96,7 @@ can build it for a wide variety of target systems. #### C++ Language Considerations -* The code is C++11-compatible to work with older toolchains. +* The code is C++17-compatible to work with older toolchains. * The runtime does not use exceptions or RTTI, although it is not antagonistic to them. 
* The code is compatible with GCC and Clang, and has also been built with diff --git a/docs/source/sdk-bundled-io.md b/docs/source/sdk-bundled-io.md index 33deae3904..776c37a5da 100644 --- a/docs/source/sdk-bundled-io.md +++ b/docs/source/sdk-bundled-io.md @@ -28,7 +28,7 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest :::{dropdown} `MethodTestCase` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.config.MethodTestCase.__init__ +.. autofunction:: executorch.devtools.bundled_program.config.MethodTestCase.__init__ :noindex: ``` ::: @@ -38,7 +38,7 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest :::{dropdown} `MethodTestSuite` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.config.MethodTestSuite +.. autofunction:: executorch.devtools.bundled_program.config.MethodTestSuite :noindex: ``` ::: @@ -48,13 +48,13 @@ Since each model may have multiple inference methods, we need to generate `List[ ### Step 3: Generate `BundledProgram` -We provide `BundledProgram` class under `executorch/sdk/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including +We provide `BundledProgram` class under `executorch/devtools/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including `ExecutorchProgram`, `MultiMethodExecutorchProgram` or `ExecutorchProgramManager`, with the `List[MethodTestSuite]`: :::{dropdown} `BundledProgram` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.core.BundledProgram.__init__ +.. autofunction:: executorch.devtools.bundled_program.core.BundledProgram.__init__ :noindex: ``` ::: @@ -65,18 +65,18 @@ Construtor of `BundledProgram `will do sannity check internally to see if the gi ### Step 4: Serialize `BundledProgram` to Flatbuffer. -To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/sdk/bundled_program/serialize/__init__.py`. +To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/devtools/bundled_program/serialize/__init__.py`. :::{dropdown} Serialize and Deserialize ```{eval-rst} -.. currentmodule:: executorch.sdk.bundled_program.serialize +.. currentmodule:: executorch.devtools.bundled_program.serialize .. autofunction:: serialize_from_bundled_program_to_flatbuffer :noindex: ``` ```{eval-rst} -.. currentmodule:: executorch.sdk.bundled_program.serialize +.. currentmodule:: executorch.devtools.bundled_program.serialize .. autofunction:: deserialize_from_flatbuffer_to_bundled_program :noindex: ``` @@ -90,14 +90,13 @@ Here is a flow highlighting how to generate a `BundledProgram` given a PyTorch m import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) -from torch._export import capture_pre_autograd_graph -from torch.export import export +from torch.export import export, export_for_training # Step 1: ExecuTorch Program Export @@ -131,7 +130,7 @@ capture_input = ( # Export method's FX Graph. 
method_graph = export( - capture_pre_autograd_graph(model, capture_input), + export_for_training(model, capture_input).module(), capture_input, ) @@ -187,7 +186,7 @@ with open(save_path, "wb") as f: We can also regenerate `BundledProgram` from flatbuffer file if needed: ```python -from executorch.sdk.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program +from executorch.devtools.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program save_path = "bundled_program.bpte" with open(save_path, "rb") as f: serialized_bundled_program = f.read() @@ -211,21 +210,19 @@ We need the pointer to ExecuTorch program to do the execution. To unify the proc Here's an example of how to use the `GetProgramData` API: ```c++ -std::shared_ptr buff_ptr; -size_t buff_len; - -// FILE_PATH here can be either BundledProgram or Program flatbuffer file. -Error status = torch::executor::util::read_file_content( - FILE_PATH, &buff_ptr, &buff_len); -ET_CHECK_MSG( - status == Error::Ok, - "read_file_content() failed with status 0x%" PRIx32, - status); - +// Assume that the user has read the contents of the file into file_data using +// whatever method works best for their application. The file could contain +// either BundledProgram data or Program data. +void* file_data = ...; +size_t file_data_len = ...; + +// If file_data contains a BundledProgram, GetProgramData() will return a +// pointer to the Program data embedded inside it. Otherwise it will return +// file_data, which already pointed to Program data. const void* program_ptr; size_t program_len; status = torch::executor::bundled_program::GetProgramData( - buff_ptr.get(), buff_len, &program_ptr, &program_len); + file_data, file_data_len, &program_ptr, &program_len); ET_CHECK_MSG( status == Error::Ok, "GetProgramData() failed with status 0x%" PRIx32, @@ -255,7 +252,7 @@ We call `torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput ### Runtime Example -Here we provide an example about how to run the bundled program step by step. Most of the code is borrowed from [executor_runner](https://github.com/pytorch/executorch/blob/main/examples/sdk/sdk_example_runner/sdk_example_runner.cpp), and please review that file if you need more info and context: +Here we provide an example about how to run the bundled program step by step. Most of the code is borrowed from [executor_runner](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp), and please review that file if you need more info and context: ```c++ // method_name is the name for the method we want to test @@ -313,9 +310,9 @@ Here's the example of the dtype of test input not meet model's requirement: import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -340,7 +337,7 @@ inputs = (torch.ones(2, 2, dtype=torch.float), ) # Find each method of model needs to be traced my its name, export its FX Graph. 
method_graph = export( - capture_pre_autograd_graph(model, inputs), + export_for_training(model, inputs).module(), inputs, ) @@ -400,7 +397,7 @@ Cell In[1], line 72 68 ] 70 # Step 3: Generate BundledProgram ---> 72 bundled_program = create_bundled_program(program, method_test_suites) -File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. 265 266 Args: @@ -411,7 +408,7 @@ File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(prog --> 276 assert_valid_bundle(program, method_test_suites) 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] 280 # Emit data and metadata of bundled tensor -File /executorch/sdk/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites) 215 # type of tensor input should match execution plan 216 if type(cur_plan_test_inputs[j]) == torch.Tensor: 217 # pyre-fixme[16]: Undefined attribute [16]: Item `bool` of `typing.Union[bool, float, int, torch._tensor.Tensor]` @@ -449,9 +446,9 @@ Another common error would be the method name in any `MethodTestSuite` does not import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -476,7 +473,7 @@ inputs = (torch.ones(2, 2, dtype=torch.float),) # Find each method of model needs to be traced my its name, export its FX Graph. method_graph = export( - capture_pre_autograd_graph(model, inputs), + export_for_training(model, inputs).module(), inputs, ) @@ -532,7 +529,7 @@ Cell In[3], line 73 70 method_test_suites[0].method_name = "MISSING_METHOD_NAME" 72 # Generate BundledProgram ---> 73 bundled_program = create_bundled_program(program, method_test_suites) -File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. 
265 266 Args: @@ -543,7 +540,7 @@ File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(prog --> 276 assert_valid_bundle(program, method_test_suites) 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] 280 # Emit data and metadata of bundled tensor -File /executorch/sdk/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites) 138 method_name_of_program = {e.name for e in program.execution_plan} 139 method_name_of_test_suites = {t.method_name for t in method_test_suites} --> 141 assert method_name_of_test_suites.issubset( diff --git a/docs/source/sdk-debugging.md b/docs/source/sdk-debugging.md index 45e50b44e8..4707b4a2f9 100644 --- a/docs/source/sdk-debugging.md +++ b/docs/source/sdk-debugging.md @@ -1,6 +1,6 @@ # Debugging Models in ExecuTorch -With the ExecuTorch SDK, users can debug their models for numerical inaccurcies and extract model outputs from their device to do quality analysis (such as Signal-to-Noise, Mean square error etc.). +With the ExecuTorch Developer Tools, users can debug their models for numerical inaccurcies and extract model outputs from their device to do quality analysis (such as Signal-to-Noise, Mean square error etc.). Currently, ExecuTorch supports the following debugging flows: - Extraction of model level outputs via ETDump. @@ -11,7 +11,7 @@ Currently, ExecuTorch supports the following debugging flows: ## Steps to debug a model in ExecuTorch ### Runtime -For a real example reflecting the steps below, please refer to [sdk_example_runner.cpp](https://github.com/pytorch/executorch/blob/main/examples/sdk/sdk_example_runner/sdk_example_runner.cpp). +For a real example reflecting the steps below, please refer to [example_runner.cpp](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp). 1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while exporting your model. When provided, this enables users to link profiling information back to the eager model source code (with stack traces and module hierarchy). 2. Integrate [ETDump generation](./sdk-etdump.md) into the runtime and set the debugging level by configuring the `ETDumpGen` object. Then, provide an additional buffer to which intermediate outputs and program outputs will be written. Currently we support two levels of debugging: @@ -38,7 +38,7 @@ For a real example reflecting the steps below, please refer to [sdk_example_runn Once a model has been run, using the generated ETDump and debug buffers, users can leverage the [Inspector API's](./sdk-inspector.rst) to inspect these debug outputs. ```python -from executorch.sdk import Inspector +from executorch.devtools import Inspector # Create an Inspector instance with etdump and the debug buffer. inspector = Inspector(etdump_path=etdump_path, @@ -67,7 +67,7 @@ We've also provided a simple set of utilities that let users perform quality ana ```python -from executorch.sdk.inspector._inspector_utils import compare_results +from executorch.devtools.inspector import compare_results # Run a simple quality analysis between the model outputs sourced from the # runtime and a set of reference outputs. 
diff --git a/docs/source/sdk-delegate-integration.md b/docs/source/sdk-delegate-integration.md index 8003371155..a2f67157c8 100644 --- a/docs/source/sdk-delegate-integration.md +++ b/docs/source/sdk-delegate-integration.md @@ -1,4 +1,4 @@ -# SDK Delegate Integration +# Developer Tools Delegate Integration [Delegate backends](compiler-delegate-and-partitioner.md) are a prominent component of on-device models due to their flexibility in defining behavior. A side effect of this flexibility is that it operates as an opaque transformation. This obfuscates rich associations and mutations that are valuable in post-processing. - For example, if two different operator fusions were to occur within a delegate, post processing wouldn’t be able to separate the two transformations. diff --git a/docs/source/sdk-etdump.md b/docs/source/sdk-etdump.md index 4eacb18b14..c58efb40de 100644 --- a/docs/source/sdk-etdump.md +++ b/docs/source/sdk-etdump.md @@ -1,6 +1,6 @@ # Prerequisite | ETDump - ExecuTorch Dump -ETDump (ExecuTorch Dump) is one of the core components of the ExecuTorch SDK experience. It is the mechanism through which all forms of profiling and debugging data is extracted from the runtime. Users can't parse ETDump directly; instead, they should pass it into the Inspector API, which deserializes the data, offering interfaces for flexible analysis and debugging. +ETDump (ExecuTorch Dump) is one of the core components of the ExecuTorch Developer Tools. It is the mechanism through which all forms of profiling and debugging data is extracted from the runtime. Users can't parse ETDump directly; instead, they should pass it into the Inspector API, which deserializes the data, offering interfaces for flexible analysis and debugging. ## Generating an ETDump @@ -9,7 +9,7 @@ Generating an ETDump is a relatively straightforward process. Users can follow t 1. ***Include*** the ETDump header in your code. ```C++ -#include +#include ``` 2. ***Create*** an Instance of the ETDumpGen class and pass it into the `load_method` call that is invoked in the runtime. diff --git a/docs/source/sdk-etrecord.rst b/docs/source/sdk-etrecord.rst index 43ed5095c6..63546f43ca 100644 --- a/docs/source/sdk-etrecord.rst +++ b/docs/source/sdk-etrecord.rst @@ -9,7 +9,7 @@ users ahead of time (when they export their model to run on ExecuTorch). To draw a rough equivalent to conventional software development, ``ETRecord`` can be considered as the binary built with debug symbols that is used for debugging in GNU Debugger (gdb). It is expected that -the user will supply this to the ExecuTorch SDK tooling in order for +the user will supply this to the ExecuTorch Developer Tools in order for them to debug and visualize their model. ``ETRecord`` contains numerous components such as: @@ -31,7 +31,7 @@ they are interested in working with via our tooling. .. warning:: Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. -.. currentmodule:: executorch.sdk.etrecord._etrecord +.. currentmodule:: executorch.devtools.etrecord._etrecord .. 
autofunction:: generate_etrecord Using an ``ETRecord`` diff --git a/docs/source/sdk-inspector.rst b/docs/source/sdk-inspector.rst index e15c1f2a39..4f55271b3f 100644 --- a/docs/source/sdk-inspector.rst +++ b/docs/source/sdk-inspector.rst @@ -17,7 +17,7 @@ APIs: * By accessing the `public attributes <#inspector-attributes>`__ of the ``Inspector``, ``EventBlock``, and ``Event`` classes. * By using a `CLI <#cli>`__ tool for basic functionalities. -Please refer to the `e2e use case doc `__ get an understanding of how to use these in a real world example. +Please refer to the `e2e use case doc `__ get an understanding of how to use these in a real world example. Inspector Methods @@ -26,26 +26,26 @@ Inspector Methods Constructor ~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.__init__ +.. autofunction:: executorch.devtools.Inspector.__init__ **Example Usage:** .. code:: python - from executorch.sdk import Inspector + from executorch.devtools import Inspector inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin") to_dataframe ~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.to_dataframe +.. autofunction:: executorch.devtools.Inspector.to_dataframe print_data_tabular ~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.print_data_tabular +.. autofunction:: executorch.devtools.Inspector.print_data_tabular .. _example-usage-1: @@ -62,7 +62,7 @@ Note that the unit of delegate profiling events is "cycles". We're working on pr find_total_for_module ~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.find_total_for_module +.. autofunction:: executorch.devtools.Inspector.find_total_for_module .. _example-usage-2: @@ -80,7 +80,7 @@ find_total_for_module get_exported_program ~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.get_exported_program +.. autofunction:: executorch.devtools.Inspector.get_exported_program .. _example-usage-3: @@ -119,7 +119,7 @@ of an ``Inspector`` instance, for example: inspector.event_blocks -.. autoclass:: executorch.sdk.inspector.EventBlock +.. autoclass:: executorch.devtools.inspector.EventBlock ``Event`` Class ~~~~~~~~~~~~~~~ @@ -127,7 +127,7 @@ of an ``Inspector`` instance, for example: Access ``Event`` instances through the ``events`` attribute of an ``EventBlock`` instance. -.. autoclass:: executorch.sdk.inspector.Event +.. autoclass:: executorch.devtools.inspector.Event **Example Usage:** @@ -152,7 +152,7 @@ table. This command produces the identical table output as calling the .. code:: bash - python3 -m sdk.inspector.inspector_cli --etdump_path --etrecord_path + python3 -m devtools.inspector.inspector_cli --etdump_path --etrecord_path Note that the `etrecord_path` argument is optional. diff --git a/docs/source/sdk-overview.md b/docs/source/sdk-overview.md index 53f7d88613..1e8f1fae1b 100644 --- a/docs/source/sdk-overview.md +++ b/docs/source/sdk-overview.md @@ -1,44 +1,3 @@ -# Introduction to the ExecuTorch SDK +# Introduction to the ExecuTorch Developer Tools -ExecuTorch has been designed with [productivity](./intro-overview.md) as one of its core objectives and the ExecuTorch SDK enables this through the comprehensive suite of tools it provides users to help them profile, debug, and visualize models that they have onboarded onto ExecuTorch. - -All the components of the SDK have been designed from the ground up with deep integration in both the export process and the runtime. 
This enables us to provide unique features such as linking back operator execution in the runtime to the line of code in the original eager model that this operator originated from. - -## SDK Features - -The ExecuTorch SDK supports the following features: - -- **BundledProgram** is a utility tool for exporting the model bundled with a sample set of (representative) inputs and expected outputs, so that during runtime users can validate that the actual output is in fact the same as the expected output. -- **Profiling** models with operator level breakdown of performance stats - - Linking back operator performance stats to source code and module hierarchy - - Model loading and execution time -- **Delegate Integration** - Surfacing performance details from delegate backends - - Link back delegate operator execution to the nodes they represent in the edge dialect graph (and subsequently linking back to source code and module hierarchy) -- **Debugging** - Intermediate outputs and output quality analysis -- **Visualization** - Coming soon - -## Fundamental components of the SDK - -In order to fully understand and leverage the power of the SDK in this section, the fundamental components that power the SDK will be detailed. - -### ETRecord -ETRecord (ExecuTorch Record) is an artifact generated during the export process that stores the graphs and other metadata that is critical for the SDK tooling to be able to link back the performance/debug data sourced from the runtime to the source code of the eager model. - -To draw a rough equivalence to conventional software development ETRecord can be considered as the binary built with debug symbols that is used for debugging in GNU Project debugger (gdb). - -More details are available in the [ETRecord documentation](sdk-etrecord.rst) on how to generate and store an ETRecord. - -### ETDump -ETDump (ExecuTorch Dump) is the binary blob that is generated by the runtime after running a model. Similarly as above, to draw a rough equivalence to conventional software development, ETDump can be considered as the coredump of ExecuTorch, but in this case within ETDump we store all the performance and debug data that was generated by the runtime during model execution. - -```{note} -If you only care about looking at the raw performance data without linking back to source code and other extensive features, an ETDump alone will be enough to leverage the basic features of the SDK. For the full experience, it is recommended that the users also generate an ETRecord. -``` - -More details are available in the [ETDump documentation](sdk-etdump.md) on how to generate and store an ETDump from the runtime. - - -### Inspector APIs -The Inspector Python APIs are the main user enrty point into the SDK. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. - -More details are available in the [Inspector API documentation](sdk-inspector.rst) on how to use the Inspector APIs. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-profiling.md b/docs/source/sdk-profiling.md index 83276d8d18..e17fb1ae48 100644 --- a/docs/source/sdk-profiling.md +++ b/docs/source/sdk-profiling.md @@ -4,7 +4,7 @@ Profiling in ExecuTorch gives users access to these runtime metrics: - Model Load Time. - Operator Level Execution Time. - Delegate Execution Time. 
- - If the delegate that the user is calling into has been integrated with the [SDK](./sdk-delegate-integration.md), then users will also be able to access delegated operator execution time. + - If the delegate that the user is calling into has been integrated with the [Developer Tools](./sdk-delegate-integration.md), then users will also be able to access delegated operator execution time. - End-to-end Inference Execution Time. One unique aspect of ExecuTorch Profiling is the ability to link every runtime executed operator back to the exact line of Python code from which this operator originated. This capability enables users to easily identify hotspots in their model, source them back to the exact line of Python code, and optimize them if they choose to. @@ -20,4 +20,4 @@ We provide access to all the profiling data via the Python [Inspector API](./sdk - Through the Inspector API, users can perform a wide range of analyses, from printing out performance details to doing finer-grained calculations at the module level. -Please refer to the [SDK tutorial](./tutorials/sdk-integration-tutorial.rst) for a step-by-step walkthrough of the above process on a sample model. +Please refer to the [Developer Tools tutorial](./tutorials/devtools-integration-tutorial.rst) for a step-by-step walkthrough of the above process on a sample model. diff --git a/docs/source/sdk-tutorial.md b/docs/source/sdk-tutorial.md index 90c9ed6d34..457d3b47eb 100644 --- a/docs/source/sdk-tutorial.md +++ b/docs/source/sdk-tutorial.md @@ -1,3 +1,3 @@ -## SDK usage tutorial +## Developer Tools Usage Tutorial -Please refer to the [SDK tutorial](./tutorials/sdk-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the SDK. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 4491a6e8c8..666ee23aa3 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -74,13 +74,13 @@ After lowering to the XNNPACK Program, we can then prepare it for executorch and The XNNPACK delegate can also execute symmetrically quantized models. To understand the quantization flow and learn how to quantize models, refer to the [Custom Quantization](quantization-custom-quantization.md) note. For the sake of this tutorial, we will leverage the `quantize()` Python helper function conveniently added to the `executorch/executorch/examples` folder. ```python -from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training from executorch.exir import EdgeCompileConfig mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() sample_inputs = (torch.randn(1, 3, 224, 224), ) -mobilenet_v2 = capture_pre_autograd_graph(mobilenet_v2, sample_inputs) # 2-stage export for quantization path +mobilenet_v2 = export_for_training(mobilenet_v2, sample_inputs).module() # 2-stage export for quantization path from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.ao.quantization.quantizer.xnnpack_quantizer import ( @@ -107,7 +107,7 @@ def quantize(model, example_inputs): quantized_mobilenetv2 = quantize(mobilenet_v2, sample_inputs) ``` -Quantization requires a two stage export. First we use the `capture_pre_autograd_graph` API to capture the model before giving it to `quantize` utility function.
After performing the quantization step, we can now leverage the XNNPACK delegate to lower the quantized exported model graph. From here, the procedure is the same as for the non-quantized model lowering to XNNPACK. +Quantization requires a two stage export. First we use the `export_for_training` API to capture the model before giving it to `quantize` utility function. After performing the quantization step, we can now leverage the XNNPACK delegate to lower the quantized exported model graph. From here, the procedure is the same as for the non-quantized model lowering to XNNPACK. ```python # Continued from earlier... @@ -149,9 +149,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/docs/source/tutorials_source/devtools-integration-tutorial.py b/docs/source/tutorials_source/devtools-integration-tutorial.py new file mode 100644 index 0000000000..b5e335b43d --- /dev/null +++ b/docs/source/tutorials_source/devtools-integration-tutorial.py @@ -0,0 +1,301 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Using the ExecuTorch Developer Tools to Profile a Model +======================== + +**Author:** `Jack Khuu `__ +""" + +###################################################################### +# The `ExecuTorch Developer Tools <../devtools-overview.html>`__ is a set of tools designed to +# provide users with the ability to profile, debug, and visualize ExecuTorch +# models. +# +# This tutorial will show a full end-to-end flow of how to utilize the Developer Tools to profile a model. +# Specifically, it will: +# +# 1. Generate the artifacts consumed by the Developer Tools (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). +# 2. Create an Inspector class consuming these artifacts. +# 3. Utilize the Inspector class to analyze the model profiling result. + +###################################################################### +# Prerequisites +# ------------- +# +# To run this tutorial, you’ll first need to +# `Set up your ExecuTorch environment <../getting-started-setup.html>`__. +# + +###################################################################### +# Generate ETRecord (Optional) +# ---------------------------- +# +# The first step is to generate an ``ETRecord``. ``ETRecord`` contains model +# graphs and metadata for linking runtime results (such as profiling) to +# the eager model. This is generated via ``executorch.devtools.generate_etrecord``. +# +# ``executorch.devtools.generate_etrecord`` takes in an output file path (str), the +# edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model +# (``ExecutorchProgramManager``), and an optional dictionary of additional models. +# +# In this tutorial, an example model (shown below) is used to demonstrate. 
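In outline, the ETRecord step described above comes down to a handful of calls. The following is a minimal, self-contained sketch under the same API assumptions as the tutorial, with a hypothetical one-op `TinyModel` standing in for the convolutional network defined in the full example that follows; the deepcopy mirrors the warning about `to_executorch()` mutating the edge program in place.

```python
import copy

import torch
from executorch.devtools import generate_etrecord
from executorch.exir import to_edge
from torch.export import export


class TinyModel(torch.nn.Module):
    # Placeholder model for illustration only; the tutorial below uses a real CNN.
    def forward(self, x):
        return torch.nn.functional.relu(x)


# Export to the ATen dialect, then convert to the edge dialect.
aten_program = export(TinyModel(), (torch.randn(4),))
edge_manager = to_edge(aten_program)

# Deepcopy before to_executorch(), which mutates the edge program in place.
edge_manager_copy = copy.deepcopy(edge_manager)
et_manager = edge_manager.to_executorch()

# Write the ETRecord artifact that the Inspector consumes later.
generate_etrecord("etrecord_sketch.bin", edge_manager_copy, et_manager)
```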
+ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from executorch.devtools import generate_etrecord + +from executorch.exir import ( + EdgeCompileConfig, + EdgeProgramManager, + ExecutorchProgramManager, + to_edge, +) +from torch.export import export, ExportedProgram + + +# Generate Model +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + # 1 input image channel, 6 output channels, 5x5 square convolution + # kernel + self.conv1 = nn.Conv2d(1, 6, 5) + self.conv2 = nn.Conv2d(6, 16, 5) + # an affine operation: y = Wx + b + self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + # Max pooling over a (2, 2) window + x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) + # If the size is a square, you can specify with a single number + x = F.max_pool2d(F.relu(self.conv2(x)), 2) + x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +model = Net() + +aten_model: ExportedProgram = export( + model, + (torch.randn(1, 1, 32, 32),), +) + +edge_program_manager: EdgeProgramManager = to_edge( + aten_model, compile_config=EdgeCompileConfig(_check_ir_validity=True) +) +edge_program_manager_copy = copy.deepcopy(edge_program_manager) +et_program_manager: ExecutorchProgramManager = edge_program_manager.to_executorch() + + +# Generate ETRecord +etrecord_path = "etrecord.bin" +generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) + +# sphinx_gallery_start_ignore +from unittest.mock import patch + +# sphinx_gallery_end_ignore + +###################################################################### +# +# .. warning:: +# Users should do a deepcopy of the output of ``to_edge()`` and pass in the +# deepcopy to the ``generate_etrecord`` API. This is needed because the +# subsequent call, ``to_executorch()``, does an in-place mutation and will +# lose debug data in the process. +# + +###################################################################### +# Generate ETDump +# --------------- +# +# Next step is to generate an ``ETDump``. ``ETDump`` contains runtime results +# from executing a `Bundled Program Model <../sdk-bundled-io.html>`__. +# +# In this tutorial, a `Bundled Program` is created from the example model above. + +import torch +from executorch.devtools import BundledProgram + +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) + +from executorch.exir import to_edge +from torch.export import export + +# Step 1: ExecuTorch Program Export +m_name = "forward" +method_graphs = {m_name: export(model, (torch.randn(1, 1, 32, 32),))} + +# Step 2: Construct Method Test Suites +inputs = [[torch.randn(1, 1, 32, 32)] for _ in range(2)] + +method_test_suites = [ + MethodTestSuite( + method_name=m_name, + test_cases=[ + MethodTestCase(inputs=inp, expected_outputs=getattr(model, m_name)(*inp)) + for inp in inputs + ], + ) +] + +# Step 3: Generate BundledProgram +executorch_program = to_edge(method_graphs).to_executorch() +bundled_program = BundledProgram(executorch_program, method_test_suites) + +# Step 4: Serialize BundledProgram to flatbuffer. 
+serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( + bundled_program +) +save_path = "bundled_program.bp" +with open(save_path, "wb") as f: + f.write(serialized_bundled_program) + +###################################################################### +# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``:: +# +# cd executorch +# ./examples/devtools/build_example_runner.sh +# cmake-out/examples/devtools/example_runner --bundled_program_path="bundled_program.bp" + +###################################################################### +# Creating an Inspector +# --------------------- +# +# Final step is to create the ``Inspector`` by passing in the artifact paths. +# Inspector takes the runtime results from ``ETDump`` and correlates them to +# the operators of the Edge Dialect Graph. +# +# Recall: An ``ETRecord`` is not required. If an ``ETRecord`` is not provided, +# the Inspector will show runtime results without operator correlation. +# +# To visualize all runtime events, call Inspector's ``print_data_tabular``. + +from executorch.devtools import Inspector + +# sphinx_gallery_start_ignore +inspector_patch = patch.object(Inspector, "__init__", return_value=None) +inspector_patch_print = patch.object(Inspector, "print_data_tabular", return_value="") +inspector_patch.start() +inspector_patch_print.start() +# sphinx_gallery_end_ignore +etrecord_path = "etrecord.bin" +etdump_path = "etdump.etdp" +inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) +# sphinx_gallery_start_ignore +inspector.event_blocks = [] +# sphinx_gallery_end_ignore +inspector.print_data_tabular() + +# sphinx_gallery_start_ignore +inspector_patch.stop() +inspector_patch_print.stop() +# sphinx_gallery_end_ignore + +###################################################################### +# Analyzing with an Inspector +# --------------------------- +# +# ``Inspector`` provides 2 ways of accessing ingested information: `EventBlocks <../sdk-inspector#eventblock-class>`__ +# and ``DataFrames``. These mediums give users the ability to perform custom +# analysis about their model performance. +# +# Below are examples usages, with both ``EventBlock`` and ``DataFrame`` approaches. + +# Set Up +import pprint as pp + +import pandas as pd + +pd.set_option("display.max_colwidth", None) +pd.set_option("display.max_columns", None) + +###################################################################### +# If a user wants the raw profiling results, they would do something similar to +# finding the raw runtime data of an ``addmm.out`` event. + +for event_block in inspector.event_blocks: + # Via EventBlocks + for event in event_block.events: + if event.name == "native_call_addmm.out": + print(event.name, event.perf_data.raw) + + # Via Dataframe + df = event_block.to_dataframe() + df = df[df.event_name == "native_call_addmm.out"] + print(df[["event_name", "raw"]]) + print() + +###################################################################### +# If a user wants to trace an operator back to their model code, they would do +# something similar to finding the module hierarchy and stack trace of the +# slowest ``convolution.out`` call. 
+ +for event_block in inspector.event_blocks: + # Via EventBlocks + slowest = None + for event in event_block.events: + if event.name == "native_call_convolution.out": + if slowest is None or event.perf_data.p50 > slowest.perf_data.p50: + slowest = event + if slowest is not None: + print(slowest.name) + print() + pp.pprint(slowest.stack_traces) + print() + pp.pprint(slowest.module_hierarchy) + + # Via Dataframe + df = event_block.to_dataframe() + df = df[df.event_name == "native_call_convolution.out"] + if len(df) > 0: + slowest = df.loc[df["p50"].idxmax()] + print(slowest.event_name) + print() + pp.pprint(slowest.stack_traces) + print() + pp.pprint(slowest.module_hierarchy) + +###################################################################### +# If a user wants the total runtime of a module, they can use +# ``find_total_for_module``. + +print(inspector.find_total_for_module("L__self__")) +print(inspector.find_total_for_module("L__self___conv2")) + +###################################################################### +# Note: ``find_total_for_module`` is a special first class method of +# `Inspector <../sdk-inspector.html>`__ + +###################################################################### +# Conclusion +# ---------- +# +# In this tutorial, we learned about the steps required to consume an ExecuTorch +# model with the ExecuTorch Developer Tools. It also showed how to use the Inspector APIs +# to analyze the model run results. +# +# Links Mentioned +# ^^^^^^^^^^^^^^^ +# +# - `ExecuTorch Developer Tools Overview <../devtools-overview.html>`__ +# - `ETRecord <../sdk-etrecord.html>`__ +# - `ETDump <../sdk-etdump.html>`__ +# - `Inspector <../sdk-inspector.html>`__ diff --git a/docs/source/tutorials_source/export-to-executorch-tutorial.py b/docs/source/tutorials_source/export-to-executorch-tutorial.py index 2071567ddd..fac3eab08e 100644 --- a/docs/source/tutorials_source/export-to-executorch-tutorial.py +++ b/docs/source/tutorials_source/export-to-executorch-tutorial.py @@ -179,8 +179,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # ----------------------- # # To quantize a model, we first need to capture the graph with -# ``torch._export.capture_pre_autograd_graph``, perform quantization, and then -# call ``torch.export``. ``torch._export.capture_pre_autograd_graph`` returns a +# ``torch.export.export_for_training``, perform quantization, and then +# call ``torch.export``. ``torch.export.export_for_training`` returns a # graph which contains ATen operators which are Autograd safe, meaning they are # safe for eager-mode training, which is needed for quantization. We will call # the graph at this level, the ``Pre-Autograd ATen Dialect`` graph. @@ -193,10 +193,10 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # will annotate the nodes in the graph with information needed to quantize the # model properly for a specific backend. 
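To make that sequence concrete, here is a minimal sketch of the capture-quantize-export path. It assumes the XNNPACK quantizer stands in for the backend-specific quantizer the text leaves abstract (any `Quantizer` that annotates the graph for its backend is used the same way), and a hypothetical `SmallConv` module stands in for the tutorial's model.

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)
from torch.export import export, export_for_training


class SmallConv(torch.nn.Module):
    # Placeholder model for illustration only.
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, 3)

    def forward(self, x):
        return torch.nn.functional.relu(self.conv(x))


example_args = (torch.randn(1, 3, 32, 32),)

# 1. Capture a Pre-Autograd ATen graph that is safe to transform for quantization.
pre_autograd_module = export_for_training(SmallConv(), example_args).module()

# 2. Annotate with a backend quantizer, insert observers, calibrate, then convert.
quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
prepared = prepare_pt2e(pre_autograd_module, quantizer)
prepared(*example_args)  # calibration pass with representative inputs
quantized = convert_pt2e(prepared)

# 3. Export the quantized module to the ATen dialect for the rest of the flow.
aten_dialect = export(quantized, example_args)
```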
-from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training example_args = (torch.randn(1, 3, 256, 256),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(SimpleConv(), example_args) +pre_autograd_aten_dialect = export_for_training(SimpleConv(), example_args).module() print("Pre-Autograd ATen Dialect Graph") print(pre_autograd_aten_dialect) @@ -523,9 +523,7 @@ def forward(self, a, x, b): executorch_program: ExecutorchProgramManager = edge_program.to_executorch( ExecutorchBackendConfig( passes=[], # User-defined passes - memory_planning_pass=MemoryPlanningPass( - "greedy" - ), # Default memory planning pass + memory_planning_pass=MemoryPlanningPass(), # Default memory planning pass ) ) @@ -562,8 +560,7 @@ def forward(self, a, x, b): # Here is an example for an entire end-to-end workflow: import torch -from torch._export import capture_pre_autograd_graph -from torch.export import export, ExportedProgram +from torch.export import export, export_for_training, ExportedProgram class M(torch.nn.Module): @@ -577,7 +574,7 @@ def forward(self, x): example_args = (torch.randn(3, 4),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(M(), example_args) +pre_autograd_aten_dialect = export_for_training(M(), example_args).module() # Optionally do quantization: # pre_autograd_aten_dialect = convert_pt2e(prepare_pt2e(pre_autograd_aten_dialect, CustomBackendQuantizer)) aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) diff --git a/docs/source/tutorials_source/sdk-integration-tutorial.py b/docs/source/tutorials_source/sdk-integration-tutorial.py index ccc2e480ad..b9a8009c64 100644 --- a/docs/source/tutorials_source/sdk-integration-tutorial.py +++ b/docs/source/tutorials_source/sdk-integration-tutorial.py @@ -6,295 +6,8 @@ # LICENSE file in the root directory of this source tree. """ -Using the ExecuTorch SDK to Profile a Model +Using the ExecuTorch Developer Tools to Profile a Model ======================== -**Author:** `Jack Khuu `__ +Please update your link to . This URL will be deleted after v0.4.0. """ - -###################################################################### -# The `ExecuTorch SDK <../sdk-overview.html>`__ is a set of tools designed to -# provide users with the ability to profile, debug, and visualize ExecuTorch -# models. -# -# This tutorial will show a full end-to-end flow of how to utilize the SDK. -# Specifically, it will: -# -# 1. Generate the artifacts consumed by the SDK (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). -# 2. Create an Inspector class consuming these artifacts. -# 3. Utilize the Inspector class to analyze the model. - -###################################################################### -# Prerequisites -# ------------- -# -# To run this tutorial, you’ll first need to -# `Set up your ExecuTorch environment <../getting-started-setup.html>`__. -# - -###################################################################### -# Generate ETRecord (Optional) -# ---------------------------- -# -# The first step is to generate an ``ETRecord``. ``ETRecord`` contains model -# graphs and metadata for linking runtime results (such as profiling) to -# the eager model. This is generated via ``executorch.sdk.generate_etrecord``. -# -# ``executorch.sdk.generate_etrecord`` takes in an output file path (str), the -# edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model -# (``ExecutorchProgramManager``), and an optional dictionary of additional models. 
-# -# In this tutorial, an example model (shown below) is used to demonstrate. - -import copy - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from executorch.exir import ( - EdgeCompileConfig, - EdgeProgramManager, - ExecutorchProgramManager, - to_edge, -) -from executorch.sdk import generate_etrecord -from torch.export import export, ExportedProgram - - -# Generate Model -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - # 1 input image channel, 6 output channels, 5x5 square convolution - # kernel - self.conv1 = nn.Conv2d(1, 6, 5) - self.conv2 = nn.Conv2d(6, 16, 5) - # an affine operation: y = Wx + b - self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - # Max pooling over a (2, 2) window - x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) - # If the size is a square, you can specify with a single number - x = F.max_pool2d(F.relu(self.conv2(x)), 2) - x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - -model = Net() - -aten_model: ExportedProgram = export( - model, - (torch.randn(1, 1, 32, 32),), -) - -edge_program_manager: EdgeProgramManager = to_edge( - aten_model, compile_config=EdgeCompileConfig(_check_ir_validity=True) -) -edge_program_manager_copy = copy.deepcopy(edge_program_manager) -et_program_manager: ExecutorchProgramManager = edge_program_manager.to_executorch() - - -# Generate ETRecord -etrecord_path = "etrecord.bin" -generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) - -# sphinx_gallery_start_ignore -from unittest.mock import patch - -# sphinx_gallery_end_ignore - -###################################################################### -# -# .. warning:: -# Users should do a deepcopy of the output of ``to_edge()`` and pass in the -# deepcopy to the ``generate_etrecord`` API. This is needed because the -# subsequent call, ``to_executorch()``, does an in-place mutation and will -# lose debug data in the process. -# - -###################################################################### -# Generate ETDump -# --------------- -# -# Next step is to generate an ``ETDump``. ``ETDump`` contains runtime results -# from executing a `Bundled Program Model <../sdk-bundled-io.html>`__. -# -# In this tutorial, a `Bundled Program` is created from the example model above. - -import torch - -from executorch.exir import to_edge -from executorch.sdk import BundledProgram - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) -from torch.export import export - -# Step 1: ExecuTorch Program Export -m_name = "forward" -method_graphs = {m_name: export(model, (torch.randn(1, 1, 32, 32),))} - -# Step 2: Construct Method Test Suites -inputs = [[torch.randn(1, 1, 32, 32)] for _ in range(2)] - -method_test_suites = [ - MethodTestSuite( - method_name=m_name, - test_cases=[ - MethodTestCase(inputs=inp, expected_outputs=getattr(model, m_name)(*inp)) - for inp in inputs - ], - ) -] - -# Step 3: Generate BundledProgram -executorch_program = to_edge(method_graphs).to_executorch() -bundled_program = BundledProgram(executorch_program, method_test_suites) - -# Step 4: Serialize BundledProgram to flatbuffer. 
-serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( - bundled_program -) -save_path = "bundled_program.bp" -with open(save_path, "wb") as f: - f.write(serialized_bundled_program) - -###################################################################### -# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``:: -# -# cd executorch -# ./examples/sdk/build_sdk_example_runner.sh -# cmake-out/examples/sdk/sdk_example_runner --bundled_program_path="bundled_program.bp" - -###################################################################### -# Creating an Inspector -# --------------------- -# -# Final step is to create the ``Inspector`` by passing in the artifact paths. -# Inspector takes the runtime results from ``ETDump`` and correlates them to -# the operators of the Edge Dialect Graph. -# -# Recall: An ``ETRecord`` is not required. If an ``ETRecord`` is not provided, -# the Inspector will show runtime results without operator correlation. -# -# To visualize all runtime events, call Inspector's ``print_data_tabular``. - -from executorch.sdk import Inspector - -# sphinx_gallery_start_ignore -inspector_patch = patch.object(Inspector, "__init__", return_value=None) -inspector_patch_print = patch.object(Inspector, "print_data_tabular", return_value="") -inspector_patch.start() -inspector_patch_print.start() -# sphinx_gallery_end_ignore -etdump_path = "etdump.etdp" -inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) -# sphinx_gallery_start_ignore -inspector.event_blocks = [] -# sphinx_gallery_end_ignore -inspector.print_data_tabular() - -# sphinx_gallery_start_ignore -inspector_patch.stop() -inspector_patch_print.stop() -# sphinx_gallery_end_ignore - -###################################################################### -# Analyzing with an Inspector -# --------------------------- -# -# ``Inspector`` provides 2 ways of accessing ingested information: `EventBlocks <../sdk-inspector#eventblock-class>`__ -# and ``DataFrames``. These mediums give users the ability to perform custom -# analysis about their model performance. -# -# Below are examples usages, with both ``EventBlock`` and ``DataFrame`` approaches. - -# Set Up -import pprint as pp - -import pandas as pd - -pd.set_option("display.max_colwidth", None) -pd.set_option("display.max_columns", None) - -###################################################################### -# If a user wants the raw profiling results, they would do something similar to -# finding the raw runtime data of an ``addmm.out`` event. - -for event_block in inspector.event_blocks: - # Via EventBlocks - for event in event_block.events: - if event.name == "native_call_addmm.out": - print(event.name, event.perf_data.raw) - - # Via Dataframe - df = event_block.to_dataframe() - df = df[df.event_name == "native_call_addmm.out"] - print(df[["event_name", "raw"]]) - print() - -###################################################################### -# If a user wants to trace an operator back to their model code, they would do -# something similar to finding the module hierarchy and stack trace of the -# slowest ``convolution.out`` call. 
- -for event_block in inspector.event_blocks: - # Via EventBlocks - slowest = None - for event in event_block.events: - if event.name == "native_call_convolution.out": - if slowest is None or event.perf_data.p50 > slowest.perf_data.p50: - slowest = event - if slowest is not None: - print(slowest.name) - print() - pp.pprint(slowest.stack_traces) - print() - pp.pprint(slowest.module_hierarchy) - - # Via Dataframe - df = event_block.to_dataframe() - df = df[df.event_name == "native_call_convolution.out"] - if len(df) > 0: - slowest = df.loc[df["p50"].idxmax()] - print(slowest.event_name) - print() - pp.pprint(slowest.stack_traces) - print() - pp.pprint(slowest.module_hierarchy) - -###################################################################### -# If a user wants the total runtime of a module, they can use -# ``find_total_for_module``. - -print(inspector.find_total_for_module("L__self__")) -print(inspector.find_total_for_module("L__self___conv2")) - -###################################################################### -# Note: ``find_total_for_module`` is a special first class method of -# `Inspector <../sdk-inspector.html>`__ - -###################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we learned about the steps required to consume an ExecuTorch -# model with the ExecuTorch SDK. It also showed how to use the Inspector APIs -# to analyze the model run results. -# -# Links Mentioned -# ^^^^^^^^^^^^^^^ -# -# - `ExecuTorch SDK <../sdk-overview.html>`__ -# - `ETRecord <../sdk-etrecord.html>`__ -# - `ETDump <../sdk-etdump.html>`__ -# - `Inspector <../sdk-inspector.html>`__ diff --git a/docs/website/docs/tutorials/bundled_program.md b/docs/website/docs/tutorials/bundled_program.md index 9dd2a8a95a..e477d8e6a6 100644 --- a/docs/website/docs/tutorials/bundled_program.md +++ b/docs/website/docs/tutorials/bundled_program.md @@ -49,19 +49,15 @@ Error GetProgramData( Here's an example of how to use the GetProgramData API: ```c++ - std::shared_ptr buff_ptr; - size_t buff_len; - -// FILE_PATH here can be either BundledProgram or Program flatbuffer file. - Error status = torch::executor::util::read_file_content( - FILE_PATH, &buff_ptr, &buff_len); - ET_CHECK_MSG( - status == Error::Ok, - "read_file_content() failed with status 0x%" PRIx32, - status); - - uint32_t prof_tok = EXECUTORCH_BEGIN_PROF("de-serialize model"); - + // Assume that the user has read the contents of the file into file_data using + // whatever method works best for their application. The file could contain + // either BundledProgram data or Program data. + void* file_data = ...; + size_t file_data_len = ...; + + // If file_data contains a BundledProgram, GetProgramData() will return a + // pointer to the Program data embedded inside it. Otherwise it will return + // file_data, which already pointed to Program data. const void* program_ptr; size_t program_len; status = torch::executor::bundled_program::GetProgramData( @@ -88,7 +84,7 @@ To execute the program on the bundled input, we need to load the bundled input i * @returns Return Error::Ok if load successfully, or the error happens during * execution. */ -__ET_NODISCARD Error LoadBundledInput( +ET_NODISCARD Error LoadBundledInput( Method& method, serialized_bundled_program* bundled_program_ptr, size_t testset_idx); @@ -111,7 +107,7 @@ We call `torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput * @returns Return Error::Ok if two outputs match, or the error happens during * execution. 
*/ -__ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( +ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( Method& method, serialized_bundled_program* bundled_program_ptr, size_t testset_idx, @@ -122,14 +118,13 @@ __ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( ### Example -Here we provide an example about how to run the bundled program step by step. Most of the code are borrowed from "fbcode/executorch/sdk/fb/runners/executor_runner.cpp" and please review that file if you need more info and context: +Here we provide an example of how to run the bundled program step by step. ```c++ // method_name is the name for the method we want to test // memory_manager is the executor::MemoryManager variable for executor memory allocation. // program is the executorch program. Result method = program->load_method(method_name, &memory_manager); - EXECUTORCH_END_PROF(prof_tok); ET_CHECK_MSG( method.ok(), "load_method() failed with status 0x%" PRIx32, diff --git a/examples/README.md b/examples/README.md index 0b0ff0daf3..e3a18cf5a0 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,7 +13,7 @@ examples ├── models # Contains a set of popular and representative PyTorch models ├── portable # Contains end-to-end demos for ExecuTorch in portable mode ├── selective_build # Contains demos of selective build for optimizing the binary size of the ExecuTorch runtime -├── sdk # Contains demos of BundledProgram and ETDump +├── devtools # Contains demos of BundledProgram and ETDump ├── demo-apps # Contains demo apps for Android and iOS ├── xnnpack # Contains end-to-end ExecuTorch demos with first-party optimization using XNNPACK ├── apple @@ -35,13 +35,17 @@ A user's journey may commence by exploring the demos located in the [`portable/` [This page](./models/llama2/README.md) demonstrates how to run Llama 2 7B and Llama 3 8B models on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. +## Demo of Llava1.5 7B + +[This page](./models/llava/README.md) demonstrates how to run the [Llava 1.5 7B](https://github.com/haotian-liu/LLaVA) model on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. + ## Demo of Selective Build To understand how to deploy the ExecuTorch runtime with optimization for binary size, explore the demos available in the [`selective_build/`](./selective_build) directory. These demos are specifically designed to illustrate the [Selective Build](../docs/source/kernel-library-selective_build.md), offering insights into reducing the binary size while maintaining efficiency. -## Demo of ExecuTorch SDK +## Demo of ExecuTorch Developer Tools -You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. +You will find demos of [ExecuTorch Developer Tools](./devtools/) in the [`devtools/`](./devtools/) directory. The examples focus on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. ## Demo Apps @@ -63,9 +67,9 @@ The [`arm/`](./arm) directory contains scripts to help you run a PyTorch model o You will find demos of [ExecuTorch QNN Backend](./qualcomm) in the [`qualcomm/`](./qualcomm) directory.
-## Demo of ExecuTorch on Xtensa HiFi4 DSP +## Demo of ExecuTorch on Cadence HiFi4 DSP -The [`xtensa/`](./xtensa) directory hosts a demo that showcases the process of exporting and executing a model on Xtensa Hifi4 DSP. You can utilize [this tutorial](../docs/source/build-run-xtensa.md) to guide you in configuring the demo and running it. +The [`Cadence/`](./cadence) directory hosts a demo that showcases the process of exporting and executing a model on Xtensa Hifi4 DSP. You can utilize [this tutorial](../docs/source/build-run-xtensa.md) to guide you in configuring the demo and running it. ## Dependencies diff --git a/examples/apple/coreml/executor_runner/main.mm b/examples/apple/coreml/executor_runner/main.mm index 346c59637a..405bfb9c6c 100644 --- a/examples/apple/coreml/executor_runner/main.mm +++ b/examples/apple/coreml/executor_runner/main.mm @@ -13,8 +13,7 @@ #import #import #import -#import -#import +#import #import #import #import @@ -25,8 +24,25 @@ static inline id check_class(id obj, Class cls) { #define SAFE_CAST(Object, Type) ((Type *)check_class(Object, [Type class])) -using namespace torch::executor; -using torch::executor::util::FileDataLoader; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::extension::FileDataLoader; +using executorch::runtime::DataLoader; +using executorch::runtime::EValue; +using executorch::runtime::Error; +using executorch::runtime::EventTracer; +using executorch::runtime::EventTracerDebugLogLevel; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::TensorInfo; +using torch::executor::CoreMLBackendDelegate; static constexpr size_t kRuntimeMemorySize = 16 * 1024U * 1024U; // 16 MB @@ -154,13 +170,13 @@ Args parse_command_line_args(NSArray *args) { return data; } -class DataLoaderImpl: public DataLoader { +class DataLoaderImpl final : public DataLoader { public: DataLoaderImpl(const std::string& filePath) :data_(read_data(filePath)) {} - Result load(size_t offset, size_t size, __ET_UNUSED const DataLoader::SegmentInfo& segment_info) override { + Result load(size_t offset, size_t size, ET_UNUSED const DataLoader::SegmentInfo& segment_info) const override { NSData *subdata = [data_ subdataWithRange:NSMakeRange(offset, size)]; return FreeableBuffer(subdata.bytes, size, nullptr); } @@ -170,7 +186,7 @@ Args parse_command_line_args(NSArray *args) { } private: - NSData *data_; + NSData * const data_; }; using Buffer = std::vector; @@ -295,7 +311,7 @@ bool is_model_analysis_enabled(const Args& args) { } void dump_etdump_gen(ETDumpGen *etdump_gen, const Buffer& debug_buffer, const Args& args) { - etdump_result result = (etdump_gen != nullptr) ? etdump_gen->get_etdump_data() : etdump_result{.buf = nullptr, .size = 0}; + ETDumpResult result = (etdump_gen != nullptr) ? 
etdump_gen->get_etdump_data() : ETDumpResult{.buf = nullptr, .size = 0}; if (result.size == 0) { return; } @@ -317,7 +333,7 @@ void dump_etdump_gen(ETDumpGen *etdump_gen, const Buffer& debug_buffer, const Ar int main(int argc, char * argv[]) { @autoreleasepool { - runtime_init(); + executorch::runtime::runtime_init(); auto args = parse_command_line_args([[NSProcessInfo processInfo] arguments]); if (args.purge_models_cache) { diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh index 86ff5f6edb..89cd45ea6b 100755 --- a/examples/apple/coreml/scripts/build_executor_runner.sh +++ b/examples/apple/coreml/scripts/build_executor_runner.sh @@ -29,17 +29,18 @@ rm -rf "$CMAKE_BUILD_DIR_PATH" # Build executorch echo "ExecuTorch: Building executorch" cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_BUILD_DIR_PATH" \ +-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_TOOLCHAIN_FILE="$IOS_TOOLCHAIN_PATH" \ -DPLATFORM=MAC_UNIVERSAL \ -DDEPLOYMENT_TARGET=13.0 \ -DFLATC_EXECUTABLE="$(which flatc)" \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ --DEXECUTORCH_BUILD_SDK=ON \ +-DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_COREML=ON \ --DCOREML_BUILD_EXECUTOR_RUNNER=ON \ -Dprotobuf_BUILD_TESTS=OFF \ -Dprotobuf_BUILD_EXAMPLES=OFF \ +-DCOREML_BUILD_EXECUTOR_RUNNER=ON \ -DCMAKE_MACOSX_BUNDLE=OFF \ cmake --build "$CMAKE_BUILD_DIR_PATH" -j9 -t coremldelegate @@ -55,7 +56,7 @@ mkdir -p "$EXECUTORCH_INCLUDE_DIR_PATH" find extension \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; find runtime \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; find util \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; -find sdk \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; +find devtools \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; cp -rf "$COREML_DIR_PATH/runtime/include/" "$INCLUDE_DIR_PATH" # Copy required libraries diff --git a/examples/apple/coreml/scripts/debugger_cli.py b/examples/apple/coreml/scripts/debugger_cli.py new file mode 100644 index 0000000000..88390f8d8c --- /dev/null +++ b/examples/apple/coreml/scripts/debugger_cli.py @@ -0,0 +1,181 @@ +# Copyright © 2024 Apple Inc. All rights reserved. +# +# Please refer to the license found in the LICENSE file in the root directory of the source tree. + +import argparse +import sys +import tempfile +from pathlib import Path +from typing import List, Tuple + +import coremltools as ct +from executorch.backends.apple.coreml.compiler import CoreMLBackend +from executorch.exir import EdgeProgramManager + +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.tracer import Value +from tabulate import tabulate + + +def get_root_dir_path() -> Path: + return Path(__file__).resolve().parent.parent.parent.parent.parent + + +sys.path.append(str((get_root_dir_path() / "examples").resolve())) + +from inspector_utils import ( + build_devtools_runner_including_coreml, + ComparisonResult, + create_inspector_coreml, + create_inspector_reference, + get_comparison_result, + module_to_edge, +) + +from models import MODEL_NAME_TO_MODEL +from models.model_factory import EagerModelFactory + + +def args_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + parser.add_argument( + "-m", + "--model_name", + required=True, + help=f"Provide model name. 
Valid ones: {list(MODEL_NAME_TO_MODEL.keys())}", + ) + + parser.add_argument( + "-c", + "--compute_unit", + required=False, + default=ct.ComputeUnit.ALL.name.lower(), + help=f"Provide compute unit for the model. Valid ones: {[[compute_unit.name.lower() for compute_unit in ct.ComputeUnit]]}", + ) + + parser.add_argument( + "-precision", + "--compute_precision", + required=False, + default=ct.precision.FLOAT16.value, + help=f"Provide compute precision for the model. Valid ones: {[[precision.value for precision in ct.precision]]}", + ) + + parser.add_argument( + "--compile", + action=argparse.BooleanOptionalAction, + required=False, + default=False, + ) + + parser.add_argument( + "-env", + "--conda_environment_name", + required=False, + default="executorch", + help="Provide conda environment name.", + ) + + return parser + + +def get_compile_specs_from_args(args): + model_type = CoreMLBackend.MODEL_TYPE.MODEL + if args.compile: + model_type = CoreMLBackend.MODEL_TYPE.COMPILED_MODEL + + compute_precision = ct.precision(args.compute_precision) + compute_unit = ct.ComputeUnit[args.compute_unit.upper()] + + return CoreMLBackend.generate_compile_specs( + compute_precision=compute_precision, + compute_unit=compute_unit, + model_type=model_type, + minimum_deployment_target=ct.target.iOS17, + ) + + +def compare_intermediate_tensors( + edge_program: EdgeProgramManager, + example_inputs: Tuple[Value, ...], + coreml_compile_specs: List[CompileSpec], + model_name: str, + working_dir_path: Path, +) -> ComparisonResult: + inspector_coreml = create_inspector_coreml( + edge_program=edge_program, + compile_specs=coreml_compile_specs, + example_inputs=example_inputs, + model_name=model_name, + working_dir_path=working_dir_path, + root_dir_path=get_root_dir_path(), + ) + + inspector_reference = create_inspector_reference( + edge_program=edge_program, + example_inputs=example_inputs, + model_name=model_name, + working_dir_path=working_dir_path, + root_dir_path=get_root_dir_path(), + ) + + return get_comparison_result( + inspector1=inspector_reference, + tag1="reference", + inspector2=inspector_coreml, + tag2="coreml", + ) + + +def main() -> None: + parser = args_parser() + args = parser.parse_args() + + if args.model_name not in MODEL_NAME_TO_MODEL: + raise RuntimeError( + f"Model {args.model_name} is not a valid name. " + f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}." + ) + + valid_compute_units = [compute_unit.name.lower() for compute_unit in ct.ComputeUnit] + if args.compute_unit not in valid_compute_units: + raise RuntimeError( + f"{args.compute_unit} is invalid. " + f"Valid compute units are {valid_compute_units}." 
+ ) + + build_devtools_runner_including_coreml( + root_dir_path=get_root_dir_path(), conda_env_name=args.conda_environment_name + ) + + model, example_inputs, _ = EagerModelFactory.create_model( + *MODEL_NAME_TO_MODEL[args.model_name] + ) + + model.eval() + edge_program = module_to_edge( + module=model, + example_inputs=example_inputs, + ) + + coreml_compile_specs = get_compile_specs_from_args(args) + + with tempfile.TemporaryDirectory() as temp_dir_name: + working_dir_path = Path(temp_dir_name) / "debugger" + working_dir_path.mkdir(parents=True, exist_ok=True) + comparison_result = compare_intermediate_tensors( + edge_program=edge_program, + example_inputs=example_inputs, + coreml_compile_specs=coreml_compile_specs, + model_name=args.model_name, + working_dir_path=working_dir_path, + ) + + print( + tabulate(comparison_result.to_dataframe(), headers="keys", tablefmt="grid") + ) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py index 4bf26a7f3e..e906c0704c 100644 --- a/examples/apple/coreml/scripts/export.py +++ b/examples/apple/coreml/scripts/export.py @@ -17,10 +17,10 @@ from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from executorch.devtools.etrecord import generate_etrecord from executorch.exir import to_edge from executorch.exir.backend.backend_api import to_backend -from executorch.sdk.etrecord import generate_etrecord from torch.export import export REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent.parent @@ -104,11 +104,7 @@ def export_lowered_module_to_executorch_program(lowered_module, example_inputs): lowered_module(*example_inputs) exec_prog = to_edge( export(lowered_module, example_inputs), compile_config=_EDGE_COMPILE_CONFIG - ).to_executorch( - config=exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True - ) - ) + ).to_executorch(config=exir.ExecutorchBackendConfig(extract_delegate_segments=True)) return exec_prog @@ -178,9 +174,7 @@ def generate_compile_specs_from_args(args): ) delegated_program_manager = edge_program_manager.to_backend(partitioner) exec_program = delegated_program_manager.to_executorch( - config=exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True - ) + config=exir.ExecutorchBackendConfig(extract_delegate_segments=True) ) else: lowered_module, edge_copy = lower_module_to_coreml( diff --git a/examples/apple/coreml/scripts/extract_coreml_models.py b/examples/apple/coreml/scripts/extract_coreml_models.py index 6317b0f3d3..d2812d37ab 100644 --- a/examples/apple/coreml/scripts/extract_coreml_models.py +++ b/examples/apple/coreml/scripts/extract_coreml_models.py @@ -57,7 +57,7 @@ def extract_coreml_models(pte_data: bytes): model_index += 1 if len(coreml_delegates) == 0: - print("The model isn't delegated to CoreML.") + print("The model isn't delegated to Core ML.") if __name__ == "__main__": diff --git a/examples/apple/coreml/scripts/inspector_cli.py b/examples/apple/coreml/scripts/inspector_cli.py index 077c8c26ef..c63d4791fc 100644 --- a/examples/apple/coreml/scripts/inspector_cli.py +++ b/examples/apple/coreml/scripts/inspector_cli.py @@ -1,43 +1,24 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. +# Copyright © 2024 Apple Inc. All rights reserved. 
# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. +# Please refer to the license found in the LICENSE file in the root directory of the source tree. import argparse -import json -from typing import Any, Dict, Final, List, Tuple, Union +import sys -from executorch.sdk import Inspector -from executorch.sdk.inspector._inspector_utils import compare_results +from pathlib import Path -COREML_METADATA_KEYS: Final[List[Tuple[str, str]]] = [ - ("operatorName", "coreml_operator"), - ("estimatedCost", "coreml_estimated_cost"), - ("preferredComputeUnit", "coreml_preferred_device"), - ("supportedComputeUnits", "coreml_supported_devices"), -] +from executorch.devtools import Inspector +from executorch.devtools.inspector import compare_results -def parse_coreml_delegate_metadata(delegate_metadatas: List[str]) -> Dict[str, Any]: - try: - coreml_metadata: Dict[str, Any] = json.loads(delegate_metadatas[0]) - result: Dict[str, str] = {} - for col_key, col_name in COREML_METADATA_KEYS: - value = coreml_metadata.get(col_key, None) - if value is not None: - result[col_name] = value - return result +def get_root_dir_path() -> Path: + return Path().resolve().parent.parent.parent.parent - except ValueError: - return {} +sys.path.append(str((get_root_dir_path() / "examples").resolve())) -def convert_coreml_delegate_time( - event_name: Union[str, int], input_time: Union[int, float] -) -> Union[int, float]: - return input_time / (1000 * 1000) +from inspector_utils import convert_coreml_delegate_time, parse_coreml_delegate_metadata def main() -> None: @@ -55,7 +36,7 @@ def main() -> None: parser.add_argument( "--debug_buffer_path", required=False, - help="Provide an optional buffer file path.", + help="Provide an optional debug buffer file path.", ) parser.add_argument("--compare_results", action="store_true") diff --git a/examples/apple/coreml/scripts/inspector_utils.py b/examples/apple/coreml/scripts/inspector_utils.py new file mode 100644 index 0000000000..08af6fb348 --- /dev/null +++ b/examples/apple/coreml/scripts/inspector_utils.py @@ -0,0 +1,428 @@ +# Copyright © 2024 Apple Inc. All rights reserved. +# +# Please refer to the license found in the LICENSE file in the root directory of the source tree. 
+ +import copy +import errno +import json +import os + +import subprocess +from dataclasses import dataclass +from pathlib import Path + +from typing import Any, Dict, Final, List, Optional, Tuple, Union + +import executorch.exir as exir + +import pandas as pd +import torch +from executorch.backends.apple.coreml.compiler import CoreMLBackend +from executorch.backends.apple.coreml.partition import CoreMLPartitioner + +from executorch.devtools import BundledProgram, generate_etrecord, Inspector +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) +from executorch.devtools.inspector import Event + +from executorch.exir import ( + EdgeProgramManager, + ExecutorchBackendConfig, + ExecutorchProgramManager, + ExirExportedProgram, + to_edge, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.tracer import Value + +from torch.export import export, ExportedProgram + +COREML_METADATA_KEYS: Final[List[Tuple[str, str]]] = [ + ("operatorName", "coreml_operator"), + ("estimatedCost", "coreml_estimated_cost"), + ("preferredComputeUnit", "coreml_preferred_device"), + ("supportedComputeUnits", "coreml_supported_devices"), +] + + +def build_devtools_runner_including_coreml( + root_dir_path: Path, + conda_env_name: str, + force: bool = False, +): + if not force: + devtools_executable_path = ( + root_dir_path / "cmake-out" / "examples" / "devtools" / "example_runner" + ) + print(devtools_executable_path) + if devtools_executable_path.is_file(): + return + + cd_root_command: str = f"cd {root_dir_path.resolve()}" + conda_activate_env_command: str = f"source conda activate {conda_env_name}" + build_devtools_runner_command: str = ( + "./examples/devtools/build_example_runner.sh --coreml" + ) + build_command: str = ( + f"{cd_root_command} && {conda_activate_env_command} && {build_devtools_runner_command}" + ) + subprocess.run( + f'bash -c "{build_command}"', shell=True, check=True + ).check_returncode() + + +_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=False, + _skip_dim_order=True, +) + +_EDGE_BACKEND_CONFIG = exir.ExecutorchBackendConfig( + extract_delegate_segments=True, +) + + +def to_core_aten( + module: torch.nn.Module, + example_inputs: Tuple[Value, ...], +) -> ExportedProgram: + core_aten_program = export( + mod=module, + args=example_inputs, + ) + return core_aten_program + + +def core_aten_to_edge( + core_aten_program: ExportedProgram, + edge_compile_config: exir.EdgeCompileConfig, +) -> EdgeProgramManager: + edge_manager = to_edge( + programs=core_aten_program, + compile_config=edge_compile_config, + ) + return edge_manager + + +def module_to_edge( + module: torch.nn.Module, + example_inputs: Tuple[Value, ...], + edge_compile_config: exir.EdgeCompileConfig = _EDGE_COMPILE_CONFIG, +) -> EdgeProgramManager: + module.eval() + core_aten_program = to_core_aten( + module=module, + example_inputs=example_inputs, + ) + return core_aten_to_edge( + core_aten_program=core_aten_program, + edge_compile_config=edge_compile_config, + ) + + +def lower_and_export_edge_to_coreml( + edge_program: EdgeProgramManager, + compile_specs: List[CompileSpec], + config: ExecutorchBackendConfig, + skip_ops_for_coreml_delegation: Optional[List[str]] = None, +) -> ExirExportedProgram: + partitioner = CoreMLPartitioner( + skip_ops_for_coreml_delegation=skip_ops_for_coreml_delegation, + compile_specs=compile_specs, + ) + 
delegated_program_manager = edge_program.to_backend( + partitioner, + ) + executorch_program = delegated_program_manager.to_executorch( + config=config, + ) + return executorch_program + + +def write_to_file(buffer: bytes, file_path: Path): + with open(file_path.resolve(), "wb") as file: + file.write(buffer) + + +def generate_bundled_program( + executorch_program: ExecutorchProgramManager, + example_inputs: Tuple[Value, ...], + method_name: str, + bundled_program_path: Path, +): + method_test_suites = [ + MethodTestSuite( + method_name=method_name, + test_cases=[MethodTestCase(inputs=example_inputs)], + ) + ] + + bundled_program = BundledProgram(executorch_program, method_test_suites) + bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer( + bundled_program + ) + + write_to_file(buffer=bundled_program_buffer, file_path=bundled_program_path) + + +def generate_etdump_with_intermediate_values( + root_dir_path: Path, + bundled_program_path: Path, + et_dump_path: Path, + debug_buffer_path: Path, + debug_buffer_size: int, +): + devtools_executable_path = ( + root_dir_path / "cmake-out" / "examples" / "devtools" / "example_runner" + ) + if not devtools_executable_path.is_file(): + raise FileNotFoundError( + errno.ENOENT, + os.strerror(errno.ENOENT), + str(devtools_executable_path.resolve()), + ) + + devtools_runner_command: str = f""" + {devtools_executable_path.resolve()} -dump_intermediate_outputs\ + -bundled_program_path {bundled_program_path.resolve()}\ + -etdump_path {et_dump_path.resolve()}\ + -debug_output_path {debug_buffer_path.resolve()}\ + -debug_buffer_size {debug_buffer_size}""" + subprocess.run( + f'bash -c "{devtools_runner_command}"', shell=True, check=True + ).check_returncode() + + +def create_inspector( + edge_program: EdgeProgramManager, + executorch_program: ExecutorchProgramManager, + example_inputs: Tuple[Value, ...], + model_name: str, + root_dir_path: Path, + working_dir_path: Path, + method_name: str = "forward", + debug_buffer_size: int = 1 * 1024 * 1024 * 1024, + delegate_metadata_parser=None, + delegate_time_scale_converter=None, +) -> Inspector: + et_record_path = working_dir_path / f"{model_name}_etrecord.bin" + generate_etrecord( + et_record=et_record_path.resolve(), + edge_dialect_program=edge_program, + executorch_program=executorch_program, + ) + + bundled_program_path = working_dir_path / f"{model_name}.bpte" + generate_bundled_program( + executorch_program=executorch_program, + example_inputs=example_inputs, + method_name=method_name, + bundled_program_path=bundled_program_path, + ) + + et_dump_path: Path = working_dir_path / f"{model_name}_etdump.etdp" + debug_buffer_path: Path = working_dir_path / f"{model_name}_debug_output.bin" + generate_etdump_with_intermediate_values( + root_dir_path=root_dir_path, + bundled_program_path=bundled_program_path, + et_dump_path=et_dump_path, + debug_buffer_path=debug_buffer_path, + debug_buffer_size=debug_buffer_size, + ) + + return Inspector( + etdump_path=str(et_dump_path.resolve()), + etrecord=str(et_record_path.resolve()), + debug_buffer_path=str(debug_buffer_path.resolve()), + enable_module_hierarchy=True, + delegate_metadata_parser=delegate_metadata_parser, + delegate_time_scale_converter=delegate_time_scale_converter, + ) + + +def parse_coreml_delegate_metadata(delegate_metadatas: List[str]) -> Dict[str, Any]: + if len(delegate_metadatas) == 0: + return + try: + coreml_metadata: Dict[str, Any] = json.loads(delegate_metadatas[0]) + result: Dict[str, str] = {} + for col_key, col_name in 
COREML_METADATA_KEYS: + value = coreml_metadata.get(col_key, None) + if value is not None: + result[col_name] = value + return result + + except ValueError: + return {} + + +def convert_coreml_delegate_time( + event_name: Union[str, int], input_time: Union[int, float] +) -> Union[int, float]: + return input_time / (1000 * 1000) + + +def create_inspector_coreml( + edge_program: EdgeProgramManager, + compile_specs: List[CompileSpec], + example_inputs: Tuple[Value, ...], + model_name: str, + root_dir_path: Path, + working_dir_path: Path, + method_name: str = "forward", + debug_buffer_size: int = 1 * 1024 * 1024 * 1024, +) -> Inspector: + edge_program_copy = copy.deepcopy(edge_program) + executorch_program = lower_and_export_edge_to_coreml( + edge_program=edge_program_copy, + compile_specs=compile_specs, + config=_EDGE_BACKEND_CONFIG, + ) + return create_inspector( + edge_program=edge_program, + executorch_program=executorch_program, + example_inputs=example_inputs, + root_dir_path=root_dir_path, + model_name=f"{model_name}_coreml", + working_dir_path=working_dir_path, + method_name=method_name, + debug_buffer_size=debug_buffer_size, + delegate_metadata_parser=parse_coreml_delegate_metadata, + delegate_time_scale_converter=convert_coreml_delegate_time, + ) + + +def create_inspector_reference( + edge_program: EdgeProgramManager, + example_inputs: Tuple[Value, ...], + model_name: str, + root_dir_path: Path, + working_dir_path: Path, + method_name: str = "forward", + debug_buffer_size: int = 1 * 1024 * 1024 * 1024, +) -> Inspector: + edge_program_copy = copy.deepcopy(edge_program) + return create_inspector( + edge_program=edge_program, + executorch_program=edge_program_copy.to_executorch(), + example_inputs=example_inputs, + root_dir_path=root_dir_path, + model_name=f"{model_name}_default", + working_dir_path=working_dir_path, + method_name=method_name, + debug_buffer_size=debug_buffer_size, + ) + + +def get_debug_handle_to_event_map( + inspector: Inspector, + event_block_name: str = "Execute", +) -> Dict[int, Event]: + result = {} + + def is_not_blank(s): + return bool(s and not s.isspace()) + + event_names_to_ignore = {"DELEGATE_CALL", "OPERATOR_CALL"} + for event_block in inspector.event_blocks: + if event_block.name == event_block_name: + for event in event_block.events: + if is_not_blank(event.name) and event.name not in event_names_to_ignore: + debug_handles = [] + if isinstance(event.debug_handles, int): + debug_handles.append(event.debug_handles) + elif isinstance(event.debug_handles, list): + debug_handles.extend(event.debug_handles) + debug_handles.sort() + for debug_handle in debug_handles: + if len(event.debug_data) > 0: + result[debug_handle] = event + return result + + +@dataclass +class EventData: + tag: str + event: Event + + +@dataclass +class ComparisonResult: + datas: List[tuple[EventData, EventData]] + + def to_dataframe( + self, + atol: float = 1e-3, + rtol: float = 1e-3, + ) -> pd.DataFrame: + def get_compute_device(event: Event) -> str: + if event.delegate_backend_name == CoreMLBackend.__name__: + return event.delegate_debug_metadatas.get( + "coreml_preferred_device", "CPU" + ) + + return "CPU" + + if len(self.datas) == 0: + return + + (data1, data2) = self.datas[0] + dict = { + data1.tag: [], + f"{data1.tag}_compute_unit": [], + data2.tag: [], + f"{data2.tag}_compute_unit": [], + "max_diff": [], + } + + for data1, data2 in self.datas: + event1 = data1.event + event2 = data2.event + debug_data1 = event1.debug_data[0] + debug_data2 = event2.debug_data[0] + + if 
debug_data1.size() != debug_data2.size(): + continue + + max_diff = 0.0 + indices = torch.isclose( + debug_data1, debug_data2, atol=atol, rtol=rtol + ).logical_not() + + # Find the maximum difference + if torch.count_nonzero(indices) > 0: + values1 = torch.masked_select(debug_data1, indices) + values2 = torch.masked_select(debug_data2, indices) + diff = torch.abs(values1 - values2) + max_diff = torch.max(diff).item() + + dict[f"{data1.tag}_compute_unit"].append(get_compute_device(event1)) + dict[f"{data2.tag}_compute_unit"].append(get_compute_device(event2)) + dict["max_diff"].append(max_diff) + dict[data1.tag].append(event1.name) + dict[data2.tag].append(event2.name) + + return pd.DataFrame(dict) + + +def get_comparison_result( + inspector1: Inspector, + tag1: str, + inspector2: Inspector, + tag2: str, +) -> ComparisonResult: + debug_handle_event_map_1 = get_debug_handle_to_event_map(inspector1) + debug_handle_event_map_2 = get_debug_handle_to_event_map(inspector2) + + event_datas = [] + for handle, event1 in debug_handle_event_map_1.items(): + event2 = debug_handle_event_map_2.get(handle, None) + if event2 is None: + continue + + event_data1 = EventData(tag=tag1, event=event1) + event_data2 = EventData(tag=tag2, event=event2) + event_datas.append((event_data1, event_data2)) + + return ComparisonResult(datas=event_datas) diff --git a/examples/apple/mps/CMakeLists.txt b/examples/apple/mps/CMakeLists.txt index 9ae528668c..319d8159ce 100644 --- a/examples/apple/mps/CMakeLists.txt +++ b/examples/apple/mps/CMakeLists.txt @@ -92,8 +92,8 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") include(${EXECUTORCH_SRCS_FILE}) target_include_directories( bundled_program - INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../sdk/include - ${CMAKE_CURRENT_BINARY_DIR}/../../../sdk/bundled_program + INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/include + ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/bundled_program ${EXECUTORCH_ROOT}/third-party/flatbuffers/include ${EXECUTORCH_ROOT}/third-party/flatcc/include ${_mps_schema_headers} @@ -107,6 +107,10 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") set(FLATCC_LIB flatccrt) endif() + if(CMAKE_BUILD_TYPE MATCHES "Debug") + target_link_options(mps_executor_runner PUBLIC -fsanitize=undefined) + endif() + target_link_libraries( mps_executor_runner bundled_program diff --git a/examples/apple/mps/README.md b/examples/apple/mps/README.md index bebd1329be..dc01d585f8 100644 --- a/examples/apple/mps/README.md +++ b/examples/apple/mps/README.md @@ -30,7 +30,7 @@ Once we have the model binary file, then let's run it with the ExecuTorch runtim # Build and install executorch cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ diff --git a/examples/apple/mps/executor_runner/mps_executor_runner.mm b/examples/apple/mps/executor_runner/mps_executor_runner.mm index 604419a620..e3d0e2978b 100644 --- a/examples/apple/mps/executor_runner/mps_executor_runner.mm +++ b/examples/apple/mps/executor_runner/mps_executor_runner.mm @@ -30,8 +30,8 @@ #include #include #include -#include -#include +#include +#include #include using namespace std::chrono; @@ -97,8 +97,26 @@ 262144, // 256 KB "Size of the debug buffer in bytes to allocate for intermediate outputs and program outputs logging."); -using namespace torch::executor; -using 
torch::executor::util::FileDataLoader; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::extension::BufferCleanup; +using executorch::extension::BufferDataLoader; +using executorch::extension::FileDataLoader; +using executorch::runtime::DataLoader; +using executorch::runtime::EValue; +using executorch::runtime::Error; +using executorch::runtime::EventTracerDebugLogLevel; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; + +namespace bundled_program = executorch::bundled_program; int main(int argc, char** argv) { { @@ -113,7 +131,7 @@ int main(int argc, char** argv) { return 1; } - runtime_init(); + executorch::runtime::runtime_init(); gflags::ParseCommandLineFlags(&argc, &argv, true); if (argc != 1) { @@ -144,20 +162,20 @@ int main(int argc, char** argv) { // Find the offset to the embedded Program. const void* program_data; size_t program_data_len; - Error status = torch::executor::bundled_program::GetProgramData( + Error status = bundled_program::get_program_data( const_cast(file_data->data()), file_data->size(), &program_data, &program_data_len); ET_CHECK_MSG( status == Error::Ok, - "GetProgramData() failed on file '%s': 0x%x", + "get_program_data() failed on file '%s': 0x%x", model_path, (unsigned int)status); // Wrap the buffer in a DataLoader. auto buffer_data_loader = - util::BufferDataLoader(program_data, program_data_len); + BufferDataLoader(program_data, program_data_len); // Parse the program file. This is immutable, and can also be reused between // multiple execution invocations across multiple threads. @@ -239,7 +257,7 @@ HierarchicalAllocator planned_memory( // be used by a single thread at at time, but it can be reused. // - torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + ETDumpGen etdump_gen; Result method = program->load_method(method_name, &memory_manager, &etdump_gen); ET_CHECK_MSG( @@ -263,11 +281,11 @@ HierarchicalAllocator planned_memory( } // Prepare the inputs. - std::unique_ptr inputs; + std::unique_ptr inputs; if (FLAGS_bundled_program) { ET_LOG(Info, "Loading bundled program..."); // Use the inputs embedded in the bundled program. - status = torch::executor::bundled_program::LoadBundledInput( + status = bundled_program::load_bundled_input( *method, file_data->data(), FLAGS_testset_idx); @@ -278,11 +296,11 @@ HierarchicalAllocator planned_memory( } else { ET_LOG(Info, "Loading non-bundled program...\n"); // Use ones-initialized inputs. - auto inputs_result = torch::executor::util::prepare_input_tensors(*method); + auto inputs_result = executorch::extension::prepare_input_tensors(*method); if (inputs_result.ok()) { // Will free the inputs when destroyed. inputs = - std::make_unique(std::move(inputs_result.get())); + std::make_unique(std::move(inputs_result.get())); } } ET_LOG(Info, "Inputs prepared."); @@ -322,14 +340,14 @@ HierarchicalAllocator planned_memory( status = method->get_outputs(outputs.data(), outputs.size()); ET_CHECK(status == Error::Ok); // Print the first and last 100 elements of long lists of scalars. 
- std::cout << torch::executor::util::evalue_edge_items(100); + std::cout << executorch::extension::evalue_edge_items(100); for (int i = 0; i < outputs.size(); ++i) { std::cout << "Output " << i << ": " << outputs[i] << std::endl; } // Dump the etdump data containing profiling/debugging data to the specified // file. - etdump_result result = etdump_gen.get_etdump_data(); + ETDumpResult result = etdump_gen.get_etdump_data(); if (result.buf != nullptr && result.size > 0) { FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); fwrite((uint8_t*)result.buf, 1, result.size, f); @@ -362,7 +380,7 @@ HierarchicalAllocator planned_memory( atol = 1e-01; rtol = 1e-01; } - status = torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput( + status = bundled_program::verify_method_outputs( *method, file_data->data(), FLAGS_testset_idx, diff --git a/examples/apple/mps/executor_runner/targets.bzl b/examples/apple/mps/executor_runner/targets.bzl index fd0a7a5046..14399411ae 100644 --- a/examples/apple/mps/executor_runner/targets.bzl +++ b/examples/apple/mps/executor_runner/targets.bzl @@ -28,9 +28,9 @@ def define_common_targets(): "//executorch/extension/data_loader:file_data_loader", "//executorch/kernels/portable:generated_lib", "//executorch/extension/data_loader:file_data_loader", - "//executorch/sdk/etdump:etdump_flatcc", + "//executorch/devtools/etdump:etdump_flatcc", "//executorch/extension/data_loader:buffer_data_loader", - "//executorch/sdk/bundled_program:runtime", + "//executorch/devtools/bundled_program:runtime", ], external_deps = [ "gflags", diff --git a/examples/apple/mps/scripts/bench_utils.py b/examples/apple/mps/scripts/bench_utils.py index 792e3b6d82..7eca984f82 100644 --- a/examples/apple/mps/scripts/bench_utils.py +++ b/examples/apple/mps/scripts/bench_utils.py @@ -5,36 +5,13 @@ import logging import time +from typing import Tuple import torch +from executorch.backends.apple.mps.test.test_mps_utils import TestMPS from torch.export.exported_program import ExportedProgram -def assert_outputs_equal(model_output, ref_output): - """ - Helper testing function that asserts that the model output and the reference output - are equal with some tolerance. Due to numerical differences between eager mode and - the MPS's backend, we relax the detal such that absolute tolerance is 1e-3. and - relative tolerance is 1e-3. - """ - - # Compare the result from executor and eager mode direclty - if isinstance(ref_output, tuple) or isinstance(ref_output, list): - # Multiple outputs executor always returns tuple, even if there is one output - assert len(ref_output) == len( - model_output - ), "Length of outputs is not matching!" - for i in range(len(ref_output)): - assert torch.allclose( - model_output[i], ref_output[i], atol=1e-03, rtol=1e-03 - ) - else: - # If one output, eager returns tensor while executor tuple of size 1 - assert torch.allclose( - model_output[0], ref_output, atol=1e-03, rtol=1e-03 - ), "Outputs are not matching!" 
- - def bench_forward(func, *args): # warmup for _ in range(10): @@ -101,17 +78,31 @@ def bench_torch(executorch_program: ExportedProgram, model, inputs, model_name): ) -def compare_outputs(executorch_program: ExportedProgram, model, inputs, model_name): +def compare_outputs( + executorch_program: ExportedProgram, + model: torch.nn.Module, + inputs: Tuple[torch.tensor], + model_name: str, + use_fp16: bool, +): + test_module = TestMPS() inputs_copy = [] + if use_fp16: + model = model.to(torch.float16) + model = model for t in inputs: - inputs_copy.append(t.detach().clone()) + tensor = t.detach().clone() + if use_fp16 and tensor.dtype == torch.float32: + tensor = tensor.to(torch.float16) + inputs_copy.append(tensor) inputs_copy = tuple(inputs_copy) - pytorch_results = model(*inputs) + pytorch_results = model(*inputs_copy) + executorch_model = get_executorch_model(executorch_program) if executorch_model is not None: - executorch_results = executorch_model.forward(inputs_copy) - assert_outputs_equal(executorch_results, pytorch_results) + executorch_results = executorch_model.forward(inputs) + test_module.assert_outputs_equal(executorch_results, pytorch_results, use_fp16) logging.info( f"Results between ExecuTorch forward pass with MPS backend and PyTorch forward pass for {model_name} are matching!" ) diff --git a/examples/apple/mps/scripts/build_mps_executor_runner.sh b/examples/apple/mps/scripts/build_mps_executor_runner.sh index 16754588b6..31ab54fd4d 100755 --- a/examples/apple/mps/scripts/build_mps_executor_runner.sh +++ b/examples/apple/mps/scripts/build_mps_executor_runner.sh @@ -41,7 +41,7 @@ rm -rf "$OUTPUT" cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE="$MODE" \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py index 293c6f1404..d6416e0ffc 100644 --- a/examples/apple/mps/scripts/mps_example.py +++ b/examples/apple/mps/scripts/mps_example.py @@ -14,6 +14,11 @@ from executorch import exir from executorch.backends.apple.mps import MPSBackend from executorch.backends.apple.mps.partition import MPSPartitioner +from executorch.devtools import BundledProgram, generate_etrecord +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import ( EdgeCompileConfig, @@ -24,11 +29,6 @@ from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.extension.export_util.utils import export_to_edge, save_pte_program -from executorch.sdk import BundledProgram, generate_etrecord -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from ....models import MODEL_NAME_TO_MODEL from ....models.model_factory import EagerModelFactory @@ -155,6 +155,8 @@ def get_model_config(args): model, example_inputs, _ = EagerModelFactory.create_model(**model_config) model = model.eval() + + # Deep copy the model inputs to check against PyTorch forward pass if args.check_correctness or args.bench_pytorch: model_copy = copy.deepcopy(model) inputs_copy = [] @@ -181,9 +183,7 @@ def 
get_model_config(args): logging.info(f"Lowered graph:\n{edge.exported_program().graph}") executorch_program = edge.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) else: lowered_module = to_backend( @@ -193,11 +193,7 @@ def get_model_config(args): lowered_module, example_inputs, edge_compile_config=exir.EdgeCompileConfig(_check_ir_validity=False), - ).to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) - ) + ).to_executorch(config=ExecutorchBackendConfig(extract_delegate_segments=False)) model_name = f"{args.model_name}_mps" @@ -228,4 +224,6 @@ def get_model_config(args): bench_torch(executorch_program, model_copy, example_inputs, model_name) if args.check_correctness: - compare_outputs(executorch_program, model_copy, inputs_copy, model_name) + compare_outputs( + executorch_program, model_copy, inputs_copy, model_name, args.use_fp16 + ) diff --git a/examples/apple/mps/test_mps.sh b/examples/apple/mps/test_mps.sh index 55712089e0..555161dd3f 100755 --- a/examples/apple/mps/test_mps.sh +++ b/examples/apple/mps/test_mps.sh @@ -11,14 +11,14 @@ set -e # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/../../../.ci/scripts/utils.sh" -cmake_install_executorch_sdk_lib() { +cmake_install_executorch_devtools_lib() { echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" rm -rf cmake-out retry cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ @@ -60,5 +60,5 @@ then fi -cmake_install_executorch_sdk_lib +cmake_install_executorch_devtools_lib test_cmake_mps diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index f854a081fa..4d77e81908 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -226,9 +226,7 @@ def forward(self, x): try: exec_prog = edge.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) except RuntimeError as e: if "Missing out variants" in str(e.args[0]): diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 1f42eda9fb..68c5435dff 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -9,25 +9,38 @@ project(arm_executor_runner) option(SEMIHOSTING "Enable semihosting" OFF) if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING}) - message(FATAL_ERROR - "ET_PTE_FILE_PATH must specify a model .pte, for bare metal systems the " - "model is built into the binary.") + message( + FATAL_ERROR + "ET_PTE_FILE_PATH must specify a model .pte, for bare metal systems the " + "model is built into the binary." + ) endif() # Example ExecuTorch demo for bare metal Cortex-M based systems -set(ET_DIR_PATH "../../.." CACHE PATH - "Path to ExecuTorch dir") -set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH - "Path to ExecuTorch build dir") -set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." 
CACHE PATH - "Path to ExecuTorch headers") -set(ET_PTE_FILE_PATH "" CACHE PATH - "Path to ExecuTorch model pte") -set(ETHOS_SDK_PATH "${ET_DIR_PATH}/examples/arm/ethos-u-scratch/ethos-u" CACHE PATH - "Path to Ethos-U bare metal driver/env") -set(PYTHON_EXECUTABLE "python" CACHE PATH - "Define to override python executable used") - +set(ET_DIR_PATH + "../../.." + CACHE PATH "Path to ExecuTorch dir" +) +set(ET_BUILD_DIR_PATH + "${ET_DIR_PATH}/cmake-out" + CACHE PATH "Path to ExecuTorch build dir" +) +set(ET_INCLUDE_PATH + "${ET_DIR_PATH}/.." + CACHE PATH "Path to ExecuTorch headers" +) +set(ET_PTE_FILE_PATH + "" + CACHE PATH "Path to ExecuTorch model pte" +) +set(ETHOS_SDK_PATH + "${ET_DIR_PATH}/examples/arm/ethos-u-scratch/ethos-u" + CACHE PATH "Path to Ethos-U bare metal driver/env" +) +set(PYTHON_EXECUTABLE + "python" + CACHE PATH "Define to override python executable used" +) get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) @@ -104,23 +117,25 @@ set_property( # Convert pte to header if(NOT ${SEMIHOSTING}) - add_custom_target(gen_model_header - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h) + add_custom_target( + gen_model_header DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + ) add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/pte_to_header.py - --pte ${ET_PTE_FILE_PATH} - --outdir ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${ET_PTE_FILE_PATH} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/pte_to_header.py --pte + ${ET_PTE_FILE_PATH} --outdir ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${ET_PTE_FILE_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) endif() # The arm_executor_runner executable add_executable(arm_executor_runner) -target_sources(arm_executor_runner PRIVATE arm_executor_runner.cpp) +target_sources( + arm_executor_runner PRIVATE arm_executor_runner.cpp arm_perf_monitor.cpp +) # Include the target's bare-metal linker script ethosu_eval_link_options(arm_executor_runner) @@ -146,19 +161,17 @@ target_include_directories( arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR} ) - - if(SEMIHOSTING) -target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) + target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) else() -add_dependencies(arm_executor_runner gen_model_header) + add_dependencies(arm_executor_runner gen_model_header) endif() # Fixup compilation of retarget.c if(SEMIHOSTING) -# Remove this when MLBEDSW-8910 is closed. -set_source_files_properties( - ${ETHOS_SDK_PATH}/core_platform/targets/corstone-300/retarget.c - PROPERTIES HEADER_FILE_ONLY TRUE -) + # Remove this when MLBEDSW-8910 is closed. + set_source_files_properties( + ${ETHOS_SDK_PATH}/core_platform/targets/corstone-300/retarget.c + PROPERTIES HEADER_FILE_ONLY TRUE + ) endif() diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index c5528b2519..7cc27c7cce 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -20,23 +20,28 @@ #include #include -/** - * This header file is generated by the build process based on the .pte file - * specified in the ET_PTE_FILE_PATH variable to the cmake build. 
- * Control of the action of the .pte, it's use of operators and delegates, and - * which are included in the bare metal build are also orchestrated by the - * CMakeLists file. For example use see examples/arm/run.sh - */ +#include "arm_perf_monitor.h" + #ifdef SEMIHOSTING -// TODO: Verify the section attribute to match the linker script -// pending MLETORCH-39 -const size_t input_allocation_pool_size = 1 * 1024 * 1024; +// In our unit test flow, we have the capability to provide an enitre model to +// the Corstone-3xx FVP using semi hosting. Hence, the input allocation pool +// needs to be large enough to take an entire model. On the FVP, +// network_model_sec is linked to the DDR, which is large (256MB on +// Corstone-300). +const size_t input_allocation_pool_size = 100 * 1024 * 1024; unsigned char __attribute__(( section("network_model_sec"), aligned(16))) input_allocation_pool[input_allocation_pool_size]; // memory for the model will be allocated from the input_allocation_pool char* model_pte = nullptr; #else +/** + * This header file is generated by the build process based on the .pte file + * specified in the ET_PTE_FILE_PATH variable to the cmake build. + * Control of the action of the .pte, it's use of operators and delegates, and + * which are included in the bare metal build are also orchestrated by the + * CMakeLists file. For example use see examples/arm/run.sh + */ #include "model_pte.h" #endif @@ -50,9 +55,14 @@ unsigned char __attribute__(( section("network_model_sec"), aligned(16))) method_allocation_pool[METHOD_ALLOCATOR_POOL_SIZE]; +const size_t temp_allocation_pool_size = 1 * 1024 * 1024; +unsigned char __attribute__(( + section("network_model_sec"), + aligned(16))) temp_allocation_pool[temp_allocation_pool_size]; + void et_pal_init(void) {} -__ET_NORETURN void et_pal_abort(void) { +ET_NORETURN void et_pal_abort(void) { #ifndef SEMIHOSTING __builtin_trap(); #else @@ -64,14 +74,15 @@ __ET_NORETURN void et_pal_abort(void) { * Emit a log message via platform output (serial port, console, etc). 
*/ void et_pal_emit_log_message( - __ET_UNUSED et_timestamp_t timestamp, + ET_UNUSED et_timestamp_t timestamp, et_pal_log_level_t level, const char* filename, - __ET_UNUSED const char* function, + ET_UNUSED const char* function, size_t line, const char* message, - __ET_UNUSED size_t length) { - fprintf(stderr, "%c executorch:%s:%zu] %s\n", level, filename, line, message); + ET_UNUSED size_t length) { + fprintf( + stderr, "%c [executorch:%s:%zu] %s\n", level, filename, line, message); } namespace { @@ -85,10 +96,12 @@ Result prepare_input_tensors( size_t num_inputs = method_meta.num_inputs(); size_t num_allocated = 0; +#ifdef SEMIHOSTING ET_CHECK_OR_RETURN_ERROR( input_buffers.size() > 0 && num_inputs == input_buffers.size(), InvalidArgument, "Wrong number of inputs allocated compared to method"); +#endif void** inputs = static_cast(allocator.allocate(num_inputs * sizeof(void*))); @@ -320,8 +333,11 @@ int main(int argc, const char* argv[]) { torch::executor::HierarchicalAllocator planned_memory( {planned_spans.data(), planned_spans.size()}); + torch::executor::MemoryAllocator temp_allocator( + temp_allocation_pool_size, temp_allocation_pool); + torch::executor::MemoryManager memory_manager( - &method_allocator, &planned_memory); + &method_allocator, &planned_memory, &temp_allocator); Result method = program->load_method(method_name, &memory_manager); @@ -349,7 +365,10 @@ int main(int argc, const char* argv[]) { ET_LOG(Info, "Input prepared."); ET_LOG(Info, "Starting the model execution..."); + StartMeasurements(); Error status = method->execute(); + StopMeasurements(); + if (status != Error::Ok) { ET_LOG( Info, @@ -367,6 +386,8 @@ int main(int argc, const char* argv[]) { for (int i = 0; i < outputs.size(); ++i) { Tensor t = outputs[i].toTensor(); #ifndef SEMIHOSTING + // The output might be collected and parsed so printf() is used instead + // of ET_LOG() here for (int j = 0; j < outputs[i].toTensor().numel(); ++j) { if (t.scalar_type() == ScalarType::Int) { printf( diff --git a/examples/arm/executor_runner/arm_perf_monitor.cpp b/examples/arm/executor_runner/arm_perf_monitor.cpp new file mode 100644 index 0000000000..c53d28baab --- /dev/null +++ b/examples/arm/executor_runner/arm_perf_monitor.cpp @@ -0,0 +1,173 @@ +/* Copyright 2024 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include "arm_perf_monitor.h" + +#ifdef ETHOSU +#include +#include +#include + +static uint32_t ethosu_inference_count = 0; +static uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0; +static uint64_t ethosu_ArmBackendExecuteCycleCount = 0; +static uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0; +static uint64_t ethosu_ArmWhenNPURunCycleCount = 0; +static uint64_t ethosu_pmuCycleCount = 0; +static std::vector ethosu_pmuEventCounts( + ETHOSU_PMU_Get_NumEventCounters(), + 0); + +static const uint32_t ethosu_pmuCountersUsed = 4; +// ethosu_pmuCountersUsed should match numbers of counters setup in +// ethosu_inference_begin() and not be more then the HW supports +static_assert(ETHOSU_PMU_NCOUNTERS >= ethosu_pmuCountersUsed); + +extern "C" { + +// Callback invoked at start of NPU execution +void ethosu_inference_begin(struct ethosu_driver* drv, void*) { + // Enable PMU + ETHOSU_PMU_Enable(drv); + ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(drv, ETHOSU_PMU_NPU_IDLE); + ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(drv, ETHOSU_PMU_NPU_ACTIVE); + + // Setup 4 counters + ETHOSU_PMU_Set_EVTYPER(drv, 0, ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(drv, 1, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(drv, 2, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN); + ETHOSU_PMU_Set_EVTYPER(drv, 3, ETHOSU_PMU_NPU_IDLE); + // Enable 4 counters + ETHOSU_PMU_CNTR_Enable(drv, 0xf); + + ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk); + ETHOSU_PMU_CYCCNT_Reset(drv); + + // Reset all counters + ETHOSU_PMU_EVCNTR_ALL_Reset(drv); + + // Save Cortex-M cycle clock to calculate total CPU cycles used in + // ethosu_inference_end() + ethosu_ArmWhenNPURunCycleCountStart = ARM_PMU_Get_CCNTR(); +} + +// Callback invoked at end of NPU execution +void ethosu_inference_end(struct ethosu_driver* drv, void*) { + ethosu_inference_count++; + ethosu_pmuCycleCount += ETHOSU_PMU_Get_CCNTR(drv); + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ethosu_pmuEventCounts[i] += ETHOSU_PMU_Get_EVCNTR(drv, i); + } + ETHOSU_PMU_Disable(drv); + // Add Cortex-M cycle clock used during this NPU execution + ethosu_ArmWhenNPURunCycleCount += + (ARM_PMU_Get_CCNTR() - ethosu_ArmWhenNPURunCycleCountStart); +} + +// Callback invoked at start of ArmBackend::execute() +void ArmBackend_execute_begin() { + // Save Cortex-M cycle clock to calculate total CPU cycles used in + // ArmBackend_execute_end() + ethosu_ArmBackendExecuteCycleCountStart = ARM_PMU_Get_CCNTR(); +} + +// Callback invoked at end of ArmBackend::execute() +void ArmBackend_execute_end() { + // Add Cortex-M cycle clock used during this ArmBackend::execute() + ethosu_ArmBackendExecuteCycleCount += + (ARM_PMU_Get_CCNTR() - ethosu_ArmBackendExecuteCycleCountStart); +} +} + +void StartMeasurements() { + ethosu_ArmBackendExecuteCycleCount = 0; + ethosu_ArmWhenNPURunCycleCount = 0; + ethosu_pmuCycleCount = 0; + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ethosu_pmuEventCounts[i] = 0; + } + ARM_PMU_Enable(); + DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable + ARM_PMU_CYCCNT_Reset(); + ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); +} + +void StopMeasurements() { + ARM_PMU_CNTR_Disable( + PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk | + PMU_CNTENCLR_CNT1_ENABLE_Msk); + uint32_t cycle_count = ARM_PMU_Get_CCNTR(); + + // Number of comand streams handled by the NPU + ET_LOG(Info, "NPU Inferences : %d", ethosu_inference_count); + ET_LOG(Info, "Profiler report, CPU cycles per operator:"); + // This is number 
of CPU cycles for the ethos-u operator from start to finish + // in the framework If there is more then one commandstream the time is added + // together + ET_LOG( + Info, + "ethos-u : cycle_cnt : %d cycles", + ethosu_ArmBackendExecuteCycleCount); + // We could print a list of the cycles used by the other delegates here in the + // future but now we only print ethos-u: this means that "Operator(s) total: + // ..." will be the same number as ethos-u : cycle_cnt and not the sum of all + ET_LOG( + Info, + "Operator(s) total: %d CPU cycles", + ethosu_ArmBackendExecuteCycleCount); + // Total CPU cycles used in the executorch method->execute() + // Other delegates and no delegates are counted in this + ET_LOG(Info, "Inference runtime: %d CPU cycles total", cycle_count); + + ET_LOG( + Info, + "NOTE: CPU cycle values and ratio calculations require FPGA and identical CPU/NPU frequency"); + + // Avoid division with zero if ARM_PMU_Get_CCNTR() is not enabled properly. + if (cycle_count == 0) { + ET_LOG(Info, "Inference CPU ratio: ?.?? %%"); + ET_LOG(Info, "Inference NPU ratio: ?.?? %%"); + } else { + ET_LOG( + Info, + "Inference CPU ratio: %.2f %%", + 100.0 * (cycle_count - ethosu_ArmWhenNPURunCycleCount) / cycle_count); + ET_LOG( + Info, + "Inference NPU ratio: %.2f %%", + 100.0 * ethosu_ArmWhenNPURunCycleCount / cycle_count); + } + + // CPU cycles used by NPU, e.g. number of CPU cycles used between + // ethosu_inference_begin() and ethosu_inference_end() + // If there is more then one commandstream the time is added together + ET_LOG( + Info, + "cpu_wait_for_npu_cntr : %" PRIu64 " CPU cycles", + ethosu_ArmWhenNPURunCycleCount); + + ET_LOG(Info, "Ethos-U PMU report:"); + ET_LOG(Info, "ethosu_pmu_cycle_cntr : %" PRIu64, ethosu_pmuCycleCount); + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ET_LOG(Info, "ethosu_pmu_cntr%zd : %" PRIu64, i, ethosu_pmuEventCounts[i]); + } + ET_LOG( + Info, + "Ethos-U PMU Events:[ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE]"); +} + +#else +void StartMeasurements() {} + +void StopMeasurements() {} + +#endif diff --git a/examples/arm/executor_runner/arm_perf_monitor.h b/examples/arm/executor_runner/arm_perf_monitor.h new file mode 100644 index 0000000000..3925a9a571 --- /dev/null +++ b/examples/arm/executor_runner/arm_perf_monitor.h @@ -0,0 +1,10 @@ +/* Copyright 2024 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +void StartMeasurements(); +void StopMeasurements(); diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 272ddcfc0c..8c39a3a866 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -91,6 +91,7 @@ fi ### Optional user args ######## root_dir=${1:-"${script_dir}/ethos-u-scratch"} +mkdir -p ${root_dir} root_dir=$(realpath ${root_dir}) ######## @@ -215,7 +216,7 @@ function setup_vela() { if [[ ! 
-e ethos-u-vela ]]; then git clone https://review.mlplatform.org/ml/ethos-u/ethos-u-vela repo_dir="${root_dir}/ethos-u-vela" - base_rev=7706c1281166e7611f4300ed26338087152a33c9 + base_rev=d362f5443f67b1e6213a9d8f124edff758efac96 patch_repo fi cd "${root_dir}/ethos-u-vela" @@ -246,7 +247,6 @@ fi cd "${script_dir}" # Setup the root dir -mkdir -p "${root_dir}" cd "${root_dir}" echo "[main] Using root dir ${root_dir}" diff --git a/examples/cadence/models/wav2vec2.py b/examples/cadence/models/wav2vec2.py new file mode 100644 index 0000000000..5db9ea2a6d --- /dev/null +++ b/examples/cadence/models/wav2vec2.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Example script for exporting simple models to flatbuffer + +import logging + +from executorch.backends.cadence.aot.ops_registrations import * # noqa + +import torch + +from executorch.backends.cadence.aot.export_example import export_model +from torchaudio.models.wav2vec2.model import wav2vec2_model, Wav2Vec2Model + +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + + +def main() -> None: + # The wrapper is needed to avoid issues with the optional second arguments + # of Wav2Vec2Models. + class Wav2Vec2ModelWrapper(torch.nn.Module): + def __init__(self, model: Wav2Vec2Model): + super().__init__() + self.model = model + + def forward(self, x): + out, _ = self.model(x) + return out + + _model = wav2vec2_model( + extractor_mode="layer_norm", + extractor_conv_layer_config=None, + extractor_conv_bias=False, + encoder_embed_dim=768, + encoder_projection_dropout=0.1, + encoder_pos_conv_kernel=128, + encoder_pos_conv_groups=16, + encoder_num_layers=12, + encoder_num_heads=12, + encoder_attention_dropout=0.1, + encoder_ff_interm_features=3072, + encoder_ff_interm_dropout=0.0, + encoder_dropout=0.1, + encoder_layer_norm_first=False, + encoder_layer_drop=0.1, + aux_num_out=None, + ) + _model.eval() + + model = Wav2Vec2ModelWrapper(_model) + model.eval() + + # test input + audio_len = 1680 + example_inputs = (torch.rand(1, audio_len),) + + export_model(model, example_inputs) + + +if __name__ == "__main__": + main() diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 1d993da3d4..a60307dd90 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -53,7 +53,7 @@ For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build- After generating the model, copy the model to `assets` directory. ```bash -python -m examples.qualcomm.scripts.deeplab_v3 -b build_android -m SM8450 -s +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8450 -s cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ ``` @@ -78,6 +78,8 @@ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -Bcmake-android-out cmake --build cmake-android-out -j16 --target install @@ -119,6 +121,8 @@ cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -Bcmake-android-out cmake --build cmake-android-out -j16 --target install diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK new file mode 100644 index 0000000000..2b33cef732 --- /dev/null +++ b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK @@ -0,0 +1,67 @@ +load("@fbsource//tools/build_defs:manifold.bzl", "manifold_get") +load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", "fb_android_binary") +load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") +load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") + +manifold_get( + name = "dl3_xnnpack_fp32", + out = "dl3_xnnpack_fp32.pte", + api_key = "executorch-key", + artifact_path = "tree/models/benchmarking/executorch/dl3_xnnpack_fp32.pte", + bucket_name = "executorch", + sha1 = "3e7af1d8f5ec4acb6de156d361715e16e5f53783", + timeout_msec = 120000, +) + +fb_android_resource( + name = "app_res", + assets = "assets", + package = "com.example.executorchdemo", + res = "res", +) + +fb_android_resource( + name = "model_res", + assets = {"dl3_xnnpack_fp32.pte": ":dl3_xnnpack_fp32"}, + package = "com.example.executorchdemo", + res = "res", +) + +fb_android_library( + name = "app_lib", + srcs = [ + "java/com/example/executorchdemo/ClassificationActivity.java", + "java/com/example/executorchdemo/ImageNetClasses.java", + "java/com/example/executorchdemo/MainActivity.java", + "java/com/example/executorchdemo/TensorImageUtils.java", + ], + autoglob = False, + language = "JAVA", + deps = [ + ":app_res", + "//xplat/executorch/extension/android:executorch", + ], +) + +fb_android_binary( + name = "ExecuTorchDemo", + keystore = "//fbandroid/keystores:debug", + manifest = "AndroidManifest.xml", + manifest_entries = { + "min_sdk_version": 19, # Android supports 19 for minimum + "target_sdk_version": 34, + "version_code": "1", + "version_name": "1.0", + }, + package_type = "release", + skip_proguard = True, + deps = [ + ":app_lib", + ":app_res", + ":model_res", + "//third-party/java/androidx/appcompat/appcompat:appcompat", + "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", + "//xplat/executorch/extension/android:executorch", + "//xplat/executorch/extension/android/jni:executorch_jni_full", + ], +) diff --git a/examples/demo-apps/android/ExecuTorchDemo/setup.sh b/examples/demo-apps/android/ExecuTorchDemo/setup.sh index 05dc3e4492..00d9201b09 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/setup.sh +++ b/examples/demo-apps/android/ExecuTorchDemo/setup.sh @@ -15,6 +15,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TESNOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index 7bb36657da..1d4dc6d576 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -1,111 +1,141 @@ -# Building ExecuTorch LLaMA Android Demo App - -This app demonstrates the use of the LLaMA chat app demonstrating local inference use case with ExecuTorch. - -## Prerequisites -* Set up your ExecuTorch repo and environment if you haven’t done so by following the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment. -* Install [Java 17 JDK](https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html). -* Install the [Android SDK API Level 34](https://developer.android.com/about/versions/14/setup-sdk) and - [Android NDK 25.0.8775105](https://developer.android.com/studio/projects/install-ndk). - * If you have Android Studio set up, you can install them with - * Android Studio Settings -> Language & Frameworks -> Android SDK -> SDK Platforms -> Check the row with API Level 34. - * Android Studio Settings -> Language & Frameworks -> Android SDK -> SDK Tools -> Check NDK (Side by side) row. - * Alternatively, you can follow [this guide](https://github.com/pytorch/executorch/blob/856e085b9344c8b0bf220a97976140a5b76356aa/examples/demo-apps/android/LlamaDemo/SDK.md) to set up Java/SDK/NDK with CLI. -* Supported Host OS: CentOS, macOS Sonoma on Apple Silicon. - -Note: This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 25.0.8775105. - -## Getting models -Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. - -After you export the model and generate tokenizer.bin, push them device: -```bash -adb shell mkdir -p /data/local/tmp/llama -adb push llama2.pte /data/local/tmp/llama -adb push tokenizer.bin /data/local/tmp/llama -``` - -Note: The demo app searches in `/data/local/tmp/llama` for .pte and .bin files as LLAMA model and tokenizer. +# ExecuTorch Llama Android Demo App -## Build library -For the demo app to build, we need to build the ExecuTorch AAR library first. +We’re excited to share that the newly revamped Android demo app is live and includes many new updates to provide a more intuitive and smoother user experience with a chat use case! The primary goal of this app is to showcase how easily ExecuTorch can be integrated into an Android demo app and how to exercise the many features ExecuTorch and Llama models have to offer. -The AAR library contains the required Java package and the corresponding JNI -library for using ExecuTorch in your Android app. +This app serves as a valuable resource to inspire your creativity and provide foundational code that you can customize and adapt for your particular use case. -### Alternative 1: Use prebuilt AAR library (recommended) +Please dive in and start exploring our demo app today! We look forward to any feedback and are excited to see your innovative ideas. -1. Open a terminal window and navigate to the root directory of the `executorch`. -2. 
Run the following command to download the prebuilt library: -```bash -bash examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh -``` -The prebuilt AAR library contains the Java library and the JNI binding for -NativePeer.java and ExecuTorch native library, including core ExecuTorch -runtime libraries, XNNPACK backend, Portable kernels, Optimized kernels, -and Quantized kernels. It comes with two ABI variants, arm64-v8a and x86_64. +## Key Concepts +From this demo app, you will learn many key concepts such as: +* How to prepare Llama models, build the ExecuTorch library, and model inferencing across delegates +* Expose the ExecuTorch library via JNI layer +* Familiarity with current ExecuTorch app-facing capabilities -If you want to use the prebuilt library for your own app, please refer to -[Using Android prebuilt libraries (AAR)](./android-prebuilt-library.md) for -tutorial. +The goal is for you to see the type of support ExecuTorch provides and feel comfortable with leveraging it for your use cases. -If you need to use other dependencies (like tokenizer), please refer to -Alternative 2: Build from local machine option. +## Supporting Models +As a whole, the models that this app supports are (varies by delegate): +* Llama 3.1 8B +* Llama 3 8B +* Llama 2 7B +* LLaVA-1.5 vision model (only XNNPACK) -### Alternative 2: Build from local machine -1. Open a terminal window and navigate to the root directory of the `executorch`. -2. Set the following environment variables: -```bash -export ANDROID_NDK= -export ANDROID_ABI=arm64-v8a -``` -Note: `` is the root for the NDK, which is usually under -`~/Library/Android/sdk/ndk/XX.Y.ZZZZZ` for macOS, and contains NOTICE and README.md. -We use `/build/cmake/android.toolchain.cmake` for CMake to cross-compile. -3. (Optional) If you need to use tiktoken as the tokenizer (for LLaMA3), set -`EXECUTORCH_USE_TIKTOKEN=ON` and later CMake will use it as the tokenizer. -If you need to run other models like LLaMA2, skip this skip. +## Building the APK +First it’s important to note that currently ExecuTorch provides support across 3 delegates. Once you identify the delegate of your choice, select the README link to get a complete end-to-end instructions for environment set-up to exporting the models to build ExecuTorch libraries and apps to run on device: -```bash -export EXECUTORCH_USE_TIKTOKEN=ON # Only for LLaMA3 -``` +| Delegate | Resource | +| ------------- | ------------- | +| XNNPACK (CPU-based library) | [link](docs/delegates/xnnpack_README.md) | +| QNN (Qualcomm AI Accelerators) | [link](docs/delegates/qualcomm_README.md) | +| MediaTek (MediaTek AI Accelerators) | [link](docs/delegates/mediatek_README.md) | -4. Build the Android Java extension code: -```bash -pushd extension/android -./gradlew build -popd -``` +## How to Use the App -5. Run the following command set up the required JNI library: -```bash -pushd examples/demo-apps/android/LlamaDemo -./gradlew :app:setup -popd -``` -This is running the shell script [setup.sh](./setup.sh) which configures the required core ExecuTorch, LLAMA2, and Android libraries, builds them, and copy to jniLibs. +This section will provide the main steps to use the app, along with a code snippet of the ExecuTorch API. -## Build APK -### Alternative 1: Android Studio (Recommended) +For loading the app, development, and running on device we recommend Android Studio: 1. Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo. 2. Run the app (^R). 
This builds and launches the app on the phone. -### Alternative 2: Command line -Without Android Studio UI, we can run gradle directly to build the app. We need to set up the Android SDK path and invoke gradle. -```bash -export ANDROID_HOME= -pushd examples/demo-apps/android/LlamaDemo -./gradlew :app:installDebug -popd +### Opening the App + +Below are the UI features for the app. + +Select the settings widget to get started with picking a model, its parameters and any prompts. +

+ + + +### Select Models and Parameters + +Once you've selected the model, tokenizer, and model type you are ready to click on "Load Model" to have the app load the model and go back to the main Chat activity. +

+ + + +Optional Parameters: +* Temperature: Defaulted to 0, you can adjust the temperature for the model as well. The model will reload upon any adjustments. +* System Prompt: Without any formatting, you can enter in a system prompt. For example, "you are a travel assistant" or "give me a response in a few sentences". +* User Prompt: More for the advanced user, if you would like to manually input a prompt then you can do so by modifying the `{{user prompt}}`. You can also modify the special tokens as well. Once changed then go back to the main Chat activity to send. + +> [!TIP] +> Helpful ExecuTorch API in app + +```java +// Upon returning to the Main Chat Activity +mModule = new LlamaModule( + ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), + modelPath, + tokenizerPath, + temperature); +int loadResult = mModule.load(); +``` + +* `modelCategory`: Indicate whether it’s a text-only or vision model +* `modePath`: path to the .pte file +* `tokenizerPath`: path to the tokenizer .bin file +* `temperature`: model parameter to adjust the randomness of the model’s output + + +### User Prompt +Once model is successfully loaded then enter any prompt and click the send (i.e. generate) button to send it to the model. +
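For illustration, a manually formatted prompt might end up looking something like the following (this assumes the Llama 3 instruct template; other model families use different special tokens, and the exact template the app applies may differ):

```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a travel assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

{{user prompt}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```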

+
+You can also ask it follow-up questions.
+

+ +> [!TIP] +> Helpful ExecuTorch API in app +```java +mModule.generate(prompt,sequence_length, MainActivity.this); ``` +* `prompt`: User formatted prompt +* `sequence_length`: Number of tokens to generate in response to a prompt +* `MainActivity.this`: Indicate that the callback functions (OnResult(), OnStats()) are present in this class. -On the phone or emulator, you can try running the model: -Android LLaMA App
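As a rough sketch of how this call can be driven from inside an Activity (assuming `generate()` blocks until the response is complete and that the callbacks may fire on the worker thread; the single-thread executor, the fixed sequence length of 256, and `appendToChat()` are illustrative, not part of the demo code):

```java
// Illustrative sketch only: run generation off the UI thread and stream tokens back.
ExecutorService executor = Executors.newSingleThreadExecutor();
executor.execute(() -> {
    mModule.generate(prompt, 256, new LlamaCallback() {
        @Override
        public void onResult(String token) {
            // Invoked once per generated token until the response is complete.
            runOnUiThread(() -> appendToChat(token));
        }

        @Override
        public void onStats(float tps) {
            // Tokens-per-second reported by the runtime after generation.
            Log.i("LlamaDemo", "tps=" + tps);
        }
    });
});
```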
+[*LLaVA-1.5: Only for XNNPACK delegate*]
-## Takeaways
-Through this tutorial we've learnt how to build the ExecuTorch LLAMA library, and expose it to JNI layer to build the Android app.
+To use LLaVA-1.5, select the exported LLaVA .pte file and its tokenizer in the Settings menu and load the model. You can then send an image from your gallery, or take a live picture, along with a text prompt to the model.
+

+
+
+### Output Generated
+Here is the model's complete, detailed response to the follow-up question above.

+ +> [!TIP] +> Helpful ExecuTorch API in app + +Ensure you have the following functions in your callback class that you provided in the `mModule.generate()`. For this example, it is `MainActivity.this`. +```java + @Override + public void onResult(String result) { + //...result contains token from response + //.. onResult will continue to be invoked until response is complete + } + + @Override + public void onStats(float tps) { + //...tps (tokens per second) stats is provided by framework + } + +``` ## Reporting Issues If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/demo-apps/android/LlamaDemo/android-llama2-device-farm-test-spec.yml b/examples/demo-apps/android/LlamaDemo/android-llama2-device-farm-test-spec.yml deleted file mode 100644 index 4df9f18cc5..0000000000 --- a/examples/demo-apps/android/LlamaDemo/android-llama2-device-farm-test-spec.yml +++ /dev/null @@ -1,76 +0,0 @@ -version: 0.1 - -android_test_host: amazon_linux_2 - -phases: - install: - commands: - - pre_test: - commands: - # Prepare the model and the tokenizer - - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /sdcard/" - - adb -s $DEVICEFARM_DEVICE_UDID shell "mkdir -p /data/local/tmp/llama/" - - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/tokenizer.bin /data/local/tmp/llama/tokenizer.bin" - - adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/xnnpack_llama2.pte /data/local/tmp/llama/xnnpack_llama2.pte" - - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/tokenizer.bin" - - adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/xnnpack_llama2.pte" - - adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /data/local/tmp/llama/" - - test: - commands: - # By default, the following ADB command is used by Device Farm to run your Instrumentation test. - # Please refer to Android's documentation for more options on running instrumentation tests with adb: - # https://developer.android.com/studio/test/command-line#run-tests-with-adb - - echo "Starting the Instrumentation test" - - | - adb -s $DEVICEFARM_DEVICE_UDID shell "am instrument -r -w --no-window-animation \ - $DEVICEFARM_TEST_PACKAGE_NAME/$DEVICEFARM_TEST_PACKAGE_RUNNER 2>&1 || echo \": -1\"" | - tee $DEVICEFARM_LOG_DIR/instrument.log - - # Parse the results - - |- - INSTRUMENT_LOG="$DEVICEFARM_LOG_DIR/instrument.log" - - DID_ANY_TESTS_START=$(grep "INSTRUMENTATION_STATUS_CODE: 1" $INSTRUMENT_LOG | wc -l); - TESTS_PASSED=$(grep "INSTRUMENTATION_STATUS_CODE: 0" $INSTRUMENT_LOG | wc -l); - TESTS_ERRORED=$(grep "INSTRUMENTATION_STATUS_CODE: -1" $INSTRUMENT_LOG | wc -l); - TESTS_FAILED=$(grep "INSTRUMENTATION_STATUS_CODE: -2" $INSTRUMENT_LOG | wc -l); - TESTS_IGNORED=$(grep "INSTRUMENTATION_STATUS_CODE: -3" $INSTRUMENT_LOG | wc -l); - TESTS_ASSUMPTION_FAILED=$(grep "INSTRUMENTATION_STATUS_CODE: -4" $INSTRUMENT_LOG | wc -l); - TESTS_PROCESSES_CRASHED=$(grep "INSTRUMENTATION_RESULT: shortMsg=Process crashed." 
$INSTRUMENT_LOG | wc -l); - - # And print the results so that the CI job can show them later - - | - INSTRUMENT_LOG="$DEVICEFARM_LOG_DIR/instrument.log" - - if [ $DID_ANY_TESTS_START -eq 0 ]; - then - echo "[PyTorch] Marking the test suite as failed because no tests started!"; - false; - elif [ $TESTS_FAILED -ne 0 ]; - then - OBSERVED_TPS=$(grep "The observed TPS " $INSTRUMENT_LOG | tail -n 1) - - if [ -n "${OBSERVED_TPS}" ]; - then - echo "[PyTorch] ${OBSERVED_TPS}"; - else - echo "[PyTorch] Marking the test suite as failed because it failed to load the model"; - fi - elif [ $TESTS_ERRORED -ne 0 ]; - then - echo "[PyTorch] Marking the test suite as failed because $TESTS_ERRORED tests errored!"; - false; - elif [ $TESTS_PROCESSES_CRASHED -ne 0 ]; - then - echo "[PyTorch] Marking the test suite as failed because the app crashed due to OOM!"; - false; - fi; - - post_test: - commands: - -artifacts: - # By default, Device Farm will collect your artifacts from the $DEVICEFARM_LOG_DIR directory. - - $DEVICEFARM_LOG_DIR diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts index 3c168689f7..37c8cbf0ba 100644 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts @@ -17,7 +17,7 @@ android { defaultConfig { applicationId = "com.example.executorchllamademo" - minSdk = 24 + minSdk = 28 targetSdk = 33 versionCode = 1 versionName = "1.0" @@ -56,7 +56,10 @@ dependencies { implementation("androidx.camera:camera-core:1.3.0-rc02") implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") + implementation("com.google.code.gson:gson:2.8.6") implementation(files("libs/executorch-llama.aar")) + implementation("com.google.android.material:material:1.12.0") + implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.1.5") androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") diff --git a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java b/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java index b8988d1f4b..221a9bd741 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java @@ -8,12 +8,15 @@ package com.example.executorchllamademo; -import static junit.framework.TestCase.assertTrue; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import android.os.Bundle; import androidx.test.ext.junit.runners.AndroidJUnit4; +import androidx.test.platform.app.InstrumentationRegistry; +import java.io.File; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.junit.Test; import org.junit.runner.RunWith; @@ -24,33 +27,35 @@ public class PerfTest implements LlamaCallback { private static final String RESOURCE_PATH = "/data/local/tmp/llama/"; - private static final String MODEL_NAME = "xnnpack_llama2.pte"; private static final String TOKENIZER_BIN = "tokenizer.bin"; - // From https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md - private static final Float EXPECTED_TPS = 10.0F; - private final List results = new ArrayList<>(); 
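    // tokensPerSecond collects the TPS values reported through onStats(); the
    // updated test below scans RESOURCE_PATH for every .pte file and reports
    // ModelName and TPS via the instrumentation status bundle instead of
    // asserting a fixed EXPECTED_TPS threshold.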
private final List tokensPerSecond = new ArrayList<>(); @Test public void testTokensPerSecond() { - String modelPath = RESOURCE_PATH + MODEL_NAME; String tokenizerPath = RESOURCE_PATH + TOKENIZER_BIN; - LlamaModule mModule = new LlamaModule(modelPath, tokenizerPath, 0.8f); + // Find out the model name + File directory = new File(RESOURCE_PATH); + Arrays.stream(directory.listFiles()) + .filter(file -> file.getName().endsWith(".pte")) + .forEach( + model -> { + LlamaModule mModule = new LlamaModule(model.getPath(), tokenizerPath, 0.8f); + // Print the model name because there might be more than one of them + report("ModelName", model.getName()); - int loadResult = mModule.load(); - // Check that the model can be load successfully - assertEquals(0, loadResult); + int loadResult = mModule.load(); + // Check that the model can be load successfully + assertEquals(0, loadResult); - // Run a testing prompt - mModule.generate("How do you do! I'm testing llama2 on mobile device", PerfTest.this); - assertFalse(tokensPerSecond.isEmpty()); + // Run a testing prompt + mModule.generate("How do you do! I'm testing llama2 on mobile device", PerfTest.this); + assertFalse(tokensPerSecond.isEmpty()); - final Float tps = tokensPerSecond.get(tokensPerSecond.size() - 1); - assertTrue( - "The observed TPS " + tps + " is less than the expected TPS " + EXPECTED_TPS, - tps >= EXPECTED_TPS); + final Float tps = tokensPerSecond.get(tokensPerSecond.size() - 1); + report("TPS", tps); + }); } @Override @@ -62,4 +67,16 @@ public void onResult(String result) { public void onStats(float tps) { tokensPerSecond.add(tps); } + + private void report(final String metric, final Float value) { + Bundle bundle = new Bundle(); + bundle.putFloat(metric, value); + InstrumentationRegistry.getInstrumentation().sendStatus(0, bundle); + } + + private void report(final String key, final String value) { + Bundle bundle = new Bundle(); + bundle.putString(key, value); + InstrumentationRegistry.getInstrumentation().sendStatus(0, bundle); + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml index 3eaf301b5a..02d8503a4d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml @@ -3,38 +3,59 @@ xmlns:tools="http://schemas.android.com/tools" package="com.example.executorchllamademo"> - + + + + + + + - + + android:theme="@style/Theme.AppCompat.Light.NoActionBar"> + + + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK new file mode 100644 index 0000000000..80315c4104 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK @@ -0,0 +1,65 @@ +load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", "fb_android_binary") +load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") +load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") + +oncall("executorch") + +fb_android_resource( + name = "app_res", + package = "com.example.executorchllamademo", + res = "res", +) + +fb_android_library( + name = "app_lib", + srcs = [ + "java/com/example/executorchllamademo/AppLog.java", + "java/com/example/executorchllamademo/DemoSharedPreferences.java", + "java/com/example/executorchllamademo/ETImage.java", + "java/com/example/executorchllamademo/ETLogging.java", + 
"java/com/example/executorchllamademo/LlmBenchmarkRunner.java", + "java/com/example/executorchllamademo/LogsActivity.java", + "java/com/example/executorchllamademo/LogsAdapter.java", + "java/com/example/executorchllamademo/MainActivity.java", + "java/com/example/executorchllamademo/Message.java", + "java/com/example/executorchllamademo/MessageAdapter.java", + "java/com/example/executorchllamademo/MessageType.java", + "java/com/example/executorchllamademo/ModelRunner.java", + "java/com/example/executorchllamademo/ModelRunnerCallback.java", + "java/com/example/executorchllamademo/ModelType.java", + "java/com/example/executorchllamademo/ModelUtils.java", + "java/com/example/executorchllamademo/PromptFormat.java", + "java/com/example/executorchllamademo/SettingsActivity.java", + "java/com/example/executorchllamademo/SettingsFields.java", + ], + autoglob = False, + language = "JAVA", + deps = [ + ":app_res", + "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", + "//third-party/java/com/google/code/gson/gson:gson", + "//xplat/executorch/extension/android:executorch_llama", + ], +) + +fb_android_binary( + name = "ExecuTorchLlamaDemo", + keystore = "//fbandroid/keystores:debug", + manifest = "AndroidManifest.xml", + manifest_entries = { + "min_sdk_version": 21, + "target_sdk_version": 34, + "version_code": "1", + "version_name": "1.0", + }, + package_type = "release", + skip_proguard = True, + deps = [ + ":app_lib", + ":app_res", + "//third-party/java/androidx/appcompat/appcompat:appcompat", + "//third-party/java/com/google/code/gson/gson:gson", + "//xplat/executorch/extension/android:executorch_llama", + "//xplat/executorch/extension/android/jni:executorch_llama_jni", + ], +) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java new file mode 100644 index 0000000000..36d0741938 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; + +public class AppLog { + private final Long timestamp; + private final String message; + + public AppLog(String message) { + this.timestamp = getCurrentTimeStamp(); + this.message = message; + } + + public Long getTimestamp() { + return timestamp; + } + + public String getMessage() { + return message; + } + + public String getFormattedLog() { + return "[" + getFormattedTimeStamp() + "] " + message; + } + + private Long getCurrentTimeStamp() { + return System.currentTimeMillis(); + } + + private String getFormattedTimeStamp() { + return formatDate(timestamp); + } + + private String formatDate(long milliseconds) { + SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.getDefault()); + Date date = new Date(milliseconds); + return formatter.format(date); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java new file mode 100644 index 0000000000..99a94c00eb --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.content.Context; +import android.content.SharedPreferences; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; +import java.lang.reflect.Type; +import java.util.ArrayList; + +public class DemoSharedPreferences { + Context context; + SharedPreferences sharedPreferences; + + public DemoSharedPreferences(Context context) { + this.context = context; + this.sharedPreferences = getSharedPrefs(); + } + + private SharedPreferences getSharedPrefs() { + return context.getSharedPreferences( + context.getString(R.string.demo_pref_file_key), Context.MODE_PRIVATE); + } + + public String getSavedMessages() { + return sharedPreferences.getString(context.getString(R.string.saved_messages_json_key), ""); + } + + public void addMessages(MessageAdapter messageAdapter) { + SharedPreferences.Editor editor = sharedPreferences.edit(); + Gson gson = new Gson(); + String msgJSON = gson.toJson(messageAdapter.getSavedMessages()); + editor.putString(context.getString(R.string.saved_messages_json_key), msgJSON); + editor.apply(); + } + + public void removeExistingMessages() { + SharedPreferences.Editor editor = sharedPreferences.edit(); + editor.remove(context.getString(R.string.saved_messages_json_key)); + editor.apply(); + } + + public void addSettings(SettingsFields settingsFields) { + SharedPreferences.Editor editor = sharedPreferences.edit(); + Gson gson = new Gson(); + String settingsJSON = gson.toJson(settingsFields); + editor.putString(context.getString(R.string.settings_json_key), settingsJSON); + editor.apply(); + } + + public String getSettings() { + return sharedPreferences.getString(context.getString(R.string.settings_json_key), ""); + } + + public void saveLogs() { + SharedPreferences.Editor editor = sharedPreferences.edit(); + Gson gson = new Gson(); + String msgJSON = gson.toJson(ETLogging.getInstance().getLogs()); + 
editor.putString(context.getString(R.string.logs_json_key), msgJSON); + editor.apply(); + } + + public void removeExistingLogs() { + SharedPreferences.Editor editor = sharedPreferences.edit(); + editor.remove(context.getString(R.string.logs_json_key)); + editor.apply(); + } + + public ArrayList getSavedLogs() { + String logsJSONString = + sharedPreferences.getString(context.getString(R.string.logs_json_key), null); + if (logsJSONString == null || logsJSONString.isEmpty()) { + return new ArrayList<>(); + } + Gson gson = new Gson(); + Type type = new TypeToken>() {}.getType(); + ArrayList appLogs = gson.fromJson(logsJSONString, type); + if (appLogs == null) { + return new ArrayList<>(); + } + return appLogs; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java new file mode 100644 index 0000000000..e68c847262 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.content.ContentResolver; +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.graphics.Color; +import android.net.Uri; +import androidx.annotation.Nullable; +import java.io.FileNotFoundException; +import java.io.InputStream; + +public class ETImage { + private int width; + private int height; + private final byte[] bytes; + private final Uri uri; + private final ContentResolver contentResolver; + + ETImage(ContentResolver contentResolver, Uri uri) { + this.contentResolver = contentResolver; + this.uri = uri; + bytes = getBytesFromImageURI(uri); + } + + public int getWidth() { + return width; + } + + public int getHeight() { + return height; + } + + public Uri getUri() { + return uri; + } + + public byte[] getBytes() { + return bytes; + } + + public int[] getInts() { + // We need to convert the byte array to an int array because + // the runner expects an int array as input. + int[] intArray = new int[bytes.length]; + for (int i = 0; i < bytes.length; i++) { + intArray[i] = (bytes[i++] & 0xFF); + } + return intArray; + } + + private byte[] getBytesFromImageURI(Uri uri) { + try { + int RESIZED_IMAGE_WIDTH = 336; + Bitmap bitmap = resizeImage(uri, RESIZED_IMAGE_WIDTH); + + if (bitmap == null) { + ETLogging.getInstance().log("Unable to get bytes from Image URI. 
Bitmap is null"); + return new byte[0]; + } + + width = bitmap.getWidth(); + height = bitmap.getHeight(); + + byte[] rgbValues = new byte[width * height * 3]; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + // Get the color of the current pixel + int color = bitmap.getPixel(x, y); + + // Extract the RGB values from the color + int red = Color.red(color); + int green = Color.green(color); + int blue = Color.blue(color); + + // Store the RGB values in the byte array + rgbValues[y * width + x] = (byte) red; + rgbValues[(y * width + x) + height * width] = (byte) green; + rgbValues[(y * width + x) + 2 * height * width] = (byte) blue; + } + } + return rgbValues; + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } + } + + @Nullable + private Bitmap resizeImage(Uri uri, int maxLength) throws FileNotFoundException { + InputStream inputStream = contentResolver.openInputStream(uri); + if (inputStream == null) { + ETLogging.getInstance().log("Unable to resize image, input streams is null"); + return null; + } + Bitmap bitmap = BitmapFactory.decodeStream(inputStream); + if (bitmap == null) { + ETLogging.getInstance().log("Unable to resize image, bitmap during decode stream is null"); + return null; + } + + float aspectRatio; + int finalWidth, finalHeight; + + if (bitmap.getWidth() > bitmap.getHeight()) { + // width > height --> width = maxLength, height scale with aspect ratio + aspectRatio = bitmap.getWidth() / (float) bitmap.getHeight(); + finalWidth = maxLength; + finalHeight = Math.round(maxLength / aspectRatio); + } else { + // height >= width --> height = maxLength, width scale with aspect ratio + aspectRatio = bitmap.getHeight() / (float) bitmap.getWidth(); + finalHeight = maxLength; + finalWidth = Math.round(maxLength / aspectRatio); + } + + return Bitmap.createScaledBitmap(bitmap, finalWidth, finalHeight, false); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java new file mode 100644 index 0000000000..e595348945 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java @@ -0,0 +1,54 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +import android.app.Application; +import android.util.Log; +import java.util.ArrayList; + +public class ETLogging extends Application { + private static ETLogging singleton; + + private ArrayList logs; + private DemoSharedPreferences mDemoSharedPreferences; + + @Override + public void onCreate() { + super.onCreate(); + singleton = this; + mDemoSharedPreferences = new DemoSharedPreferences(this.getApplicationContext()); + logs = mDemoSharedPreferences.getSavedLogs(); + if (logs == null) { // We don't have existing sharedPreference stored + logs = new ArrayList<>(); + } + } + + public static ETLogging getInstance() { + return singleton; + } + + public void log(String message) { + AppLog appLog = new AppLog(message); + logs.add(appLog); + Log.d("ETLogging", appLog.getMessage()); + } + + public ArrayList getLogs() { + return logs; + } + + public void clearLogs() { + logs.clear(); + mDemoSharedPreferences.removeExistingLogs(); + } + + public void saveLogs() { + mDemoSharedPreferences.saveLogs(); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java new file mode 100644 index 0000000000..7236fe317b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java @@ -0,0 +1,223 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.app.Activity; +import android.app.ActivityManager; +import android.content.Intent; +import android.os.Build; +import android.os.Bundle; +import android.util.Log; +import android.widget.TextView; +import androidx.annotation.NonNull; +import com.google.gson.Gson; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback { + ModelRunner mModelRunner; + + String mPrompt; + TextView mTextView; + StatsDump mStatsDump; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_benchmarking); + mTextView = findViewById(R.id.log_view); + + Intent intent = getIntent(); + + File modelDir = new File(intent.getStringExtra("model_dir")); + File model = + Arrays.stream(modelDir.listFiles()) + .filter(file -> file.getName().endsWith(".pte")) + .findFirst() + .get(); + String tokenizerPath = intent.getStringExtra("tokenizer_path"); + + float temperature = intent.getFloatExtra("temperature", 0.8f); + mPrompt = intent.getStringExtra("prompt"); + if (mPrompt == null) { + mPrompt = "The ultimate answer"; + } + + mStatsDump = new StatsDump(); + mStatsDump.modelName = model.getName().replace(".pte", ""); + mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); + mStatsDump.loadStart = System.nanoTime(); + } + + @Override + public void onModelLoaded(int status) { + mStatsDump.loadEnd = System.nanoTime(); + mStatsDump.loadStatus = status; + if (status != 0) { + Log.e("LlmBenchmarkRunner", "Loaded failed: " 
+ status); + onGenerationStopped(); + return; + } + mStatsDump.generateStart = System.nanoTime(); + mModelRunner.generate(mPrompt); + } + + @Override + public void onTokenGenerated(String token) { + runOnUiThread( + () -> { + mTextView.append(token); + }); + } + + @Override + public void onStats(String stats) { + mStatsDump.tokens = stats; + } + + @Override + public void onGenerationStopped() { + mStatsDump.generateEnd = System.nanoTime(); + runOnUiThread( + () -> { + mTextView.append(mStatsDump.toString()); + }); + + final BenchmarkMetric.BenchmarkModel benchmarkModel = + BenchmarkMetric.extractBackendAndQuantization(mStatsDump.modelName); + final List results = new ArrayList<>(); + // The list of metrics we have atm includes: + // Load status + results.add(new BenchmarkMetric(benchmarkModel, "load_status", mStatsDump.loadStatus, 0)); + // Model load time + results.add( + new BenchmarkMetric( + benchmarkModel, + "model_load_time(ms)", + (mStatsDump.loadEnd - mStatsDump.loadStart) * 1e-6, + 0.0f)); + // LLM generate time + results.add( + new BenchmarkMetric( + benchmarkModel, + "generate_time(ms)", + (mStatsDump.generateEnd - mStatsDump.generateStart) * 1e-6, + 0.0f)); + // Token per second + results.add( + new BenchmarkMetric(benchmarkModel, "token_per_sec", extractTPS(mStatsDump.tokens), 0.0f)); + + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { + Gson gson = new Gson(); + writer.write(gson.toJson(results)); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private double extractTPS(final String tokens) { + final Matcher m = Pattern.compile("\\d+\\.?\\d*").matcher(tokens); + if (m.find()) { + return Double.parseDouble(m.group()); + } else { + return 0.0f; + } + } +} + +class BenchmarkMetric { + public static class BenchmarkModel { + // The model name, i.e. stories110M + String name; + String backend; + String quantization; + + public BenchmarkModel(final String name, final String backend, final String quantization) { + this.name = name; + this.backend = backend; + this.quantization = quantization; + } + } + + BenchmarkModel benchmarkModel; + + // The metric name, i.e. 
TPS + String metric; + + // The actual value and the option target value + double actualValue; + double targetValue; + + public static class DeviceInfo { + // Let's see which information we want to include here + final String device = Build.BRAND; + // The phone model and Android release version + final String arch = Build.MODEL; + final String os = "Android " + Build.VERSION.RELEASE; + final long totalMem = new ActivityManager.MemoryInfo().totalMem; + final long availMem = new ActivityManager.MemoryInfo().availMem; + } + + DeviceInfo deviceInfo = new DeviceInfo(); + + public BenchmarkMetric( + final BenchmarkModel benchmarkModel, + final String metric, + final double actualValue, + final double targetValue) { + this.benchmarkModel = benchmarkModel; + this.metric = metric; + this.actualValue = actualValue; + this.targetValue = targetValue; + } + + // TODO (huydhn): Figure out a way to extract the backend and quantization information from + // the .pte model itself instead of parsing its name + public static BenchmarkMetric.BenchmarkModel extractBackendAndQuantization(final String model) { + final Matcher m = + Pattern.compile("(?\\w+)_(?\\w+)_(?\\w+)").matcher(model); + if (m.matches()) { + return new BenchmarkMetric.BenchmarkModel( + m.group("name"), m.group("backend"), m.group("quantization")); + } else { + return new BenchmarkMetric.BenchmarkModel(model, "", ""); + } + } +} + +class StatsDump { + int loadStatus; + long loadStart; + long loadEnd; + long generateStart; + long generateEnd; + String tokens; + String modelName; + + @NonNull + @Override + public String toString() { + return "loadStart: " + + loadStart + + "\nloadEnd: " + + loadEnd + + "\ngenerateStart: " + + generateStart + + "\ngenerateEnd: " + + generateEnd + + "\n" + + tokens; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java new file mode 100644 index 0000000000..7777b275e6 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java @@ -0,0 +1,92 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +import android.app.AlertDialog; +import android.content.DialogInterface; +import android.os.Build; +import android.os.Bundle; +import android.widget.ImageButton; +import android.widget.ListView; +import androidx.appcompat.app.AppCompatActivity; +import androidx.core.content.ContextCompat; +import androidx.core.graphics.Insets; +import androidx.core.view.ViewCompat; +import androidx.core.view.WindowInsetsCompat; + +public class LogsActivity extends AppCompatActivity { + + private LogsAdapter mLogsAdapter; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_logs); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } + ViewCompat.setOnApplyWindowInsetsListener( + requireViewById(R.id.main), + (v, insets) -> { + Insets systemBars = insets.getInsets(WindowInsetsCompat.Type.systemBars()); + v.setPadding(systemBars.left, systemBars.top, systemBars.right, systemBars.bottom); + return insets; + }); + + setupLogs(); + setupClearLogsButton(); + } + + @Override + public void onResume() { + super.onResume(); + mLogsAdapter.clear(); + mLogsAdapter.addAll(ETLogging.getInstance().getLogs()); + mLogsAdapter.notifyDataSetChanged(); + } + + private void setupLogs() { + ListView mLogsListView = requireViewById(R.id.logsListView); + mLogsAdapter = new LogsAdapter(this, R.layout.logs_message); + + mLogsListView.setAdapter(mLogsAdapter); + mLogsAdapter.addAll(ETLogging.getInstance().getLogs()); + mLogsAdapter.notifyDataSetChanged(); + } + + private void setupClearLogsButton() { + ImageButton clearLogsButton = requireViewById(R.id.clearLogsButton); + clearLogsButton.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Delete Logs History") + .setMessage("Do you really want to delete logs history?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + // Clear the messageAdapter and sharedPreference + ETLogging.getInstance().clearLogs(); + mLogsAdapter.clear(); + mLogsAdapter.notifyDataSetChanged(); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + @Override + protected void onDestroy() { + super.onDestroy(); + ETLogging.getInstance().saveLogs(); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java new file mode 100644 index 0000000000..76c6a1aa1b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +import android.view.LayoutInflater; +import android.view.View; +import android.view.ViewGroup; +import android.widget.ArrayAdapter; +import android.widget.TextView; +import androidx.annotation.NonNull; +import java.util.Objects; + +public class LogsAdapter extends ArrayAdapter { + public LogsAdapter(android.content.Context context, int resource) { + super(context, resource); + } + + static class ViewHolder { + private TextView logTextView; + } + + @NonNull + @Override + public View getView(int position, View convertView, @NonNull ViewGroup parent) { + ViewHolder mViewHolder = null; + + String logMessage = Objects.requireNonNull(getItem(position)).getFormattedLog(); + + if (convertView == null || convertView.getTag() == null) { + mViewHolder = new ViewHolder(); + convertView = LayoutInflater.from(getContext()).inflate(R.layout.logs_message, parent, false); + mViewHolder.logTextView = convertView.requireViewById(R.id.logsTextView); + } else { + mViewHolder = (ViewHolder) convertView.getTag(); + } + mViewHolder.logTextView.setText(logMessage); + return convertView; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 2c94c242ed..f5e50845ec 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -8,37 +8,91 @@ package com.example.executorchllamademo; -import android.app.Activity; +import android.Manifest; import android.app.ActivityManager; import android.app.AlertDialog; -import android.content.Context; +import android.content.ContentResolver; +import android.content.ContentValues; +import android.content.Intent; +import android.content.pm.PackageManager; +import android.net.Uri; +import android.os.Build; import android.os.Bundle; +import android.os.Handler; +import android.os.Looper; +import android.os.Process; +import android.provider.MediaStore; import android.system.ErrnoException; import android.system.Os; -import android.widget.Button; +import android.util.Log; +import android.view.View; import android.widget.EditText; import android.widget.ImageButton; +import android.widget.ImageView; +import android.widget.LinearLayout; import android.widget.ListView; -import java.io.File; +import android.widget.TextView; +import android.widget.Toast; +import androidx.activity.result.ActivityResultLauncher; +import androidx.activity.result.PickVisualMediaRequest; +import androidx.activity.result.contract.ActivityResultContracts; +import androidx.annotation.NonNull; +import androidx.appcompat.app.AppCompatActivity; +import androidx.constraintlayout.widget.ConstraintLayout; +import androidx.core.app.ActivityCompat; +import androidx.core.content.ContextCompat; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; +import java.lang.reflect.Type; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; import org.pytorch.executorch.LlamaCallback; import org.pytorch.executorch.LlamaModule; -public class MainActivity extends Activity implements Runnable, LlamaCallback { +public class MainActivity extends AppCompatActivity implements Runnable, LlamaCallback { private EditText mEditTextMessage; - private Button 
mSendButton; - private ImageButton mModelButton; + private ImageButton mSendButton; + private ImageButton mGalleryButton; + private ImageButton mCameraButton; private ListView mMessagesView; private MessageAdapter mMessageAdapter; private LlamaModule mModule = null; private Message mResultMessage = null; - - private String mModelFilePath = ""; - private String mTokenizerFilePath = ""; + private ImageButton mSettingsButton; + private TextView mMemoryView; + private ActivityResultLauncher mPickGallery; + private ActivityResultLauncher mCameraRoll; + private List mSelectedImageUri; + private ConstraintLayout mMediaPreviewConstraintLayout; + private LinearLayout mAddMediaLayout; + private static final int MAX_NUM_OF_IMAGES = 5; + private static final int REQUEST_IMAGE_CAPTURE = 1; + private Uri cameraImageUri; + private DemoSharedPreferences mDemoSharedPreferences; + private SettingsFields mCurrentSettingsFields; + private Handler mMemoryUpdateHandler; + private Runnable memoryUpdater; + private int promptID = 0; + private long startPos = 0; + private static final int CONVERSATION_HISTORY_MESSAGE_LOOKBACK = 2; + private Executor executor; @Override public void onResult(String result) { - mResultMessage.appendText(result); - run(); + if (result.equals(PromptFormat.getStopToken(mCurrentSettingsFields.getModelType()))) { + return; + } + if (result.equals("\n\n") || result.equals("\n")) { + if (!mResultMessage.getText().isEmpty()) { + mResultMessage.appendText(result); + run(); + } + } else { + mResultMessage.appendText(result); + run(); + } } @Override @@ -52,23 +106,13 @@ public void onStats(float tps) { }); } - private static String[] listLocalFile(String path, String suffix) { - File directory = new File(path); - if (directory.exists() && directory.isDirectory()) { - File[] files = directory.listFiles((dir, name) -> name.toLowerCase().endsWith(suffix)); - String[] result = new String[files.length]; - for (int i = 0; i < files.length; i++) { - if (files[i].isFile() && files[i].getName().endsWith(suffix)) { - result[i] = files[i].getAbsolutePath(); - } - } - return result; + private void setLocalModel(String modelPath, String tokenizerPath, float temperature) { + if (mModule != null) { + mModule.resetNative(); + mModule = null; } - return new String[0]; - } - - private void setLocalModel(String modelPath, String tokenizerPath) { - Message modelLoadingMessage = new Message("Loading model...", false); + Message modelLoadingMessage = new Message("Loading model...", false, MessageType.SYSTEM, 0); + ETLogging.getInstance().log("Loading model " + modelPath + " with tokenizer " + tokenizerPath); runOnUiThread( () -> { mSendButton.setEnabled(false); @@ -76,9 +120,20 @@ private void setLocalModel(String modelPath, String tokenizerPath) { mMessageAdapter.notifyDataSetChanged(); }); long runStartTime = System.currentTimeMillis(); - mModule = new LlamaModule(modelPath, tokenizerPath, 0.8f); + mModule = + new LlamaModule( + ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), + modelPath, + tokenizerPath, + temperature); int loadResult = mModule.load(); + long loadDuration = System.currentTimeMillis() - runStartTime; + String modelLoadError = ""; + String modelInfo = ""; if (loadResult != 0) { + // TODO: Map the error code to a reason to let the user know why model loading failed + modelInfo = "*Model could not load (Error Code: " + loadResult + ")*" + "\n"; + loadDuration = 0; AlertDialog.Builder builder = new AlertDialog.Builder(this); builder.setTitle("Load failed: " + loadResult); 
runOnUiThread( @@ -86,18 +141,43 @@ private void setLocalModel(String modelPath, String tokenizerPath) { AlertDialog alert = builder.create(); alert.show(); }); + } else { + String[] segments = modelPath.split("/"); + String pteName = segments[segments.length - 1]; + segments = tokenizerPath.split("/"); + String tokenizerName = segments[segments.length - 1]; + modelInfo = + "Successfully loaded model. " + + pteName + + " and tokenizer " + + tokenizerName + + " in " + + (float) loadDuration / 1000 + + " sec." + + " You can send text or image for inference"; + + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + ETLogging.getInstance().log("Llava start prefill prompt"); + startPos = mModule.prefillPrompt(PromptFormat.getLlavaPresetPrompt(), 0, 1, 0); + ETLogging.getInstance().log("Llava completes prefill prompt"); + } } - long loadDuration = System.currentTimeMillis() - runStartTime; - String modelInfo = - "Model path: " + Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); + + String modelLoggingInfo = + modelLoadError + + "Model path: " + modelPath + "\nTokenizer path: " + tokenizerPath + + "\nTemperature: " + + temperature + "\nModel loaded time: " + loadDuration + " ms"; - Message modelLoadedMessage = new Message(modelInfo, false); + ETLogging.getInstance().log("Load complete. " + modelLoggingInfo); + runOnUiThread( () -> { mSendButton.setEnabled(true); @@ -107,55 +187,31 @@ private void setLocalModel(String modelPath, String tokenizerPath) { }); } - private String memoryInfo() { - final ActivityManager am = (ActivityManager) getSystemService(Context.ACTIVITY_SERVICE); - ActivityManager.MemoryInfo memInfo = new ActivityManager.MemoryInfo(); - am.getMemoryInfo(memInfo); - return "Total RAM: " - + Math.floorDiv(memInfo.totalMem, 1000000) - + " MB. 
Available RAM: " - + Math.floorDiv(memInfo.availMem, 1000000) - + " MB."; - } - - private void modelDialog() { - String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); - String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); - String[] modelFiles = listLocalFile("/data/local/tmp/llama/", ".model"); - String[] tokenizerFiles = new String[binFiles.length + modelFiles.length]; - System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length); - System.arraycopy(modelFiles, 0, tokenizerFiles, binFiles.length, modelFiles.length); - AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); - modelPathBuilder.setTitle("Select model path"); - AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); - tokenizerPathBuilder.setTitle("Select tokenizer path"); - modelPathBuilder.setSingleChoiceItems( - pteFiles, - -1, - (dialog, item) -> { - mModelFilePath = pteFiles[item]; - mEditTextMessage.setText(""); - dialog.dismiss(); - tokenizerPathBuilder.create().show(); - }); + private void loadLocalModelAndParameters( + String modelFilePath, String tokenizerFilePath, float temperature) { + Runnable runnable = + new Runnable() { + @Override + public void run() { + setLocalModel(modelFilePath, tokenizerFilePath, temperature); + } + }; + new Thread(runnable).start(); + } - tokenizerPathBuilder.setSingleChoiceItems( - tokenizerFiles, - -1, - (dialog, item) -> { - mTokenizerFilePath = tokenizerFiles[item]; - Runnable runnable = - new Runnable() { - @Override - public void run() { - setLocalModel(mModelFilePath, mTokenizerFilePath); - } - }; - new Thread(runnable).start(); - dialog.dismiss(); - }); + private void populateExistingMessages(String existingMsgJSON) { + Gson gson = new Gson(); + Type type = new TypeToken>() {}.getType(); + ArrayList savedMessages = gson.fromJson(existingMsgJSON, type); + for (Message msg : savedMessages) { + mMessageAdapter.add(msg); + } + mMessageAdapter.notifyDataSetChanged(); + } + + private int setPromptID() { - modelPathBuilder.create().show(); + return mMessageAdapter.getMaxPromptID() + 1; } @Override @@ -163,33 +219,436 @@ protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_main); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } + try { Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); } catch (ErrnoException e) { finish(); } - mEditTextMessage = findViewById(R.id.editTextMessage); - mSendButton = findViewById(R.id.sendButton); + mEditTextMessage = requireViewById(R.id.editTextMessage); + mSendButton = requireViewById(R.id.sendButton); mSendButton.setEnabled(false); - mModelButton = findViewById(R.id.modelButton); - mMessagesView = findViewById(R.id.messages_view); - mMessageAdapter = new MessageAdapter(this, R.layout.sent_message); + mMessagesView = requireViewById(R.id.messages_view); + mMessageAdapter = new MessageAdapter(this, R.layout.sent_message, new ArrayList()); mMessagesView.setAdapter(mMessageAdapter); - mModelButton.setOnClickListener( + mDemoSharedPreferences = new DemoSharedPreferences(this.getApplicationContext()); + String existingMsgJSON = mDemoSharedPreferences.getSavedMessages(); + if (!existingMsgJSON.isEmpty()) { + populateExistingMessages(existingMsgJSON); + promptID = setPromptID(); + } + mSettingsButton = requireViewById(R.id.settings); + 
mSettingsButton.setOnClickListener( view -> { - mModule.stop(); - mMessageAdapter.clear(); - mMessageAdapter.notifyDataSetChanged(); - modelDialog(); + Intent myIntent = new Intent(MainActivity.this, SettingsActivity.class); + MainActivity.this.startActivity(myIntent); }); + mCurrentSettingsFields = new SettingsFields(); + mMemoryUpdateHandler = new Handler(Looper.getMainLooper()); onModelRunStopped(); - modelDialog(); + setupMediaButton(); + setupGalleryPicker(); + setupCameraRoll(); + startMemoryUpdate(); + setupShowLogsButton(); + executor = Executors.newSingleThreadExecutor(); + } + + @Override + protected void onPause() { + super.onPause(); + mDemoSharedPreferences.addMessages(mMessageAdapter); + } + + @Override + protected void onResume() { + super.onResume(); + // Check for if settings parameters have changed + Gson gson = new Gson(); + String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); + if (!settingsFieldsJSON.isEmpty()) { + SettingsFields updatedSettingsFields = + gson.fromJson(settingsFieldsJSON, SettingsFields.class); + if (updatedSettingsFields == null) { + // Added this check, because gson.fromJson can return null + askUserToSelectModel(); + return; + } + boolean isUpdated = !mCurrentSettingsFields.equals(updatedSettingsFields); + boolean isLoadModel = updatedSettingsFields.getIsLoadModel(); + if (isUpdated) { + if (isLoadModel) { + // If users change the model file, but not pressing loadModelButton, we won't load the new + // model + checkForUpdateAndReloadModel(updatedSettingsFields); + } else { + askUserToSelectModel(); + } + checkForClearChatHistory(updatedSettingsFields); + // Update current to point to the latest + mCurrentSettingsFields = new SettingsFields(updatedSettingsFields); + } + } else { + askUserToSelectModel(); + } + } + + private void checkForClearChatHistory(SettingsFields updatedSettingsFields) { + if (updatedSettingsFields.getIsClearChatHistory()) { + mMessageAdapter.clear(); + mMessageAdapter.notifyDataSetChanged(); + mDemoSharedPreferences.removeExistingMessages(); + // changing to false since chat history has been cleared. 
+ updatedSettingsFields.saveIsClearChatHistory(false); + mDemoSharedPreferences.addSettings(updatedSettingsFields); + } + } + + private void checkForUpdateAndReloadModel(SettingsFields updatedSettingsFields) { + // TODO need to add 'load model' in settings and queue loading based on that + String modelPath = updatedSettingsFields.getModelFilePath(); + String tokenizerPath = updatedSettingsFields.getTokenizerFilePath(); + double temperature = updatedSettingsFields.getTemperature(); + if (!modelPath.isEmpty() && !tokenizerPath.isEmpty()) { + if (updatedSettingsFields.getIsLoadModel() + || !modelPath.equals(mCurrentSettingsFields.getModelFilePath()) + || !tokenizerPath.equals(mCurrentSettingsFields.getTokenizerFilePath()) + || temperature != mCurrentSettingsFields.getTemperature()) { + loadLocalModelAndParameters( + updatedSettingsFields.getModelFilePath(), + updatedSettingsFields.getTokenizerFilePath(), + (float) updatedSettingsFields.getTemperature()); + updatedSettingsFields.saveLoadModelAction(false); + mDemoSharedPreferences.addSettings(updatedSettingsFields); + } + } else { + askUserToSelectModel(); + } + } + + private void askUserToSelectModel() { + String askLoadModel = + "To get started, select your desired model and tokenizer " + "from the top right corner"; + Message askLoadModelMessage = new Message(askLoadModel, false, MessageType.SYSTEM, 0); + ETLogging.getInstance().log(askLoadModel); + runOnUiThread( + () -> { + mMessageAdapter.add(askLoadModelMessage); + mMessageAdapter.notifyDataSetChanged(); + }); + } + + private void setupShowLogsButton() { + ImageButton showLogsButton = requireViewById(R.id.showLogsButton); + showLogsButton.setOnClickListener( + view -> { + Intent myIntent = new Intent(MainActivity.this, LogsActivity.class); + MainActivity.this.startActivity(myIntent); + }); + } + + private void setupMediaButton() { + mAddMediaLayout = requireViewById(R.id.addMediaLayout); + mAddMediaLayout.setVisibility(View.GONE); // We hide this initially + + ImageButton addMediaButton = requireViewById(R.id.addMediaButton); + addMediaButton.setOnClickListener( + view -> { + mAddMediaLayout.setVisibility(View.VISIBLE); + }); + + mGalleryButton = requireViewById(R.id.galleryButton); + mGalleryButton.setOnClickListener( + view -> { + // Launch the photo picker and let the user choose only images. + mPickGallery.launch( + new PickVisualMediaRequest.Builder() + .setMediaType(ActivityResultContracts.PickVisualMedia.ImageOnly.INSTANCE) + .build()); + }); + mCameraButton = requireViewById(R.id.cameraButton); + mCameraButton.setOnClickListener( + view -> { + Log.d("CameraRoll", "Check permission"); + if (ContextCompat.checkSelfPermission(MainActivity.this, Manifest.permission.CAMERA) + != PackageManager.PERMISSION_GRANTED) { + ActivityCompat.requestPermissions( + MainActivity.this, + new String[] {Manifest.permission.CAMERA}, + REQUEST_IMAGE_CAPTURE); + } else { + launchCamera(); + } + }); + } + + private void setupCameraRoll() { + // Registers a camera roll activity launcher. 
+ mCameraRoll = + registerForActivityResult( + new ActivityResultContracts.TakePicture(), + result -> { + if (result && cameraImageUri != null) { + Log.d("CameraRoll", "Photo saved to uri: " + cameraImageUri); + mAddMediaLayout.setVisibility(View.GONE); + List uris = new ArrayList<>(); + uris.add(cameraImageUri); + showMediaPreview(uris); + } else { + // Delete the temp image file based on the url since the photo is not successfully + // taken + if (cameraImageUri != null) { + ContentResolver contentResolver = MainActivity.this.getContentResolver(); + contentResolver.delete(cameraImageUri, null, null); + Log.d("CameraRoll", "No photo taken. Delete temp uri"); + } + } + }); + mMediaPreviewConstraintLayout = requireViewById(R.id.mediaPreviewConstraintLayout); + ImageButton mediaPreviewCloseButton = requireViewById(R.id.mediaPreviewCloseButton); + mediaPreviewCloseButton.setOnClickListener( + view -> { + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + mSelectedImageUri = null; + }); + + ImageButton addMoreImageButton = requireViewById(R.id.addMoreImageButton); + addMoreImageButton.setOnClickListener( + view -> { + Log.d("addMore", "clicked"); + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + // Direct user to select type of input + mCameraButton.callOnClick(); + }); + } + + private String updateMemoryUsage() { + ActivityManager.MemoryInfo memoryInfo = new ActivityManager.MemoryInfo(); + ActivityManager activityManager = (ActivityManager) getSystemService(ACTIVITY_SERVICE); + if (activityManager == null) { + return "---"; + } + activityManager.getMemoryInfo(memoryInfo); + long totalMem = memoryInfo.totalMem / (1024 * 1024); + long availableMem = memoryInfo.availMem / (1024 * 1024); + long usedMem = totalMem - availableMem; + return usedMem + "MB"; + } + + private void startMemoryUpdate() { + mMemoryView = requireViewById(R.id.ram_usage_live); + memoryUpdater = + new Runnable() { + @Override + public void run() { + mMemoryView.setText(updateMemoryUsage()); + mMemoryUpdateHandler.postDelayed(this, 1000); + } + }; + mMemoryUpdateHandler.post(memoryUpdater); + } + + @Override + public void onRequestPermissionsResult( + int requestCode, @NonNull String[] permissions, @NonNull int[] grantResults) { + super.onRequestPermissionsResult(requestCode, permissions, grantResults); + if (requestCode == REQUEST_IMAGE_CAPTURE && grantResults.length != 0) { + if (grantResults[0] == PackageManager.PERMISSION_GRANTED) { + launchCamera(); + } else if (grantResults[0] == PackageManager.PERMISSION_DENIED) { + Log.d("CameraRoll", "Permission denied"); + } + } + } + + private void launchCamera() { + ContentValues values = new ContentValues(); + values.put(MediaStore.Images.Media.TITLE, "New Picture"); + values.put(MediaStore.Images.Media.DESCRIPTION, "From Camera"); + values.put(MediaStore.Images.Media.RELATIVE_PATH, "DCIM/Camera/"); + cameraImageUri = + MainActivity.this + .getContentResolver() + .insert(MediaStore.Images.Media.EXTERNAL_CONTENT_URI, values); + mCameraRoll.launch(cameraImageUri); + } + + private void setupGalleryPicker() { + // Registers a photo picker activity launcher in single-select mode. 
+ mPickGallery = + registerForActivityResult( + new ActivityResultContracts.PickMultipleVisualMedia(MAX_NUM_OF_IMAGES), + uris -> { + if (!uris.isEmpty()) { + Log.d("PhotoPicker", "Selected URIs: " + uris); + mAddMediaLayout.setVisibility(View.GONE); + for (Uri uri : uris) { + MainActivity.this + .getContentResolver() + .takePersistableUriPermission(uri, Intent.FLAG_GRANT_READ_URI_PERMISSION); + } + showMediaPreview(uris); + } else { + Log.d("PhotoPicker", "No media selected"); + } + }); + + mMediaPreviewConstraintLayout = requireViewById(R.id.mediaPreviewConstraintLayout); + ImageButton mediaPreviewCloseButton = requireViewById(R.id.mediaPreviewCloseButton); + mediaPreviewCloseButton.setOnClickListener( + view -> { + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + mSelectedImageUri = null; + }); + + ImageButton addMoreImageButton = requireViewById(R.id.addMoreImageButton); + addMoreImageButton.setOnClickListener( + view -> { + Log.d("addMore", "clicked"); + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + mGalleryButton.callOnClick(); + }); + } + + private List getProcessedImagesForModel(List uris) { + List imageList = new ArrayList<>(); + if (uris != null) { + uris.forEach( + (uri) -> { + imageList.add(new ETImage(this.getContentResolver(), uri)); + }); + } + return imageList; + } + + private void showMediaPreview(List uris) { + if (mSelectedImageUri == null) { + mSelectedImageUri = uris; + } else { + mSelectedImageUri.addAll(uris); + } + + if (mSelectedImageUri.size() > MAX_NUM_OF_IMAGES) { + mSelectedImageUri = mSelectedImageUri.subList(0, MAX_NUM_OF_IMAGES); + Toast.makeText( + this, "Only max " + MAX_NUM_OF_IMAGES + " images are allowed", Toast.LENGTH_SHORT) + .show(); + } + Log.d("mSelectedImageUri", mSelectedImageUri.size() + " " + mSelectedImageUri); + + mMediaPreviewConstraintLayout.setVisibility(View.VISIBLE); + + List imageViews = new ArrayList(); + + // Pre-populate all the image views that are available from the layout (currently max 5) + imageViews.add(requireViewById(R.id.mediaPreviewImageView1)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView2)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView3)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView4)); + imageViews.add(requireViewById(R.id.mediaPreviewImageView5)); + + // Hide all the image views (reset state) + for (int i = 0; i < imageViews.size(); i++) { + imageViews.get(i).setVisibility(View.GONE); + } + + // Only show/render those that have proper Image URIs + for (int i = 0; i < mSelectedImageUri.size(); i++) { + imageViews.get(i).setVisibility(View.VISIBLE); + imageViews.get(i).setImageURI(mSelectedImageUri.get(i)); + } + + // For LLava, we want to call prefill_image as soon as an image is selected + // Llava only support 1 image for now + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + List processedImageList = getProcessedImagesForModel(mSelectedImageUri); + if (!processedImageList.isEmpty()) { + mMessageAdapter.add( + new Message("Llava - Starting image Prefill.", false, MessageType.SYSTEM, 0)); + mMessageAdapter.notifyDataSetChanged(); + Runnable runnable = + () -> { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("Starting runnable prefill image"); + ETImage img = processedImageList.get(0); + ETLogging.getInstance().log("Llava start prefill image"); + startPos = + mModule.prefillImages( + img.getInts(), + img.getWidth(), + img.getHeight(), + ModelUtils.VISION_MODEL_IMAGE_CHANNELS, + 
startPos); + }; + executor.execute(runnable); + } + } + } + + private void addSelectedImagesToChatThread(List selectedImageUri) { + if (selectedImageUri == null) { + return; + } + mMediaPreviewConstraintLayout.setVisibility(View.GONE); + for (int i = 0; i < selectedImageUri.size(); i++) { + Uri imageURI = selectedImageUri.get(i); + Log.d("image uri ", "test " + imageURI.getPath()); + mMessageAdapter.add(new Message(imageURI.toString(), true, MessageType.IMAGE, 0)); + } + mMessageAdapter.notifyDataSetChanged(); + } + + private String getConversationHistory() { + String conversationHistory = ""; + + ArrayList conversations = + mMessageAdapter.getRecentSavedTextMessages(CONVERSATION_HISTORY_MESSAGE_LOOKBACK); + if (conversations.isEmpty()) { + return conversationHistory; + } + + int prevPromptID = conversations.get(0).getPromptID(); + String conversationFormat = + PromptFormat.getConversationFormat(mCurrentSettingsFields.getModelType()); + String format = conversationFormat; + for (int i = 0; i < conversations.size(); i++) { + Message conversation = conversations.get(i); + int currentPromptID = conversation.getPromptID(); + if (currentPromptID != prevPromptID) { + conversationHistory = conversationHistory + format; + format = conversationFormat; + prevPromptID = currentPromptID; + } + if (conversation.getIsSent()) { + format = format.replace(PromptFormat.USER_PLACEHOLDER, conversation.getText()); + } else { + format = format.replace(PromptFormat.ASSISTANT_PLACEHOLDER, conversation.getText()); + } + } + conversationHistory = conversationHistory + format; + + return conversationHistory; + } + + private String getTotalFormattedPrompt(String conversationHistory, String rawPrompt) { + if (conversationHistory.isEmpty()) { + return mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + } + + return mCurrentSettingsFields.getFormattedSystemPrompt() + + conversationHistory + + mCurrentSettingsFields.getFormattedUserPrompt(rawPrompt); } private void onModelRunStarted() { - mSendButton.setText("Stop"); + mSendButton.setClickable(false); + mSendButton.setImageResource(R.drawable.baseline_stop_24); mSendButton.setOnClickListener( view -> { mModule.stop(); @@ -197,20 +656,31 @@ private void onModelRunStarted() { } private void onModelRunStopped() { - setTitle(memoryInfo()); - mSendButton.setText("Generate"); + mSendButton.setClickable(true); + mSendButton.setImageResource(R.drawable.baseline_send_24); mSendButton.setOnClickListener( view -> { - String prompt = mEditTextMessage.getText().toString(); - mMessageAdapter.add(new Message(prompt, true)); + addSelectedImagesToChatThread(mSelectedImageUri); + String rawPrompt = mEditTextMessage.getText().toString(); + // We store raw prompt into message adapter, because we don't want to show the extra + // tokens from system prompt + mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, promptID)); mMessageAdapter.notifyDataSetChanged(); mEditTextMessage.setText(""); - mResultMessage = new Message("", false); + mResultMessage = new Message("", false, MessageType.TEXT, promptID); mMessageAdapter.add(mResultMessage); + // Scroll to bottom of the list + mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); + // After images are added to prompt and chat thread, we clear the imageURI list + // Note: This has to be done after imageURIs are no longer needed by LlamaModule + mSelectedImageUri = null; + promptID++; Runnable runnable = new Runnable() { @Override public void run() { + 
Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("starting runnable generate()"); runOnUiThread( new Runnable() { @Override @@ -218,9 +688,28 @@ public void run() { onModelRunStarted(); } }); + long generateStartTime = System.currentTimeMillis(); + if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) + == ModelUtils.VISION_MODEL) { + mModule.generateFromPos( + mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt), + ModelUtils.VISION_MODEL_SEQ_LEN, + startPos, + MainActivity.this, + false); + } else { + String finalPrompt = + getTotalFormattedPrompt(getConversationHistory(), rawPrompt); + ETLogging.getInstance().log("Running inference.. prompt=" + finalPrompt); + mModule.generate( + finalPrompt, + (int) (finalPrompt.length() * 0.75) + 64, + MainActivity.this, + false); + } - mModule.generate(prompt, MainActivity.this); - + long generateDuration = System.currentTimeMillis() - generateStartTime; + mResultMessage.setTotalGenerationTime(generateDuration); runOnUiThread( new Runnable() { @Override @@ -228,9 +717,10 @@ public void run() { onModelRunStopped(); } }); + ETLogging.getInstance().log("Inference completed"); } }; - new Thread(runnable).start(); + executor.execute(runnable); }); mMessageAdapter.notifyDataSetChanged(); } @@ -242,8 +732,27 @@ public void run() { @Override public void run() { mMessageAdapter.notifyDataSetChanged(); - setTitle(memoryInfo()); } }); } + + @Override + public void onBackPressed() { + super.onBackPressed(); + if (mAddMediaLayout != null && mAddMediaLayout.getVisibility() == View.VISIBLE) { + mAddMediaLayout.setVisibility(View.GONE); + } else { + // Default behavior of back button + finish(); + } + } + + @Override + protected void onDestroy() { + super.onDestroy(); + mMemoryUpdateHandler.removeCallbacks(memoryUpdater); + // This is to cover the case where the app is shutdown when user is on MainActivity but + // never clicked on the logsActivity + ETLogging.getInstance().saveLogs(); + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java index 81b77b1aba..b2e5380e2a 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java @@ -8,14 +8,50 @@ package com.example.executorchllamademo; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; + public class Message { private String text; - private boolean isSent; + private final boolean isSent; private float tokensPerSecond; + private long totalGenerationTime; + private final long timestamp; + private final MessageType messageType; + private String imagePath; + private final int promptID; + + private static final String TIMESTAMP_FORMAT = "hh:mm a"; // example: 2:23 PM - public Message(String text, boolean isSent) { - this.text = text; + public Message(String text, boolean isSent, MessageType messageType, int promptID) { this.isSent = isSent; + this.messageType = messageType; + this.promptID = promptID; + + if (messageType == MessageType.IMAGE) { + this.imagePath = text; + } else { + this.text = text; + } + + if (messageType != MessageType.SYSTEM) { + this.timestamp = System.currentTimeMillis(); + } else { + this.timestamp = (long) 0; + } + } + + public int getPromptID() { + return promptID; + 
} + + public MessageType getMessageType() { + return messageType; + } + + public String getImagePath() { + return imagePath; } public String getText() { @@ -34,7 +70,25 @@ public void setTokensPerSecond(float tokensPerSecond) { this.tokensPerSecond = tokensPerSecond; } + public void setTotalGenerationTime(long totalGenerationTime) { + this.totalGenerationTime = totalGenerationTime; + } + public float getTokensPerSecond() { return tokensPerSecond; } + + public long getTotalGenerationTime() { + return totalGenerationTime; + } + + public long getTimestamp() { + return timestamp; + } + + public String getFormattedTimestamp() { + SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_FORMAT, Locale.getDefault()); + Date date = new Date(timestamp); + return formatter.format(date); + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java index 656da1967d..2538c852e4 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java @@ -8,33 +8,124 @@ package com.example.executorchllamademo; +import android.net.Uri; import android.view.LayoutInflater; import android.view.View; import android.view.ViewGroup; import android.widget.ArrayAdapter; +import android.widget.ImageView; import android.widget.TextView; +import java.util.ArrayList; +import java.util.Collections; public class MessageAdapter extends ArrayAdapter<Message> { - public MessageAdapter(android.content.Context context, int resource) { + + private final ArrayList<Message> savedMessages; + + public MessageAdapter( + android.content.Context context, int resource, ArrayList<Message> savedMessages) { super(context, resource); + this.savedMessages = savedMessages; } @Override public View getView(int position, View convertView, ViewGroup parent) { Message currentMessage = getItem(position); + int layoutIdForListItem; - int layoutIdForListItem = - currentMessage.getIsSent() ? R.layout.sent_message : R.layout.received_message; + if (currentMessage.getMessageType() == MessageType.SYSTEM) { + layoutIdForListItem = R.layout.system_message; + } else { + layoutIdForListItem = + currentMessage.getIsSent() ?
R.layout.sent_message : R.layout.received_message; + } View listItemView = LayoutInflater.from(getContext()).inflate(layoutIdForListItem, parent, false); - TextView messageTextView = listItemView.findViewById(R.id.message_text); - messageTextView.setText(currentMessage.getText()); + if (currentMessage.getMessageType() == MessageType.IMAGE) { + ImageView messageImageView = listItemView.requireViewById(R.id.message_image); + messageImageView.setImageURI(Uri.parse(currentMessage.getImagePath())); + TextView messageTextView = listItemView.requireViewById(R.id.message_text); + messageTextView.setVisibility(View.GONE); + } else { + TextView messageTextView = listItemView.requireViewById(R.id.message_text); + messageTextView.setText(currentMessage.getText()); + } + String metrics = ""; + TextView tokensView; if (currentMessage.getTokensPerSecond() > 0) { - TextView tokensView = listItemView.findViewById(R.id.tokens_per_second); - tokensView.setText("" + currentMessage.getTokensPerSecond() + " t/s"); + metrics = String.format("%.2f", currentMessage.getTokensPerSecond()) + "t/s "; + } + + if (currentMessage.getTotalGenerationTime() > 0) { + metrics = metrics + (float) currentMessage.getTotalGenerationTime() / 1000 + "s "; + } + + if (currentMessage.getTokensPerSecond() > 0 || currentMessage.getTotalGenerationTime() > 0) { + tokensView = listItemView.requireViewById(R.id.generation_metrics); + tokensView.setText(metrics); + TextView separatorView = listItemView.requireViewById(R.id.bar); + separatorView.setVisibility(View.VISIBLE); + } + + if (currentMessage.getTimestamp() > 0) { + TextView timestampView = listItemView.requireViewById(R.id.timestamp); + timestampView.setText(currentMessage.getFormattedTimestamp()); } return listItemView; } + + @Override + public void add(Message msg) { + super.add(msg); + savedMessages.add(msg); + } + + @Override + public void clear() { + super.clear(); + savedMessages.clear(); + } + + public ArrayList<Message> getSavedMessages() { + return savedMessages; + } + + /** + * Collects the text messages from the most recent prompt/response exchanges, limited to + * numOfLatestPromptMessages prompt groups, returned in chronological order. + */ + public ArrayList<Message> getRecentSavedTextMessages(int numOfLatestPromptMessages) { + ArrayList<Message> recentMessages = new ArrayList<>(); + if (savedMessages.isEmpty()) { + // Nothing saved yet; avoid indexing into an empty list below. + return recentMessages; + } + int lastIndex = savedMessages.size() - 1; + Message messageToAdd = savedMessages.get(lastIndex); + int oldPromptID = messageToAdd.getPromptID(); + + for (int i = 0; i < savedMessages.size(); i++) { + messageToAdd = savedMessages.get(lastIndex - i); + if (messageToAdd.getMessageType() != MessageType.SYSTEM) { + if (messageToAdd.getPromptID() != oldPromptID) { + numOfLatestPromptMessages--; + oldPromptID = messageToAdd.getPromptID(); + } + if (numOfLatestPromptMessages > 0) { + if (messageToAdd.getMessageType() == MessageType.TEXT) { + recentMessages.add(messageToAdd); + } + } else { + break; + } + } + } + + // Reverse so the result reads in chronological order: [input1, output1, input2, output2, ...] + Collections.reverse(recentMessages); + return recentMessages; + } + + public int getMaxPromptID() { + int maxPromptID = -1; + for (Message msg : savedMessages) { + maxPromptID = Math.max(msg.getPromptID(), maxPromptID); + } + return maxPromptID; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java new file mode 100644 index 0000000000..6042acb572 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +public enum MessageType { + TEXT, + IMAGE, + SYSTEM +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java new file mode 100644 index 0000000000..4dc32d1475 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.os.Handler; +import android.os.HandlerThread; +import android.os.Looper; +import android.os.Message; +import androidx.annotation.NonNull; +import org.pytorch.executorch.LlamaCallback; +import org.pytorch.executorch.LlamaModule; + +/** A helper class that encapsulates the model loading and generation logic for the demo app. */ +public class ModelRunner implements LlamaCallback { + LlamaModule mModule = null; + + String mModelFilePath = ""; + String mTokenizerFilePath = ""; + + ModelRunnerCallback mCallback = null; + + HandlerThread mHandlerThread = null; + Handler mHandler = null; + + /** + * Helper class to separate UI logic from model runner logic. generate() requests are handled + * automatically on a worker thread. + * + * @param modelFilePath path to the exported .pte model file + * @param tokenizerFilePath path to the tokenizer file + * @param temperature sampling temperature passed to LlamaModule + * @param callback receives model load, token, stats, and completion events + */ + ModelRunner( + String modelFilePath, + String tokenizerFilePath, + float temperature, + ModelRunnerCallback callback) { + mModelFilePath = modelFilePath; + mTokenizerFilePath = tokenizerFilePath; + mCallback = callback; + + mModule = new LlamaModule(mModelFilePath, mTokenizerFilePath, temperature); + mHandlerThread = new HandlerThread("ModelRunner"); + mHandlerThread.start(); + mHandler = new ModelRunnerHandler(mHandlerThread.getLooper(), this); + + mHandler.sendEmptyMessage(ModelRunnerHandler.MESSAGE_LOAD_MODEL); + } + + int generate(String prompt) { + Message msg = Message.obtain(mHandler, ModelRunnerHandler.MESSAGE_GENERATE, prompt); + msg.sendToTarget(); + return 0; + } + + void stop() { + mModule.stop(); + } + + @Override + public void onResult(String result) { + mCallback.onTokenGenerated(result); + } + + @Override + public void onStats(float tps) { + mCallback.onStats("tokens/second: " + tps); + } +} + +class ModelRunnerHandler extends Handler { + public static final int MESSAGE_LOAD_MODEL = 1; + public static final int MESSAGE_GENERATE = 2; + + private final ModelRunner mModelRunner; + + public ModelRunnerHandler(Looper looper, ModelRunner modelRunner) { + super(looper); + mModelRunner = modelRunner; + } + + @Override + public void handleMessage(@NonNull android.os.Message msg) { + if (msg.what == MESSAGE_LOAD_MODEL) { + int status = mModelRunner.mModule.load(); + mModelRunner.mCallback.onModelLoaded(status); + } else if (msg.what == MESSAGE_GENERATE) { + mModelRunner.mModule.generate((String) msg.obj, mModelRunner); + mModelRunner.mCallback.onGenerationStopped(); + } + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java
b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java new file mode 100644 index 0000000000..c8bdc53075 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +/** + * A helper interface within the app for MainActivity and Benchmarking to handle callback from + * ModelRunner. + */ +public interface ModelRunnerCallback { + + void onModelLoaded(int status); + + void onTokenGenerated(String token); + + void onStats(String token); + + void onGenerationStopped(); +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java new file mode 100644 index 0000000000..91e84be059 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +public enum ModelType { + LLAMA_3, + LLAMA_3_1, + LLAVA_1_5, +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java new file mode 100644 index 0000000000..ab1f1bc92f --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +public class ModelUtils { + static final int TEXT_MODEL = 1; + static final int VISION_MODEL = 2; + static final int VISION_MODEL_IMAGE_CHANNELS = 3; + static final int VISION_MODEL_SEQ_LEN = 768; + static final int TEXT_MODEL_SEQ_LEN = 256; + + public static int getModelCategory(ModelType modelType) { + switch (modelType) { + case LLAVA_1_5: + return VISION_MODEL; + case LLAMA_3: + case LLAMA_3_1: + default: + return TEXT_MODEL; + } + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java new file mode 100644 index 0000000000..36e738c3d0 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -0,0 +1,75 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +public class PromptFormat { + + public static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; + public static final String USER_PLACEHOLDER = "{{ user_prompt }}"; + public static final String ASSISTANT_PLACEHOLDER = "{{ assistant_response }}"; + public static final String DEFAULT_SYSTEM_PROMPT = "Answer the questions in a few sentences"; + + public static String getSystemPromptTemplate(ModelType modelType) { + switch (modelType) { + case LLAMA_3: + case LLAMA_3_1: + return "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n" + + SYSTEM_PLACEHOLDER + + "<|eot_id|>"; + case LLAVA_1_5: + return "USER: "; + default: + return SYSTEM_PLACEHOLDER; + } + } + + public static String getUserPromptTemplate(ModelType modelType) { + switch (modelType) { + case LLAMA_3: + case LLAMA_3_1: + return "<|start_header_id|>user<|end_header_id|>\n" + + USER_PLACEHOLDER + + "<|eot_id|>" + + "<|start_header_id|>assistant<|end_header_id|>"; + + case LLAVA_1_5: + default: + return USER_PLACEHOLDER; + } + } + + public static String getConversationFormat(ModelType modelType) { + switch (modelType) { + case LLAMA_3: + case LLAMA_3_1: + return getUserPromptTemplate(modelType) + "\n" + ASSISTANT_PLACEHOLDER + "<|eot_id|>"; + case LLAVA_1_5: + return USER_PLACEHOLDER + " ASSISTANT:"; + default: + return USER_PLACEHOLDER; + } + } + + public static String getStopToken(ModelType modelType) { + switch (modelType) { + case LLAMA_3: + case LLAMA_3_1: + return "<|eot_id|>"; + case LLAVA_1_5: + return ""; + default: + return ""; + } + } + + public static String getLlavaPresetPrompt() { + return "A chat between a curious human and an artificial intelligence assistant. The assistant" + + " gives helpful, detailed, and polite answers to the human's questions. USER: "; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java new file mode 100644 index 0000000000..773fef19dd --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -0,0 +1,393 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +import android.app.AlertDialog; +import android.content.DialogInterface; +import android.os.Build; +import android.os.Bundle; +import android.text.Editable; +import android.text.TextWatcher; +import android.widget.Button; +import android.widget.EditText; +import android.widget.ImageButton; +import android.widget.TextView; +import androidx.appcompat.app.AppCompatActivity; +import androidx.core.content.ContextCompat; +import androidx.core.graphics.Insets; +import androidx.core.view.ViewCompat; +import androidx.core.view.WindowInsetsCompat; +import com.google.gson.Gson; +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +public class SettingsActivity extends AppCompatActivity { + + private String mModelFilePath = ""; + private String mTokenizerFilePath = ""; + private TextView mModelTextView; + private TextView mTokenizerTextView; + private TextView mModelTypeTextView; + private EditText mSystemPromptEditText; + private EditText mUserPromptEditText; + private Button mLoadModelButton; + private double mSetTemperature; + private String mSystemPrompt; + private String mUserPrompt; + private ModelType mModelType; + public SettingsFields mSettingsFields; + + private DemoSharedPreferences mDemoSharedPreferences; + public static double TEMPERATURE_MIN_VALUE = 0.0; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_settings); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } + ViewCompat.setOnApplyWindowInsetsListener( + requireViewById(R.id.main), + (v, insets) -> { + Insets systemBars = insets.getInsets(WindowInsetsCompat.Type.systemBars()); + v.setPadding(systemBars.left, systemBars.top, systemBars.right, systemBars.bottom); + return insets; + }); + mDemoSharedPreferences = new DemoSharedPreferences(getBaseContext()); + mSettingsFields = new SettingsFields(); + setupSettings(); + } + + private void setupSettings() { + mModelTextView = requireViewById(R.id.modelTextView); + mTokenizerTextView = requireViewById(R.id.tokenizerTextView); + mModelTypeTextView = requireViewById(R.id.modelTypeTextView); + ImageButton modelImageButton = requireViewById(R.id.modelImageButton); + ImageButton tokenizerImageButton = requireViewById(R.id.tokenizerImageButton); + ImageButton modelTypeImageButton = requireViewById(R.id.modelTypeImageButton); + mSystemPromptEditText = requireViewById(R.id.systemPromptText); + mUserPromptEditText = requireViewById(R.id.userPromptText); + loadSettings(); + + // TODO: The two setOnClickListeners will be removed after file path issue is resolved + modelImageButton.setOnClickListener( + view -> { + setupModelSelectorDialog(); + }); + tokenizerImageButton.setOnClickListener( + view -> { + setupTokenizerSelectorDialog(); + }); + modelTypeImageButton.setOnClickListener( + view -> { + setupModelTypeSelectorDialog(); + }); + mModelFilePath = mSettingsFields.getModelFilePath(); + if (!mModelFilePath.isEmpty()) { + mModelTextView.setText(getFilenameFromPath(mModelFilePath)); + } + mTokenizerFilePath = mSettingsFields.getTokenizerFilePath(); + if (!mTokenizerFilePath.isEmpty()) { + mTokenizerTextView.setText(getFilenameFromPath(mTokenizerFilePath)); + } + mModelType = mSettingsFields.getModelType(); + ETLogging.getInstance().log("mModelType from settings " + mModelType); 
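+ // mModelType is restored from the saved settings (SettingsFields defaults it to LLAMA_3); the null check below guards against settings that were saved without a model type.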
+ if (mModelType != null) { + mModelTypeTextView.setText(mModelType.toString()); + } + + setupParameterSettings(); + setupPromptSettings(); + setupClearChatHistoryButton(); + setupLoadModelButton(); + } + + private void setupLoadModelButton() { + mLoadModelButton = requireViewById(R.id.loadModelButton); + mLoadModelButton.setEnabled(true); + mLoadModelButton.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Load Model") + .setMessage("Do you really want to load the new model?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + mSettingsFields.saveLoadModelAction(true); + mLoadModelButton.setEnabled(false); + onBackPressed(); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupClearChatHistoryButton() { + Button clearChatButton = requireViewById(R.id.clearChatButton); + clearChatButton.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Delete Chat History") + .setMessage("Do you really want to delete chat history?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + mSettingsFields.saveIsClearChatHistory(true); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupParameterSettings() { + setupTemperatureSettings(); + } + + private void setupTemperatureSettings() { + mSetTemperature = mSettingsFields.getTemperature(); + EditText temperatureEditText = requireViewById(R.id.temperatureEditText); + temperatureEditText.setText(String.valueOf(mSetTemperature)); + temperatureEditText.addTextChangedListener( + new TextWatcher() { + @Override + public void beforeTextChanged(CharSequence s, int start, int count, int after) {} + + @Override + public void onTextChanged(CharSequence s, int start, int before, int count) {} + + @Override + public void afterTextChanged(Editable s) { + mSetTemperature = Double.parseDouble(s.toString()); + // This is needed because temperature is changed together with model loading + // Once temperature is no longer in LlamaModule constructor, we can remove this + mSettingsFields.saveLoadModelAction(true); + saveSettings(); + } + }); + } + + private void setupPromptSettings() { + setupSystemPromptSettings(); + setupUserPromptSettings(); + } + + private void setupSystemPromptSettings() { + mSystemPrompt = mSettingsFields.getSystemPrompt(); + mSystemPromptEditText.setText(mSystemPrompt); + mSystemPromptEditText.addTextChangedListener( + new TextWatcher() { + @Override + public void beforeTextChanged(CharSequence s, int start, int count, int after) {} + + @Override + public void onTextChanged(CharSequence s, int start, int before, int count) {} + + @Override + public void afterTextChanged(Editable s) { + mSystemPrompt = s.toString(); + } + }); + + ImageButton resetSystemPrompt = requireViewById(R.id.resetSystemPrompt); + resetSystemPrompt.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Reset System Prompt") + .setMessage("Do you really want to reset system prompt?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + // Clear the messageAdapter and 
sharedPreference + mSystemPromptEditText.setText(PromptFormat.DEFAULT_SYSTEM_PROMPT); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private void setupUserPromptSettings() { + mUserPrompt = mSettingsFields.getUserPrompt(); + mUserPromptEditText.setText(mUserPrompt); + mUserPromptEditText.addTextChangedListener( + new TextWatcher() { + @Override + public void beforeTextChanged(CharSequence s, int start, int count, int after) {} + + @Override + public void onTextChanged(CharSequence s, int start, int before, int count) {} + + @Override + public void afterTextChanged(Editable s) { + if (isValidUserPrompt(s.toString())) { + mUserPrompt = s.toString(); + } else { + showInvalidPromptDialog(); + } + } + }); + + ImageButton resetUserPrompt = requireViewById(R.id.resetUserPrompt); + resetUserPrompt.setOnClickListener( + view -> { + new AlertDialog.Builder(this) + .setTitle("Reset Prompt Template") + .setMessage("Do you really want to reset the prompt template?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + new DialogInterface.OnClickListener() { + public void onClick(DialogInterface dialog, int whichButton) { + // Clear the messageAdapter and sharedPreference + mUserPromptEditText.setText(PromptFormat.getUserPromptTemplate(mModelType)); + } + }) + .setNegativeButton(android.R.string.no, null) + .show(); + }); + } + + private boolean isValidUserPrompt(String userPrompt) { + return userPrompt.contains(PromptFormat.USER_PLACEHOLDER); + } + + private void showInvalidPromptDialog() { + new AlertDialog.Builder(this) + .setTitle("Invalid Prompt Format") + .setMessage( + "Prompt format must contain " + + PromptFormat.USER_PLACEHOLDER + + ". Do you want to reset prompt format?") + .setIcon(android.R.drawable.ic_dialog_alert) + .setPositiveButton( + android.R.string.yes, + (dialog, whichButton) -> { + mUserPromptEditText.setText(PromptFormat.getUserPromptTemplate(mModelType)); + }) + .setNegativeButton(android.R.string.no, null) + .show(); + } + + private void setupModelSelectorDialog() { + String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); + AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); + modelPathBuilder.setTitle("Select model path"); + + modelPathBuilder.setSingleChoiceItems( + pteFiles, + -1, + (dialog, item) -> { + mModelFilePath = pteFiles[item]; + mModelTextView.setText(getFilenameFromPath(mModelFilePath)); + mLoadModelButton.setEnabled(true); + dialog.dismiss(); + }); + + modelPathBuilder.create().show(); + } + + private static String[] listLocalFile(String path, String suffix) { + File directory = new File(path); + if (directory.exists() && directory.isDirectory()) { + File[] files = directory.listFiles((dir, name) -> name.toLowerCase().endsWith(suffix)); + String[] result = new String[files.length]; + for (int i = 0; i < files.length; i++) { + if (files[i].isFile() && files[i].getName().endsWith(suffix)) { + result[i] = files[i].getAbsolutePath(); + } + } + return result; + } + return null; + } + + private void setupModelTypeSelectorDialog() { + // Convert enum to list + List modelTypesList = new ArrayList<>(); + for (ModelType modelType : ModelType.values()) { + modelTypesList.add(modelType.toString()); + } + // Alert dialog builder takes in arr of string instead of list + String[] modelTypes = modelTypesList.toArray(new String[0]); + AlertDialog.Builder modelTypeBuilder = new AlertDialog.Builder(this); + modelTypeBuilder.setTitle("Select model type"); + 
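// Selecting a model type below also resets the prompt template field to the chosen model's default template (see PromptFormat.getUserPromptTemplate), so a stale format string is not carried over. +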
modelTypeBuilder.setSingleChoiceItems( + modelTypes, + -1, + (dialog, item) -> { + mModelTypeTextView.setText(modelTypes[item]); + mModelType = ModelType.valueOf(modelTypes[item]); + mUserPromptEditText.setText(PromptFormat.getUserPromptTemplate(mModelType)); + dialog.dismiss(); + }); + + modelTypeBuilder.create().show(); + } + + private void setupTokenizerSelectorDialog() { + String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); + String[] tokenizerFiles = new String[binFiles.length]; + System.arraycopy(binFiles, 0, tokenizerFiles, 0, binFiles.length); + AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); + tokenizerPathBuilder.setTitle("Select tokenizer path"); + tokenizerPathBuilder.setSingleChoiceItems( + tokenizerFiles, + -1, + (dialog, item) -> { + mTokenizerFilePath = tokenizerFiles[item]; + mTokenizerTextView.setText(getFilenameFromPath(mTokenizerFilePath)); + mLoadModelButton.setEnabled(true); + dialog.dismiss(); + }); + + tokenizerPathBuilder.create().show(); + } + + private String getFilenameFromPath(String uriFilePath) { + String[] segments = uriFilePath.split("/"); + if (segments.length > 0) { + return segments[segments.length - 1]; // get last element (aka filename) + } + return ""; + } + + private void loadSettings() { + Gson gson = new Gson(); + String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); + if (!settingsFieldsJSON.isEmpty()) { + mSettingsFields = gson.fromJson(settingsFieldsJSON, SettingsFields.class); + } + } + + private void saveSettings() { + mSettingsFields.saveModelPath(mModelFilePath); + mSettingsFields.saveTokenizerPath(mTokenizerFilePath); + mSettingsFields.saveParameters(mSetTemperature); + mSettingsFields.savePrompts(mSystemPrompt, mUserPrompt); + mSettingsFields.saveModelType(mModelType); + mDemoSharedPreferences.addSettings(mSettingsFields); + } + + @Override + public void onBackPressed() { + super.onBackPressed(); + saveSettings(); + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java new file mode 100644 index 0000000000..b71799981b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java @@ -0,0 +1,131 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +public class SettingsFields { + + public String getModelFilePath() { + return modelFilePath; + } + + public String getTokenizerFilePath() { + return tokenizerFilePath; + } + + public double getTemperature() { + return temperature; + } + + public String getSystemPrompt() { + return systemPrompt; + } + + public ModelType getModelType() { + return modelType; + } + + public String getUserPrompt() { + return userPrompt; + } + + public String getFormattedSystemAndUserPrompt(String prompt) { + return getFormattedSystemPrompt() + getFormattedUserPrompt(prompt); + } + + public String getFormattedSystemPrompt() { + return PromptFormat.getSystemPromptTemplate(modelType) + .replace(PromptFormat.SYSTEM_PLACEHOLDER, systemPrompt); + } + + public String getFormattedUserPrompt(String prompt) { + return userPrompt.replace(PromptFormat.USER_PLACEHOLDER, prompt); + } + + public boolean getIsClearChatHistory() { + return isClearChatHistory; + } + + public boolean getIsLoadModel() { + return isLoadModel; + } + + private String modelFilePath; + private String tokenizerFilePath; + private double temperature; + private String systemPrompt; + private String userPrompt; + private boolean isClearChatHistory; + private boolean isLoadModel; + private ModelType modelType; + + public SettingsFields() { + ModelType DEFAULT_MODEL = ModelType.LLAMA_3; + + modelFilePath = ""; + tokenizerFilePath = ""; + temperature = SettingsActivity.TEMPERATURE_MIN_VALUE; + systemPrompt = ""; + userPrompt = PromptFormat.getUserPromptTemplate(DEFAULT_MODEL); + isClearChatHistory = false; + isLoadModel = false; + modelType = DEFAULT_MODEL; + } + + public SettingsFields(SettingsFields settingsFields) { + this.modelFilePath = settingsFields.modelFilePath; + this.tokenizerFilePath = settingsFields.tokenizerFilePath; + this.temperature = settingsFields.temperature; + this.systemPrompt = settingsFields.getSystemPrompt(); + this.userPrompt = settingsFields.getUserPrompt(); + this.isClearChatHistory = settingsFields.getIsClearChatHistory(); + this.isLoadModel = settingsFields.getIsLoadModel(); + this.modelType = settingsFields.modelType; + } + + public void saveModelPath(String modelFilePath) { + this.modelFilePath = modelFilePath; + } + + public void saveTokenizerPath(String tokenizerFilePath) { + this.tokenizerFilePath = tokenizerFilePath; + } + + public void saveModelType(ModelType modelType) { + this.modelType = modelType; + } + + public void saveParameters(Double temperature) { + this.temperature = temperature; + } + + public void savePrompts(String systemPrompt, String userPrompt) { + this.systemPrompt = systemPrompt; + this.userPrompt = userPrompt; + } + + public void saveIsClearChatHistory(boolean needToClear) { + this.isClearChatHistory = needToClear; + } + + public void saveLoadModelAction(boolean shouldLoadModel) { + this.isLoadModel = shouldLoadModel; + } + + public boolean equals(SettingsFields anotherSettingsFields) { + if (this == anotherSettingsFields) return true; + return modelFilePath.equals(anotherSettingsFields.modelFilePath) + && tokenizerFilePath.equals(anotherSettingsFields.tokenizerFilePath) + && temperature == anotherSettingsFields.temperature + && systemPrompt.equals(anotherSettingsFields.systemPrompt) + && userPrompt.equals(anotherSettingsFields.userPrompt) + && isClearChatHistory == anotherSettingsFields.isClearChatHistory + && isLoadModel == anotherSettingsFields.isLoadModel + && modelType == anotherSettingsFields.modelType; + } +} diff --git 
a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml new file mode 100644 index 0000000000..0868ffffa6 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml @@ -0,0 +1,5 @@ + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml new file mode 100644 index 0000000000..2ae27b8409 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml new file mode 100644 index 0000000000..7077fedd48 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml new file mode 100644 index 0000000000..a6837b9c69 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml @@ -0,0 +1,6 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml new file mode 100644 index 0000000000..fb902d4331 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml @@ -0,0 +1,6 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml new file mode 100644 index 0000000000..4680bc6629 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml new file mode 100644 index 0000000000..860470ab10 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml @@ -0,0 +1,6 @@ + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml new file mode 100644 index 0000000000..2de1f64208 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml @@ -0,0 +1,6 @@ + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml new file mode 100644 index 0000000000..c51d84b9f4 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml @@ -0,0 +1,11 @@ + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml 
b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml new file mode 100644 index 0000000000..832e258595 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml @@ -0,0 +1,6 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml new file mode 100644 index 0000000000..ceb3ac56c9 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml @@ -0,0 +1,8 @@ + + + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml new file mode 100644 index 0000000000..eb8b9d1f1a --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml @@ -0,0 +1,21 @@ + + + + + + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml new file mode 100644 index 0000000000..87c82d2a38 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml new file mode 100644 index 0000000000..0a7a71f070 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml @@ -0,0 +1,9 @@ + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml new file mode 100644 index 0000000000..35c778a437 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png new file mode 100644 index 0000000000..60e3e5174e Binary files /dev/null and b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png differ diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml new file mode 100644 index 0000000000..bb45d63d85 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml @@ -0,0 +1,6 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml new file mode 100644 index 0000000000..c7b4b2e4a1 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml new file mode 100644 index 0000000000..a8bb4b2f64 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml 
@@ -0,0 +1,5 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml new file mode 100644 index 0000000000..5f81396e38 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml index ea2d1bbfa1..c2288b5bfc 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml @@ -1,6 +1,6 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml new file mode 100644 index 0000000000..6e48b5de8b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml @@ -0,0 +1,16 @@ + + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml new file mode 100644 index 0000000000..b327a544f2 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml @@ -0,0 +1,55 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml index 089acb572b..7b8b8d1760 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml @@ -1,44 +1,233 @@ - - + + + + + + + - + + + + + + -