From b58df04fe83d9d882c4c3bbb201cf5de4ac88fd0 Mon Sep 17 00:00:00 2001 From: Saiyedul Islam Date: Mon, 25 Nov 2024 07:26:23 -0600 Subject: [PATCH] [rocmlibs] Build using rocm-rel-6.3 branch of components Along with updating the branches of all components to rocm-rel-6.3, this patch also adds an script to build hipblas-common and enables tensile build. --- bin/rocmlibs/build_hipblas-common.sh | 128 ++++++++ bin/rocmlibs/build_rocblas.sh | 52 ++- bin/rocmlibs/build_rocmlibs.sh | 2 +- bin/rocmlibs/clone_rocmlibs.sh | 2 + .../patches/patch-control-file_20.0.txt | 3 +- bin/rocmlibs/patches/rocblas.patch | 4 +- bin/rocmlibs/patches/rocprim.patch | 58 ++-- bin/rocmlibs/patches/rocsolver.patch | 4 +- bin/rocmlibs/patches/rocsparse.patch | 4 +- bin/rocmlibs/patches/tensile_aca95d17.patch | 303 ++++++++++++++++++ bin/rocmlibs/rocmlibsi.xml | 14 +- 11 files changed, 511 insertions(+), 63 deletions(-) create mode 100755 bin/rocmlibs/build_hipblas-common.sh create mode 100644 bin/rocmlibs/patches/tensile_aca95d17.patch diff --git a/bin/rocmlibs/build_hipblas-common.sh b/bin/rocmlibs/build_hipblas-common.sh new file mode 100755 index 000000000..5b22a6acb --- /dev/null +++ b/bin/rocmlibs/build_hipblas-common.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# +# build_hipprim-common.sh: Script to build and install hipprim-common library +# This build is classic cmake, make, make install +# +BUILD_TYPE=${BUILD_TYPE:-Release} + +# --- Start standard header to set AOMP environment variables ---- +realpath=`realpath $0` +thisdir=`dirname $realpath` +. $thisdir/../aomp_common_vars +# --- end standard header ---- + +# Patch rocr +_repo_dir=$AOMP_REPOS/rocmlibs/hipBLAS-common +patchrepo $_repo_dir + +if [ "$AOMP_USE_NINJA" == 0 ] ; then + AOMP_SET_NINJA_GEN="" +else + AOMP_SET_NINJA_GEN="-G Ninja" +fi + +GFXSEMICOLONS=`echo $GFXLIST | tr ' ' ';' ` +GFXSEMICOLONS=""$GFXSEMICOLONS"" +#export CC=$AOMP/bin/clang +export CXX=$AOMP_INSTALL_DIR/bin/hipcc +export ROCM_DIR=$AOMP +export ROCM_PATH=$AOMP +export HIP_DIR=$AOMP +export PATH=$AOMP_SUPP/cmake/bin:$AOMP/bin:$PATH +export USE_PERL_SCRIPTS=1 +export NUM_PROC=$AOMP_JOB_THREADS +export AMDGPU_TARGETS="$GFXSEMICOLONS" +export CXXFLAGS="-I$LLVM_INSTALL_LOC/include -D__HIP_PLATFORM_AMD__=1" +export LDFLAGS="-fPIC" +export CMAKE_PREFIX_PATH="$LLVM_INSTALL_LOC/lib/cmake" +MYCMAKEOPTS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE \ +-DCMAKE_CXX_COMPILER=$LLVM_INSTALL_LOC/bin/clang++ \ +-DHIP_COMPILER=$LLVM_INSTALL_LOC/bin/clang \ +-DHIP_CXX_COMPILER=$AOMP_INSTALL_DIR/bin/hipcc \ +-DCMAKE_PREFIX_PATH=$LLVM_INSTALL_LOC/lib/cmake \ +-DCMAKE_INSTALL_PREFIX=$AOMP_INSTALL_DIR \ +-DAMDGPU_TARGETS="\'$GFXSEMICOLONS\'" +-DROCM_DIR=$ROCM_DIR \ +-DROCM_PATH=$ROCM_PATH \ +-DHIP_DIR=$HIP_DIR \ +-DHIP_PLATFORM=amd \ +" + +if [ $AOMP_STANDALONE_BUILD == 1 ] ; then + if [ ! -L $AOMP ] ; then + if [ -d $AOMP ] ; then + echo "ERROR: Directory $AOMP is a physical directory." + echo " It must be a symbolic link or not exist" + exit 1 + fi + fi +else + echo "ERROR: $0 only valid for AOMP_STANDALONE_BUILD=1" + exit 1 +fi + +# Make sure we can update the install directory +if [ "$1" == "install" ] ; then + $SUDO mkdir -p $AOMP_INSTALL_DIR + $SUDO touch $AOMP_INSTALL_DIR/testfile + if [ $? != 0 ] ; then + echo "ERROR: No update access to $AOMP_INSTALL_DIR" + exit 1 + fi + $SUDO rm $AOMP_INSTALL_DIR/testfile +fi + +if [ "$1" != "nocmake" ] && [ "$1" != "install" ] ; then + echo + echo "This is a FRESH START. ERASING any previous builds in $BUILD_DIR/build/rocmlibs/hipBLAS-common" + echo "Use ""$0 nocmake"" or ""$0 install"" to avoid FRESH START." + rm -rf $BUILD_DIR/build/rocmlibs/hipBLAS-common + mkdir -p $BUILD_DIR/build/rocmlibs/hipBLAS-common +else + if [ ! -d $BUILD_DIR/build/rocmlibs/hipBLAS-common ] ; then + echo "ERROR: The build directory $BUILD_DIR/build/rocmlibs/hipBLAS-common " + echo " run $0 without nocmake or install options. " + exit 1 + fi +fi + +cd $BUILD_DIR/build/rocmlibs/hipBLAS-common + +if [ "$1" != "nocmake" ] && [ "$1" != "install" ] ; then + echo + echo " -----Running ${AOMP_CMAKE} ---- " + echo ${AOMP_CMAKE} $MYCMAKEOPTS $_repo_dir + ${AOMP_CMAKE} $MYCMAKEOPTS $_repo_dir 2>&1 + if [ $? != 0 ] ; then + echo "ERROR cmake failed. Cmake flags" + echo " $MYCMAKEOPTS" + exit 1 + fi +fi + +# echo +# echo " -----Running ${AOMP_CMAKE} --build ---- " +# echo ${AOMP_CMAKE} --build . -j $AOMP_JOB_THREADS +# ${AOMP_CMAKE} . -j $AOMP_JOB_THREADS +# if [ $? != 0 ] ; then +# echo "ERROR make -j $AOMP_JOB_THREADS failed" +# exit 1 +# fi + +if [ "$1" == "install" ] ; then + echo " -----Installing to $AOMP_INSTALL_DIR ---- " + $SUDO make package install + if [ $? != 0 ] ; then + echo "ERROR make install failed " + exit 1 + fi + echo + echo "SUCCESSFUL INSTALL to $AOMP_INSTALL_DIR" + echo + removepatch $_repo_dir +else + echo + echo "SUCCESSFUL BUILD, please run: $0 install" + echo " to install into $AOMP_INSTALL_DIR" + echo +fi diff --git a/bin/rocmlibs/build_rocblas.sh b/bin/rocmlibs/build_rocblas.sh index e2a45a6dd..7e8d344d7 100755 --- a/bin/rocmlibs/build_rocblas.sh +++ b/bin/rocmlibs/build_rocblas.sh @@ -14,22 +14,33 @@ thisdir=`dirname $realpath` _repo_dir=$AOMP_REPOS/rocmlibs/rocBLAS _build_dir=$_repo_dir/build -AOMP_BUILD_TENSILE=${AOMP_BUILD_TENSILE:-0} - +# Check if Tensile is to be built with rocBLAS +AOMP_BUILD_TENSILE=${AOMP_BUILD_TENSILE:-1} if [ $AOMP_BUILD_TENSILE == 0 ] ; then echo echo "WARNING: Building rocblas without Tensile" - _local_tensile_opt="" + _local_tensile_opt="--no_tensile" else - _tensile_repo_dir=$AOMP_REPOS/rocmlibs/Tensile _cwd=$PWD + _tensile_repo_dir=$AOMP_REPOS/rocmlibs/Tensile cd $_tensile_repo_dir - git checkout release/rocm-rel-6.2 - git pull - # FIXME: We should get the Tensile hash from rocBLAS/tensile_tag.txt - git checkout 09ec3476785198159195e2b8d635db13733682d4 + # Read the commit SHA from the file rocBLAS/tensile_tag.txt + _tensile_commit_sha=$(cat $_repo_dir/tensile_tag.txt) + # Checkout the specific commit SHA + git checkout $_tensile_commit_sha + echo "Checking out Tensile commit $_tensile_commit_sha" cd $_cwd _local_tensile_opt="--test_local_path=$_tensile_repo_dir" + patchrepo $_tensile_repo_dir +fi + +# Check if rocBLAS is to be built with hipBLASLT +# It won't work unless hipBLASLT is already installed +ROCBLAS_USE_HIPBLASLT=${ROCBLAS_USE_HIPBLASLT:-0} +if [ $ROCBLAS_USE_HIPBLASLT == 0 ] ; then + echo + echo "WARNING: Building rocblas without hipBLASLT" + _local_hipblaslt_opt="--no_hipblaslt" fi patchrepo $_repo_dir @@ -45,9 +56,9 @@ for _arch in $GFXLIST ; do fi _sep=";" done -export CC=$AOMP_INSTALL_DIR/bin/hipcc -export CXX=$AOMP_INSTALL_DIR/bin/hipcc -export FC=gfortran +export CC=$LLVM_INSTALL_LOC/bin/clang +export CXX=$LLVM_INSTALL_LOC/bin/clang++ +export FC=$LLVM_INSTALL_LOC/bin/flang export ROCM_DIR=$AOMP_INSTALL_DIR export ROCM_PATH=$AOMP_INSTALL_DIR export PATH=$AOMP_SUPP/cmake/bin:$AOMP_INSTALL_DIR/bin:$PATH @@ -63,6 +74,11 @@ if [ "$AOMP_USE_CCACHE" != 0 ] ; then # export CMAKE_CXX_COMPILER_LAUNCHER=$_ccache_bin fi +# Set _build_type_option to Release or Debug based on BUILD_TYPE +if [ "$BUILD_TYPE" == "Debug" ] ; then + _build_type_option="--debug" +fi + if [ $AOMP_STANDALONE_BUILD == 1 ] ; then if [ ! -L $AOMP ] ; then if [ -d $AOMP ] ; then @@ -122,10 +138,11 @@ if [ "$1" != "install" ] ; then cd $_repo_dir _rmake_py_cmd="python3 ./rmake.py \ $_local_tensile_opt \ +$_local_hipblaslt_opt \ +$_build_type_option \ --install_invoked \ --build_dir $_build_dir \ --src_path=$_repo_dir \ ---no_tensile \ --jobs=$AOMP_JOB_THREADS \ --architecture="""$_gfxlist""" \ " @@ -148,8 +165,14 @@ fi if [ "$1" == "install" ] ; then echo " -----Installing to $AOMP_INSTALL_DIR ---- " - echo rsync -av $_build_dir/release/rocblas-install/ $AOMP_INSTALL_DIR/ - rsync -av $_build_dir/release/rocblas-install/ $AOMP_INSTALL_DIR/ + + if [ "$BUILD_TYPE" == "Release" ] ; then + _build_type_dir=release + else + _build_type_dir=debug + fi + echo rsync -av $_build_dir/$_build_type_dir/rocblas-install/ $AOMP_INSTALL_DIR/ + rsync -av $_build_dir/$_build_type_dir/rocblas-install/ $AOMP_INSTALL_DIR/ if [ $? != 0 ] ; then echo "ERROR copy to $AOMP_INSTALL_DIR failed " exit 1 @@ -158,6 +181,7 @@ if [ "$1" == "install" ] ; then echo "SUCCESSFUL INSTALL to $AOMP_INSTALL_DIR" echo removepatch $_repo_dir + removepatch $_tensile_repo_dir else echo echo "SUCCESSFUL BUILD, please run: $0 install" diff --git a/bin/rocmlibs/build_rocmlibs.sh b/bin/rocmlibs/build_rocmlibs.sh index fec252e2b..f8a22267b 100755 --- a/bin/rocmlibs/build_rocmlibs.sh +++ b/bin/rocmlibs/build_rocmlibs.sh @@ -99,7 +99,7 @@ components="prereq rocm-cmake" if [ "$AOMP_STANDALONE_BUILD" == 1 ] ; then # This ordered build is important when starting from scratch - components="$components rocblas rocprim rocsparse rocsolver hipblas" + components="$components rocblas rocprim rocsparse rocsolver hipblas-common hipblas" else echo "ERROR: Cannot run $0 with AOMP_STANDALONE_BUILD=$AOMP_STANDALONE_BUILD" echo " Please set $AOMP_STANDALONE_BUILD=1" diff --git a/bin/rocmlibs/clone_rocmlibs.sh b/bin/rocmlibs/clone_rocmlibs.sh index 8c3c4b534..c30ce8b28 100755 --- a/bin/rocmlibs/clone_rocmlibs.sh +++ b/bin/rocmlibs/clone_rocmlibs.sh @@ -47,6 +47,8 @@ if [ -d $repodirname ] ; then if [ "$STASH_BEFORE_PULL" == "YES" ] ; then git stash -u fi + echo "git checkout $COBRANCH" + git checkout $COBRANCH echo "git pull " git pull if [ $? != 0 ] && [ "$IGNORE_GIT_ERROR" != 1 ] ; then diff --git a/bin/rocmlibs/patches/patch-control-file_20.0.txt b/bin/rocmlibs/patches/patch-control-file_20.0.txt index 833aabf0f..7492deaff 100644 --- a/bin/rocmlibs/patches/patch-control-file_20.0.txt +++ b/bin/rocmlibs/patches/patch-control-file_20.0.txt @@ -1,6 +1,7 @@ +Tensile: tensile_aca95d17.patch rocBLAS: rocblas.patch rocPRIM: rocprim.patch rocSPARSE: rocsparse.patch rocSOLVER: rocsolver.patch hipBLAS: hipblas.patch -PowerInfer: powerinfer.patch +PowerInfer: powerinfer.patch \ No newline at end of file diff --git a/bin/rocmlibs/patches/rocblas.patch b/bin/rocmlibs/patches/rocblas.patch index 04cd768a4..02ee92342 100644 --- a/bin/rocmlibs/patches/rocblas.patch +++ b/bin/rocmlibs/patches/rocblas.patch @@ -6,8 +6,8 @@ index f70de1cf..5eedfcc3 100644 set( TARGET_LIST_ROCM_5.6 "gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102") set( TARGET_LIST_ROCM_5.7 "gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102") set( TARGET_LIST_ROCM_6.0 "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102") -- set( TARGET_LIST_ROCM_6.3 "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201") -+ set( TARGET_LIST_ROCM_6.3 "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx1103;gfx90c") +- set( TARGET_LIST_ROCM_6.3 "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201") ++ set( TARGET_LIST_ROCM_6.3 "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx1103;gfx90c") else() set( TARGET_LIST_ROCM_5.6 "gfx908:xnack+;gfx90a:xnack+") set( TARGET_LIST_ROCM_5.7 "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+") diff --git a/bin/rocmlibs/patches/rocprim.patch b/bin/rocmlibs/patches/rocprim.patch index 69578c606..3593c1d72 100644 --- a/bin/rocmlibs/patches/rocprim.patch +++ b/bin/rocmlibs/patches/rocprim.patch @@ -1,50 +1,38 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 68e9b7bf..049ed365 100644 +index a5b9b127..0153b24d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -91,7 +91,7 @@ if(NOT USE_HIP_CPU) - - if(GPU_TARGETS STREQUAL "all") - rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS -- TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151" -+ TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx90c;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1103;gfx1151" - ) - set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) - endif() -diff --git a/rmake.py b/rmake.py -index 265577f1..653fcd7b 100644 ---- a/rmake.py -+++ b/rmake.py -@@ -37,7 +37,7 @@ def parse_args(): - help='Install after build (default: False)') - parser.add_argument( '--cmake-darg', required=False, dest='cmake_dargs', action='append', default=[], - help='List of additional cmake defines for builds (e.g. CMAKE_CXX_COMPILER_LAUNCHER=ccache)') -- parser.add_argument('-a', '--architecture', dest='gpu_architecture', required=False, default="gfx906;gfx1030;gfx1100;gfx1101;gfx1102", #:sramecc+:xnack-" ) #gfx1030" ) #gfx906" ) # gfx1030" ) -+ parser.add_argument('-a', '--architecture', dest='gpu_architecture', required=False, default="gfx906;gfx1030;gfx1100;gfx1101;gfx1102;gfx1103", #:sramecc+:xnack-" ) #gfx1030" ) #gfx906" ) # gfx1030" ) - help='Set GPU architectures, e.g. all, gfx000, gfx803, gfx906:xnack-;gfx1030;gfx1100 (optional, default: all)') - parser.add_argument('-v', '--verbose', required=False, default=False, action='store_true', - help='Verbose build (default: False)') +@@ -99,7 +99,7 @@ if(NOT USE_HIP_CPU) + ) + else() + rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS +- TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" ++ TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx90c;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1103;gfx1151;gfx1200;gfx1201" + ) + endif() + diff --git a/rocprim/include/rocprim/device/config_types.hpp b/rocprim/include/rocprim/device/config_types.hpp -index 484db834..0f8a59fb 100644 +index 58729b1d..22f7272a 100644 --- a/rocprim/include/rocprim/device/config_types.hpp +++ b/rocprim/include/rocprim/device/config_types.hpp -@@ -169,8 +169,10 @@ enum class target_arch : unsigned int +@@ -169,9 +169,11 @@ enum class target_arch : unsigned int gfx906 = 906, gfx908 = 908, gfx90a = 910, + gfx90c = 912, gfx1030 = 1030, + gfx1100 = 1100, gfx1102 = 1102, + gfx1103 = 1103, unknown = std::numeric_limits::max(), }; #endif // DOXYGEN_SHOULD_SKIP_THIS -@@ -203,15 +205,17 @@ constexpr bool prefix_equals(const char* lhs, const char* rhs, std::size_t n) +@@ -204,16 +206,18 @@ constexpr bool prefix_equals(const char* lhs, const char* rhs, std::size_t n) constexpr target_arch get_target_arch_from_name(const char* const arch_name, const std::size_t n) { constexpr const char* target_names[] -- = {"gfx803", "gfx900", "gfx906", "gfx908", "gfx90a", "gfx1030", "gfx1102"}; -+ = {"gfx803", "gfx900", "gfx906", "gfx908", "gfx90a", "gfx90c", "gfx1030", "gfx1102", "gfx1103"}; +- = {"gfx803", "gfx900", "gfx906", "gfx908", "gfx90a", "gfx1030", "gfx1100", "gfx1102"}; ++ = {"gfx803", "gfx900", "gfx906", "gfx908", "gfx90a", "gfx90c", "gfx1030", "gfx1100", "gfx1102", "gfx1103"}; constexpr target_arch target_architectures[] = { target_arch::gfx803, target_arch::gfx900, @@ -53,12 +41,13 @@ index 484db834..0f8a59fb 100644 target_arch::gfx90a, + target_arch::gfx90c, target_arch::gfx1030, + target_arch::gfx1100, target_arch::gfx1102, + target_arch::gfx1103, }; static_assert(sizeof(target_names) / sizeof(target_names[0]) == sizeof(target_architectures) / sizeof(target_architectures[0]), -@@ -264,10 +268,14 @@ auto dispatch_target_arch(const target_arch target_arch) +@@ -266,12 +270,16 @@ auto dispatch_target_arch(const target_arch target_arch) return Config::template architecture_config::params; case target_arch::gfx90a: return Config::template architecture_config::params; @@ -66,6 +55,8 @@ index 484db834..0f8a59fb 100644 + return Config::template architecture_config::params; case target_arch::gfx1030: return Config::template architecture_config::params; + case target_arch::gfx1100: + return Config::template architecture_config::params; case target_arch::gfx1102: return Config::template architecture_config::params; + case target_arch::gfx1103: @@ -74,15 +65,14 @@ index 484db834..0f8a59fb 100644 assert(false && "Invalid target architecture selected at runtime."); } diff --git a/scripts/autotune/create_optimization.py b/scripts/autotune/create_optimization.py -index 57c2e0d6..e68f44f9 100755 +index 130bdb3c..171de162 100755 --- a/scripts/autotune/create_optimization.py +++ b/scripts/autotune/create_optimization.py @@ -41,7 +41,7 @@ from collections import defaultdict - from typing import Dict, List, Callable + from typing import Dict, List, Callable, Optional, Tuple from jinja2 import Environment, PackageLoader, select_autoescape --TARGET_ARCHITECTURES = ['gfx803', 'gfx900', 'gfx906', 'gfx908', 'gfx90a', 'gfx1030', 'gfx1102'] -+TARGET_ARCHITECTURES = ['gfx803', 'gfx900', 'gfx906', 'gfx908', 'gfx90a', 'gfx90c', 'gfx1030', 'gfx1102', 'gfx1103' ] +-TARGET_ARCHITECTURES = ['gfx803', 'gfx900', 'gfx906', 'gfx908', 'gfx90a', 'gfx1030', 'gfx1100', 'gfx1102'] ++TARGET_ARCHITECTURES = ['gfx803', 'gfx900', 'gfx906', 'gfx908', 'gfx90a', 'gfx90c', 'gfx1030', 'gfx1100', 'gfx1102', 'gfx1103'] # C++ typename used for optional types EMPTY_TYPENAME = "empty_type" - diff --git a/bin/rocmlibs/patches/rocsolver.patch b/bin/rocmlibs/patches/rocsolver.patch index d16cacafb..dfe4f9ba5 100644 --- a/bin/rocmlibs/patches/rocsolver.patch +++ b/bin/rocmlibs/patches/rocsolver.patch @@ -7,13 +7,13 @@ index 0f88788..95d5ac6 100644 gfx90a:xnack- gfx90a:xnack+ + gfx90c - gfx940 - gfx941 gfx942 gfx1100 gfx1101 gfx1102 + gfx1103 gfx1151 + gfx1200 + gfx1201 ) set(AMDGPU_TARGETS_INIT diff --git a/bin/rocmlibs/patches/rocsparse.patch b/bin/rocmlibs/patches/rocsparse.patch index dc88068ac..82dba4cc2 100644 --- a/bin/rocmlibs/patches/rocsparse.patch +++ b/bin/rocmlibs/patches/rocsparse.patch @@ -6,8 +6,8 @@ index 6a4f9d21..ff9d3635 100644 TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+") else() rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS -- TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201") -+ TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx90c;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1103;gfx1151;gfx1200;gfx1201") +- TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201") ++ TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx90c;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1103;gfx1151;gfx1200;gfx1201") endif() else() # Use target ID syntax if supported for AMDGPU_TARGETS diff --git a/bin/rocmlibs/patches/tensile_aca95d17.patch b/bin/rocmlibs/patches/tensile_aca95d17.patch new file mode 100644 index 000000000..7c3321a83 --- /dev/null +++ b/bin/rocmlibs/patches/tensile_aca95d17.patch @@ -0,0 +1,303 @@ +diff --git a/Tensile/AsmCaps.py b/Tensile/AsmCaps.py +index 548b31f2..de4c2dd5 100644 +--- a/Tensile/AsmCaps.py ++++ b/Tensile/AsmCaps.py +@@ -771,6 +771,50 @@ CACHED_ASM_CAPS = \ + 'v_mov_b64': False, + 'v_pk_fma_f16': True, + 'v_pk_fmac_f16': False}, ++ (11, 0, 3): {'HasAddLshl': True, ++ 'HasAtomicAdd': True, ++ 'HasDirectToLdsDest': False, ++ 'HasDirectToLdsNoDest': False, ++ 'HasExplicitCO': True, ++ 'HasExplicitNC': True, ++ 'HasGLCModifier': True, ++ 'HasNTModifier': False, ++ 'HasLshlOr': True, ++ 'HasMFMA': False, ++ 'HasMFMA_b8': False, ++ 'HasMFMA_bf16_1k': False, ++ 'HasMFMA_bf16_original': False, ++ 'HasMFMA_constSrc': False, ++ 'HasMFMA_f64': False, ++ 'HasMFMA_f8': False, ++ 'HasMFMA_i8_908': False, ++ 'HasMFMA_i8_940': False, ++ 'HasMFMA_vgpr': False, ++ 'HasMFMA_xf32': False, ++ 'HasSMulHi': True, ++ 'HasWMMA': True, ++ 'KernargPreloading': False, ++ 'MaxLgkmcnt': 15, ++ 'MaxVmcnt': 63, ++ 'SupportedISA': True, ++ 'SupportedSource': True, ++ 'VOP3v_dot4_i32_i8': False, ++ 'v_dot2_f32_f16': True, ++ 'v_dot2c_f32_f16': True, ++ 'v_dot4_i32_i8': False, ++ 'v_dot4c_i32_i8': False, ++ 'v_fma_f16': True, ++ 'v_fma_f32': True, ++ 'v_fma_f64': True, ++ 'v_fma_mix_f32': True, ++ 'v_fmac_f16': False, ++ 'v_fmac_f32': True, ++ 'v_mac_f16': False, ++ 'v_mac_f32': False, ++ 'v_mad_mix_f32': False, ++ 'v_mov_b64': False, ++ 'v_pk_fma_f16': True, ++ 'v_pk_fmac_f16': False}, + (11, 5, 1): {'HasAddLshl': True, + 'HasAtomicAdd': True, + 'HasDirectToLdsDest': False, +diff --git a/Tensile/Common.py b/Tensile/Common.py +index 66f2caa2..d4f79558 100644 +--- a/Tensile/Common.py ++++ b/Tensile/Common.py +@@ -253,7 +253,7 @@ globalParameters["SupportedISA"] = [(8,0,3), + (9,0,0), (9,0,6), (9,0,8), (9,0,10), + (9,4,0), (9,4,1), (9,4,2), + (10,1,0), (10,1,1), (10,1,2), (10,3,0), (10,3,1), +- (11,0,0), (11,0,1), (11,0,2), ++ (11,0,0), (11,0,1), (11,0,2), (11,0,3), + (11,5,1), + (12,0,0), (12,0,1)] # assembly kernels writer supports these architectures + +@@ -328,13 +328,13 @@ architectureMap = { + 'all':'_','gfx000':'none', 'gfx803':'r9nano', 'gfx900':'vega10', 'gfx900:xnack-':'vega10', + 'gfx906':'vega20', 'gfx906:xnack+':'vega20', 'gfx906:xnack-':'vega20', + 'gfx908':'arcturus','gfx908:xnack+':'arcturus', 'gfx908:xnack-':'arcturus', +- 'gfx90a':'aldebaran', 'gfx90a:xnack+':'aldebaran', 'gfx90a:xnack-':'aldebaran', ++ 'gfx90a':'aldebaran', 'gfx90a:xnack+':'aldebaran', 'gfx90a:xnack-':'aldebaran', 'gfx90c':'gfx90c', + 'gfx940':'aquavanjaram', 'gfx940:xnack+':'aquavanjaram', 'gfx940:xnack-':'aquavanjaram', + 'gfx941':'aquavanjaram941', 'gfx941:xnack+':'aquavanjaram941', 'gfx941:xnack-':'aquavanjaram941', + 'gfx942':'aquavanjaram942', 'gfx942:xnack+':'aquavanjaram942', 'gfx942:xnack-':'aquavanjaram942', + 'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14', + 'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1034':'navi24', 'gfx1035':'rembrandt', +- 'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33', ++ 'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33', 'gfx1103':'phoenix', + 'gfx1151':'gfx1151', + 'gfx1200':'gfx1200', + 'gfx1201':'gfx1201' +@@ -2461,7 +2461,7 @@ def assignGlobalParameters( config ): + if os.name == "nt": + globalParameters["CurrentISA"] = (9,0,6) + printWarning("Failed to detect ISA so forcing (gfx906) on windows") +- isasWithDisabledHWMonitor = ((9,4,1), (9,4,2), (11,0,0), (11,0,1), (11,0,2), (12,0,0), (12,0,1)) ++ isasWithDisabledHWMonitor = ((9,4,1), (9,4,2), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (12,0,0), (12,0,1)) + if globalParameters["CurrentISA"] in isasWithDisabledHWMonitor: + isaString = ', '.join(map(gfxName, isasWithDisabledHWMonitor)) + printWarning(f"HardwareMonitor currently disabled for {isaString}") +diff --git a/Tensile/Source/CMakeLists.txt b/Tensile/Source/CMakeLists.txt +index e02b209a..4d13ade3 100644 +--- a/Tensile/Source/CMakeLists.txt ++++ b/Tensile/Source/CMakeLists.txt +@@ -51,9 +51,9 @@ if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" OR CMAKE_CXX_COMPILER MATCHES ".*clang + endif() + + if(CMAKE_CXX_COMPILER STREQUAL "hipcc") +- set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906:xnack- gfx908:xnack- gfx90a:xnack- gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "GPU architectures") ++ set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906:xnack- gfx908:xnack- gfx90a:xnack- gfx90c gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 gfx1103 CACHE STRING "GPU architectures") + else() +- set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906 gfx908 gfx90a gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "GPU architectures") ++ set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906 gfx908 gfx90a gfx90c gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 gfx1103 CACHE STRING "GPU architectures") + endif() + + include(CMakeDependentOption) +diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp +index 0ab8ced5..0d7a5930 100644 +--- a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp ++++ b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp +@@ -60,6 +60,7 @@ namespace Tensile + gfx906 = 906, + gfx908 = 908, + gfx90a = 910, ++ gfx90c = 912, + gfx940 = 940, + gfx941 = 941, + gfx942 = 942, +@@ -73,7 +74,8 @@ namespace Tensile + gfx1035 = 1035, + gfx1100 = 1100, + gfx1101 = 1101, +- gfx1102 = 1102 ++ gfx1102 = 1102, ++ gfx1103 = 1103 + }; + + static std::string toString(Processor p) +@@ -90,6 +92,8 @@ namespace Tensile + return "gfx908"; + case AMDGPU::Processor::gfx90a: + return "gfx90a"; ++ case AMDGPU::Processor::gfx90c: ++ return "gfx90c"; + case AMDGPU::Processor::gfx940: + return "gfx940"; + case AMDGPU::Processor::gfx941: +@@ -118,6 +122,8 @@ namespace Tensile + return "gfx1101"; + case AMDGPU::Processor::gfx1102: + return "gfx1102"; ++ case AMDGPU::Processor::gfx1103: ++ return "gfx1103"; + } + return ""; + } +@@ -144,6 +150,10 @@ namespace Tensile + { + return AMDGPU::Processor::gfx90a; + } ++ else if(deviceString.find("gfx90c") != std::string::npos) ++ { ++ return AMDGPU::Processor::gfx90c; ++ } + else if(deviceString.find("gfx940") != std::string::npos) + { + return AMDGPU::Processor::gfx940; +@@ -184,6 +194,10 @@ namespace Tensile + { + return AMDGPU::Processor::gfx1102; + } ++ else if(deviceString.find("gfx1103") != std::string::npos) ++ { ++ return AMDGPU::Processor::gfx1103; ++ } + else + { + return static_cast(0); +diff --git a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp +index 10898ec2..f6b5305e 100644 +--- a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp ++++ b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp +@@ -44,6 +44,7 @@ namespace Tensile + gfx906, + gfx908, + gfx90a, ++ gfx90c, + gfx940, + gfx941, + gfx942, +@@ -58,6 +59,7 @@ namespace Tensile + gfx1100, + gfx1101, + gfx1102, ++ gfx1103, + All + }; + +@@ -78,6 +80,8 @@ namespace Tensile + return "TensileLibrary_*_gfx908"; + case LazyLoadingInit::gfx90a: + return "TensileLibrary_*_gfx90a"; ++ case LazyLoadingInit::gfx90c: ++ return "TensileLibrary_*_gfx90c"; + case LazyLoadingInit::gfx940: + return "TensileLibrary_*_gfx940"; + case LazyLoadingInit::gfx941: +@@ -106,6 +110,8 @@ namespace Tensile + return "TensileLibrary_*_gfx1101"; + case LazyLoadingInit::gfx1102: + return "TensileLibrary_*_gfx1102"; ++ case LazyLoadingInit::gfx1103: ++ return "TensileLibrary_*_gfx1103"; + case LazyLoadingInit::None: + return ""; + } +diff --git a/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp b/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp +index 87fc0d24..f0c7fef4 100644 +--- a/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp ++++ b/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp +@@ -218,6 +218,7 @@ namespace Tensile + iot::enumCase(io, value, "gfx906", AMDGPU::Processor::gfx906); + iot::enumCase(io, value, "gfx908", AMDGPU::Processor::gfx908); + iot::enumCase(io, value, "gfx90a", AMDGPU::Processor::gfx90a); ++ iot::enumCase(io, value, "gfx90c", AMDGPU::Processor::gfx90c); + iot::enumCase(io, value, "gfx940", AMDGPU::Processor::gfx940); + iot::enumCase(io, value, "gfx941", AMDGPU::Processor::gfx941); + iot::enumCase(io, value, "gfx942", AMDGPU::Processor::gfx942); +@@ -232,6 +233,7 @@ namespace Tensile + iot::enumCase(io, value, "gfx1100", AMDGPU::Processor::gfx1100); + iot::enumCase(io, value, "gfx1101", AMDGPU::Processor::gfx1101); + iot::enumCase(io, value, "gfx1102", AMDGPU::Processor::gfx1102); ++ iot::enumCase(io, value, "gfx1103", AMDGPU::Processor::gfx1103); + } + }; + +diff --git a/Tensile/Source/lib/source/ocl/OclUtils.cpp b/Tensile/Source/lib/source/ocl/OclUtils.cpp +index 8ee6d217..bd67cfd7 100644 +--- a/Tensile/Source/lib/source/ocl/OclUtils.cpp ++++ b/Tensile/Source/lib/source/ocl/OclUtils.cpp +@@ -148,6 +148,10 @@ namespace Tensile + { + return AMDGPU::Processor::gfx90a; + } ++ else if(deviceString.find("gfx90c") != std::string::npos) ++ { ++ return AMDGPU::Processor::gfx90c; ++ } + else if(deviceString.find("gfx940") != std::string::npos) + { + return AMDGPU::Processor::gfx940; +@@ -188,6 +192,10 @@ namespace Tensile + { + return AMDGPU::Processor::gfx1102; + } ++ else if(deviceString.find("gfx1103") != std::string::npos) ++ { ++ return AMDGPU::Processor::gfx1103; ++ } + else + { + return static_cast(0); +diff --git a/docs/src/cli-reference/TensileCreateLibrary.rst b/docs/src/cli-reference/TensileCreateLibrary.rst +index 6e22a2c7..09345113 100644 +--- a/docs/src/cli-reference/TensileCreateLibrary.rst ++++ b/docs/src/cli-reference/TensileCreateLibrary.rst +@@ -35,9 +35,9 @@ When invoking *TensileCreateLibrary*, one can provide zero or more options. + Architectures to generate a library for. When specifying multiple options, use quoted, semicolon delimited + architectures, e.g., --architecture='gfx908;gfx1012'. + Supported architectures include: all gfx000 gfx803 gfx900 gfx900:xnack- gfx906 gfx906:xnack+ gfx906:xnack- gfx908 gfx908:xnack+ +- gfx908:xnack- gfx90a gfx90a:xnack+ gfx90a:xnack- gfx940 gfx940:xnack+ gfx940:xnack- gfx941 gfx941:xnack+ ++ gfx908:xnack- gfx90a gfx90a:xnack+ gfx90a:xnack- gfx90c gfx940 gfx940:xnack+ gfx940:xnack- gfx941 gfx941:xnack+ + gfx941:xnack- gfx942 gfx942:xnack+ gfx942:xnack- gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 +- gfx1100 gfx1101 gfx1102. ++ gfx1100 gfx1101 gfx1102 gfx1103. + \-\-build-client + Build Tensile client executable; used for stand alone benchmarking (default). + \-\-client-config +diff --git a/pytest.ini b/pytest.ini +index 13c43039..23a53d35 100644 +--- a/pytest.ini ++++ b/pytest.ini +@@ -92,6 +92,7 @@ markers = + xfail-gfx906: architecture + xfail-gfx908: architecture + xfail-gfx90a: architecture ++ xfail-gfx90c: architecture + xfail-gfx940: architecture + xfail-gfx941: architecture + xfail-gfx942: architecture +@@ -106,11 +107,13 @@ markers = + xfail-gfx1100: architecture + xfail-gfx1101: architecture + xfail-gfx1102: architecture ++ xfail-gfx1103: architecture + skip-gfx000: architecture + skip-gfx900: architecture + skip-gfx906: architecture + skip-gfx908: architecture + skip-gfx90a: architecture ++ skip-gfx90c: architecture + skip-gfx940: architecture + skip-gfx941: architecture + skip-gfx942: architecture +@@ -125,4 +128,5 @@ markers = + skip-gfx1100: architecture + skip-gfx1101: architecture + skip-gfx1102: architecture ++ skip-gfx1103: architecture + skip-gfx1151: architecture diff --git a/bin/rocmlibs/rocmlibsi.xml b/bin/rocmlibs/rocmlibsi.xml index d877a1c61..97c785d7b 100644 --- a/bin/rocmlibs/rocmlibsi.xml +++ b/bin/rocmlibs/rocmlibsi.xml @@ -7,12 +7,12 @@ - - - - - - - + + + + + + +