Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hip support(draft) #1407

Draft
wants to merge 12 commits into
base: master
Choose a base branch
from
56 changes: 48 additions & 8 deletions .flake/pkgs/legion.nix
Original file line number Diff line number Diff line change
@@ -1,19 +1,37 @@
{ lib
, stdenv
, fetchFromGitLab
, cmake
, config
, python3
, cudaPackages ? { }
, cudaCapabilities ? [ "60" "70" "80" "86" ]
, rocmPackages ? { }
, maxDim ? 5
, useCuda ? config.cudaSupport
, useRocm ? config.rocmSupport
, stdenv ? if useCuda then cudaPackages.backendStdenv else rocmPackages.llvm.rocmClangStdenv
}:

# from https://codeberg.org/Uli/nix-things/src/commit/776519e382c81b136c1d0b10d8c7b52b4acb9192/overlays/cq/python/libclang-python.nix

let
cmakeFlag = x: if x then "1" else "0";

inherit (cudaPackages) cudatoolkit;
inherit (lib)
cmakeBool
cmakeFeature
optionals
;

cudaBuildInputs = with cudaPackages; [
cudatoolkit
];
rocmBuildInputs = with rocmPackages; [
clr
rocthrust
rocprim
llvm.clang
];
in

stdenv.mkDerivation rec {
Expand All @@ -35,19 +53,41 @@ stdenv.mkDerivation rec {
cmakeFlags = [
"-DLegion_USE_Python=1"
"-DLegion_BUILD_BINDINGS=1"
"-DLegion_USE_CUDA=1"
"-DLegion_CUDA_ARCH=${lib.concatStringsSep "," cudaCapabilities}"
"-DLegion_MAX_DIM=${toString maxDim}"
];
]
++ optionals useRocm [
# TODO: this is the legacy way of setting hip compiler. Once we update nixpkgs version we should use the new way. It will be a quick fix
(cmakeFeature "Legion_USE_HIP" "1")
(cmakeFeature "HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
(cmakeFeature "HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
(cmakeFeature "HIP_RUNTIME" "rocclr")
(cmakeFeature "HIP_PLATFORM" "amd")
(cmakeFeature "HIP_PATH" "${rocmPackages.clr}/hip")
(cmakeFeature "HIP_ROOT_DIR" "${rocmPackages.clr}")
(cmakeFeature "HIP_THRUST_ROOT_DIR" "${rocmPackages.rocthrust}")
(cmakeFeature "ROCM_PATH" "${rocmPackages.clr}")

(cmakeFeature "HIP_INCLUDE_DIRS" "${rocmPackages.clr}/hip/include")

(cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
(cmakeFeature "CMAKE_C_COMPILER" "hipcc")
]
++ optionals useCuda [
(cmakeFeature "Legion_USE_CUDA" "1")
(cmakeFeature "CMAKE_CUDA_ARCH" (builtins.concatStringsSep ";" cudaCapabilities))
];



buildInputs = [
python3
cudatoolkit
];
]
++ optionals useCuda cudaBuildInputs
++ optionals useRocm rocmBuildInputs;

meta = with lib; {
description = "Legion is a parallel programming model for distributed, heterogeneous machines";
homepage = "https://github.com/StanfordLegion/legion";
license = licenses.asl20;
};
}
}
30 changes: 19 additions & 11 deletions .proj.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,34 @@ project_name = "flexflow"
testsuite_macro = "FF_TEST_SUITE"
namespace_name = "FlexFlow"
header_extension = ".h"
fix_compile_commands = false

build_targets = [
"utils",
"op-attrs",
"kernels",
"pcg",
# "pcg",
# "substitutions",
# "compiler",
"substitution-generator",
"local-execution",
# "substitution-generator",
# "local-execution",
]
test_targets = [
"utils-tests",
"op-attrs-tests",
"pcg-tests",
# "utils-tests",
# "substitutions-tests",
# "compiler-tests",
"substitution-generator-tests",
# "pcg",
# "substitutions",
# "compiler",
# "substitution-generator",
]

[cmake_flags_extra]
FF_CUDA_ARCH = "60"
CMAKE_CUDA_ARCHITECTURES = "60"
FF_USE_HIP_ROCM = "ON"
FF_GPU_BACKEND = "hip_rocm"
# CMAKE_CUDA_ARCHITECTURES = "60"
CMAKE_HIP_ARCHITECTURES = "gfx900"
# HIP_PLATFORM = "amd"
# HIP_RUNTIME = "rocclr"
CMAKE_CXX_COMPILER = "hipcc"
CMAKE_C_COMPILER = "hipcc"

# FF_CUDA_ARCH = "60"
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
set(LIBEXT ".so")
endif()

include(cuda)
include(cudnn)
include(nccl)
# include(cuda)
# include(cudnn)
# include(nccl)
if (FF_USE_CODE_COVERAGE)
include(CodeCoverage)
append_coverage_compiler_flags()
Expand Down
68 changes: 58 additions & 10 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,14 @@
extra-substituters = [
"https://ff.cachix.org"
"https://cuda-maintainers.cachix.org/"
"https://llama-cpp.cachix.org"
"https://nixos-rocm.cachix.org/"
];
extra-trusted-public-keys = [
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
"ff.cachix.org-1:/kyZ0w35ToSJBjpiNfPLrL3zTjuPkUiqf2WH0GIShXM="
"nixos-rocm.cachix.org-1:VEpsf7pRIijjd8csKjFNBGzkBqOmw8H9PRmgAq14LnE="
"llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
];
};

Expand All @@ -29,11 +33,33 @@
pkgs = import nixpkgs {
inherit system;
config.allowUnfree = true;
config.rocmSupport = true;
};
lib = pkgs.lib;

inherit (pkgs.rocmPackages) clr miopen miopengemm rccl rocm-runtime;

rocm = pkgs.symlinkJoin {
name = "rocm";
paths = with pkgs.rocmPackages; [
rocm-thunk
rocm-runtime
rocm-device-libs
clr
hipcc
rccl
llvm.clang
miopen
miopengemm
miopen-hip
hipblas
rocm-cmake
hip-common
];
};

mkShell = pkgs.mkShell.override {
stdenv = pkgs.cudaPackages.backendStdenv;
stdenv = pkgs.rocmPackages.llvm.rocmClangStdenv;
};
in
{
Expand Down Expand Up @@ -61,7 +87,14 @@
devShells = rec {
ci = mkShell {
shellHook = ''
export HIP_COMPILER="${pkgs.rocmPackages.llvm.clang}/bin/clang"
export PATH="$HOME/ff/.scripts/:$PATH"
export ROCM_PATH=${clr}
export HIP_DEVICE_LIB_PATH="${pkgs.rocmPackages.rocm-device-libs}/amdgcn/bitcode"
# export HIP_ROOT_DIR=${clr}
# export HIP_PATH=${clr}/hip
# export HIP_INCLUDE_DIRS=${clr}/hip/include
echo "ROCm path set to: $ROCM_PATH"
'';

CMAKE_FLAGS = lib.strings.concatStringsSep " " [
Expand All @@ -76,6 +109,14 @@
"-DFF_USE_EXTERNAL_RANGEV3=ON"
"-DFF_USE_EXTERNAL_BOOST_PREPROCESSOR=ON"
"-DFF_USE_EXTERNAL_TYPE_INDEX=ON"

# hip related flags
"-DHIP_PLATFORM=amd"
# "-DHIP_RUNTIME=rocclr"
# "-DHIP_COMPILER=${pkgs.rocmPackages.llvm.clang}/bin/clang"
"-DHIP_PATH=${clr}/hip"
"-DHIP_ROOT_DIR=${clr}/hip"

];

RC_PARAMS = "max_discard_ratio=100";
Expand All @@ -92,21 +133,29 @@
ccache
pkg-config
python3
cudatoolkit
cudaPackages.cuda_nvcc
cudaPackages.cudnn
cudaPackages.nccl
cudaPackages.libcublas
cudaPackages.cuda_cudart
tl-expected
lcov # for code coverage
])
(with self.packages.${system}; [
legion
hpp2plantuml
rapidcheckFull
doctest
])
(with pkgs.rocmPackages; [
clr
miopen
miopengemm
rccl
rocm-runtime
hipblas
hipcc
hip-common
rocm-cmake
miopen-hip
rocm-thunk
rocm-device-libs
])
# [ rocm ]
];
};

Expand All @@ -129,7 +178,6 @@
compdb
jq
gh
lcov # for code coverage
])
(with proj-repo.packages.${system}; [
proj
Expand All @@ -152,4 +200,4 @@
};
}
);
}
}
53 changes: 45 additions & 8 deletions lib/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,50 @@
set(project_target kernels)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

message("rocm path: $ENV{ROCM_PATH}")

project(${project_target}
LANGUAGES CXX CUDA)
LANGUAGES CXX HIP)

message("rocm path after: $ENV{ROCM_PATH}")


# if (DEFINED ENV{ROCM_PATH})
# set(ROCM_PATH $ENV{ROCM_PATH})
# else()
# message(FATAL_ERROR "ROCM_PATH is not set")
# endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
if(CXX_IS_HIPCC)
if(LINUX)
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()

message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
" Prefer setting the HIP compiler directly. See README for details.")
endif()
else()
# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_ARGETS})
endif()
cmake_minimum_required(VERSION 3.21)
enable_language(HIP)
endif()


find_package(hip REQUIRED)
find_package(miopen REQUIRED)
find_package(rccl REQUIRED)

file(GLOB_RECURSE SRC
CONFIGURE_DEPENDS
LIST_DIRECTORIES False
src/*.cc
src/cuda/ops/*.cu
# src/*.cc
src/hip/concat_kernels.cpp
)

add_library(
Expand All @@ -25,15 +62,15 @@ target_include_directories(
target_link_libraries(
${project_target}
op-attrs
cuda
cudnn
nccl
MIOpen
hip::host
rccl
)

define_ff_vars(${project_target})

set_target_properties(
${project_target}
PROPERTIES
CUDA_STANDARD 17
)
HIP_STANDARD 17
)
18 changes: 9 additions & 9 deletions lib/kernels/include/kernels/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <cudnn.h>
#elif defined(FF_USE_HIP_ROCM)
#include <hip/hip_runtime.h>
#include <hipblas.h>
#include <hipblas/hipblas.h>
#include <miopen/miopen.h>
#else
#error "Unknown device"
Expand Down Expand Up @@ -57,21 +57,21 @@ typedef miopenTensorDescriptor_t ffTensorDescriptor_t;
typedef miopenActivationDescriptor_t ffActivationDescriptor_t;
typedef miopenPoolingDescriptor_t ffPoolingDescriptor_t;
typedef miopenBatchNormMode_t ffBatchNormMode_t;
typedef miopenFilterDescriptor_t ffFilterDescriptor_t;
typedef miopenTensorDescriptor_t ffFilterDescriptor_t;
typedef miopenConvolutionDescriptor_t ffConvolutionDescriptor_t;
typedef miopenConvolutionFwdAlgo_t ffConvolutionFwdAlgo_t;
typedef miopenConvolutionBwdFilterAlgo_t ffConvolutionBwdFilterAlgo_t;
typedef miopenConvolutionBwdDataAlgo_t ffConvolutionBwdDataAlgo_t;
// typedef miopenConvolutionFwdAlgo_t ffConvolutionFwdAlgo_t; //we don't have this one in miopen
// typedef miopenConvolutionBwdFilterAlgo_t ffConvolutionBwdFilterAlgo_t; // don't have this either
// typedef miopenConvolutionBwdDataAlgo_t ffConvolutionBwdDataAlgo_t;
typedef miopenDropoutDescriptor_t ffDropoutDescriptor_t;
typedef miopenOpTensorDescriptor_t ffOpTensorDescriptor_t;
typedef miopenTensorDescriptor_t ffOpTensorDescriptor_t; //don't have this either but will use miopenTensorDescriptor_t as a placeholder
typedef miopenReduceTensorDescriptor_t ffReduceTensorDescriptor_t;
typedef miopenAttnDescriptor_t ffAttnDescriptor_t;
typedef miopenSeqDataDescriptor_t ffSeqDataDescriptor_t;
// typedef miopenAttnDescriptor_t ffAttnDescriptor_t;
// typedef miopenSeqDataDescriptor_t ffSeqDataDescriptor_t;
typedef miopenHandle_t ffHandle_t;
typedef hipEvent_t ffEvent_t;
typedef hipblasHandle_t ffblasHandle_t;
typedef miopenStatus_t ffStatus_t;
typedef hipblasDataType_t ffDataType_t;
typedef hipblasDatatype_t ffDataType_t;
typedef miopenDataType_t ffCudnnDataType_t;
typedef hipError_t ffError_t;
#else
Expand Down
Loading
Loading