Experimental AMDGPU support #858

Merged (15 commits) on Mar 1, 2025
8 changes: 5 additions & 3 deletions Project.toml
@@ -31,7 +31,6 @@ PkgVersion = "eebad327-c553-4316-9ea0-9fa01ccd7688"
Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
PseudoPotentialIO = "cb339c56-07fa-4cb2-923a-142469552264"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -46,6 +45,7 @@ Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
GenericLinearAlgebra = "14197337-ba66-59df-a3e3-ca00e7dcff7a"
GeometryOptimization = "673bf261-a53d-43b9-876f-d3c1fc8329c2"
@@ -59,6 +59,7 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
wannier90_jll = "c5400fa0-8d08-52c2-913f-1e3f656c1ce9"

[extensions]
DFTKAMDGPUExt = "AMDGPU"
DFTKCUDAExt = "CUDA"
DFTKGenericLinearAlgebraExt = "GenericLinearAlgebra"
DFTKGeometryOptimizationExt = "GeometryOptimization"
@@ -72,6 +73,7 @@ DFTKWannierExt = "Wannier"
DFTKWriteVTKExt = "WriteVTK"

[compat]
AMDGPU = "1.2.3"
ASEconvert = "0.1"
AbstractFFTs = "1"
Aqua = "0.8.5"
@@ -116,7 +118,6 @@ Plots = "1"
Polynomials = "3, 4"
PrecompileTools = "1"
Preferences = "1"
Primes = "0.5"
Printf = "1"
PseudoPotentialData = "0.2.2"
PseudoPotentialIO = "0.1"
@@ -140,6 +141,7 @@ julia = "1.10"
wannier90_jll = "3.1"

[extras]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
ASEconvert = "3da9722f-58c2-4165-81be-b4d7253e8fd2"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
AtomsBuilder = "f5cc8831-eeb7-4288-8d9f-d6c1ddb77004"
@@ -169,4 +171,4 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
wannier90_jll = "c5400fa0-8d08-52c2-913f-1e3f656c1ce9"

[targets]
test = ["Test", "TestItemRunner", "ASEconvert", "AtomsBuilder", "Aqua", "AtomsIO", "AtomsIOPython", "CUDA", "CUDA_Runtime_jll", "ComponentArrays", "DoubleFloats", "FiniteDiff", "FiniteDifferences", "GenericLinearAlgebra", "GeometryOptimization", "IntervalArithmetic", "JLD2", "JSON3", "Logging", "Plots", "PseudoPotentialData", "PythonCall", "QuadGK", "Random", "KrylovKit", "Wannier", "WriteVTK", "wannier90_jll"]
test = ["Test", "TestItemRunner", "AMDGPU", "ASEconvert", "AtomsBuilder", "Aqua", "AtomsIO", "AtomsIOPython", "CUDA", "CUDA_Runtime_jll", "ComponentArrays", "DoubleFloats", "FiniteDiff", "FiniteDifferences", "GenericLinearAlgebra", "GeometryOptimization", "IntervalArithmetic", "JLD2", "JSON3", "Logging", "Plots", "PseudoPotentialData", "PythonCall", "QuadGK", "Random", "KrylovKit", "Wannier", "WriteVTK", "wannier90_jll"]
16 changes: 16 additions & 0 deletions examples/amdgpu.jl
@@ -0,0 +1,16 @@
using AtomsBuilder
using DFTK
using AMDGPU
using PseudoPotentialData

model = model_DFT(bulk(:Si);
functionals=PBE(),
pseudopotentials=PseudoFamily("cp2k.nc.sr.pbe.v0_1.semicore.gth"))

# If available, use AMDGPU to store DFT quantities and perform main computations
architecture = has_rocm_gpu() ? DFTK.GPU(AMDGPU.ROCArray) : DFTK.CPU()

basis = PlaneWaveBasis(model; Ecut=20, kgrid=(1, 1, 1), architecture)

# Anderson acceleration does not yet work on AMD GPUs, hence the damped SCF solver
scfres = self_consistent_field(basis; tol=1e-2, solver=scf_damping_solver())
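
A quick way to confirm the extension actually activated before running expensive SCF steps (a hedged aside, not part of the example file; Base.get_extension is available on Julia 1.9+):

# Optional sanity check (assumption, not in the PR): verify DFTKAMDGPUExt loaded.
ext = Base.get_extension(DFTK, :DFTKAMDGPUExt)
@assert ext !== nothing  # fails if loading AMDGPU did not trigger the extension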
4 changes: 2 additions & 2 deletions examples/gpu.jl → examples/cuda.jl
@@ -5,10 +5,10 @@ using PseudoPotentialData

model = model_DFT(bulk(:Si);
functionals=PBE(),
pseudopotentials=PseudoFamily("cp2k.nc.sr.pbe.v0_1.semicore.gth"))
pseudopotentials=PseudoFamily("dojo.nc.sr.pbe.v0_4_1.standard.upf"))

# If available, use CUDA to store DFT quantities and perform main computations
architecture = has_cuda() ? DFTK.GPU(CuArray) : DFTK.CPU()

basis = PlaneWaveBasis(model; Ecut=30, kgrid=(5, 5, 5), architecture)
scfres = self_consistent_field(basis; tol=1e-2, solver=scf_damping_solver())
scfres = self_consistent_field(basis; tol=1e-6)
2 changes: 1 addition & 1 deletion examples/silicon.jl
@@ -6,7 +6,7 @@ a = 10.26 # Silicon lattice constant in Bohr
lattice = a / 2 * [[0 1 1.];
[1 0 1.];
[1 1 0.]]
Si = ElementPsp(:Si, PseudoFamily("cp2k.nc.sr.lda.v0_1.semicore.gth"))
Si = ElementPsp(:Si, PseudoFamily("dojo.nc.sr.lda.v0_4_1.standard.upf"))
atoms = [Si, Si]
positions = [ones(3)/8, -ones(3)/8]

15 changes: 15 additions & 0 deletions ext/DFTKAMDGPUExt.jl
@@ -0,0 +1,15 @@
module DFTKAMDGPUExt
using AMDGPU
using LinearAlgebra
import DFTK: GPU
using DFTK

DFTK.synchronize_device(::GPU{<:AMDGPU.ROCArray}) = AMDGPU.synchronize()
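# (blocks until all queued ROCm kernels and copies have completed)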

# Temporary workaround to not trigger https://github.com/JuliaGPU/AMDGPU.jl/issues/734
function LinearAlgebra.cholesky(A::Hermitian{T, <:AMDGPU.ROCArray}) where {T}
Acopy, info = AMDGPU.rocSOLVER.potrf!(A.uplo, copy(A.data))
    LinearAlgebra.Cholesky(Acopy, A.uplo, info)
end

end
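
A hedged sketch of how this extension gets exercised (not part of the PR): loading DFTK and AMDGPU together activates DFTKAMDGPUExt automatically, and the cholesky method above is hit whenever DFTK factorises a Hermitian GPU matrix, e.g. during orbital orthogonalisation. Assumes a working ROCm setup.

# Sketch only; assumes a ROCm-capable GPU is present.
using DFTK, AMDGPU, LinearAlgebra

A = randn(4, 4)
H = Hermitian(ROCArray(A * A' + 4I))  # SPD matrix moved to the GPU
C = cholesky(H)                       # dispatches to the potrf!-based method above
@assert C isa Cholesky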
ext/DFTKGenericLinearAlgebraExt/DFTKGenericLinearAlgebraExt.jl
@@ -39,9 +39,8 @@ function DFTK.build_fft_plans!(tmp::AbstractArray{<:Complex})
# opBFFT = inv(opFFT).p
opFFT = generic_plan_fft(tmp) # Fallback for now
opBFFT = generic_plan_bfft(tmp)
# TODO Can be cut once FourierTransforms supports AbstractFFTs properly
ipFFT = DummyInplace{typeof(opFFT)}(opFFT)
ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
ipFFT = DummyInplace(opFFT)
ipBFFT = DummyInplace(opBFFT)

ipFFT, opFFT, ipBFFT, opBFFT
end
@@ -51,7 +50,7 @@ struct GenericPlan{T}
factor::T
end

function Base.:*(p::GenericPlan, X::AbstractArray)
function Base.:*(p::GenericPlan, X::AbstractArray{T, 3}) where {T}
pl1, pl2, pl3 = p.subplans
ret = similar(X)
for i = 1:size(X, 1), j = 1:size(X, 2)
@@ -66,7 +65,7 @@ function Base.:*(p::GenericPlan, X::AbstractArray)
p.factor .* ret
end

LinearAlgebra.mul!(Y, p::GenericPlan, X) = Y .= p * X
LinearAlgebra.ldiv!(Y, p::GenericPlan, X) = Y .= p \ X

length(p::GenericPlan) = prod(length, p.subplans)
17 changes: 9 additions & 8 deletions ext/DFTKGenericLinearAlgebraExt/ctfft.jl
@@ -1,6 +1,5 @@
using Primes

# COV_EXCL_START
## import Primes

# These files are copied (and very slightly modified) from [FourierTransforms.jl](https://github.com/JuliaComputing/FourierTransforms.jl)
# commit [6a206bcfc8f49a129ca34beaf57d05cdc148dc8a](https://github.com/JuliaComputing/FourierTransforms.jl/tree/6a206bcfc8f49a129ca34beaf57d05cdc148dc8a).
@@ -618,12 +617,14 @@ function fft_factors(T::Type, n::Integer)
end
end
end
# get any leftover prime factors:
for (f,k) in factor(m)
for i=1:k
push!(factors, f)
end
end
@assert isone(m)
## Disabled to get rid of Primes dependency
## # get any leftover prime factors:
## for (f,k) in Primes.factor(m)
## for i=1:k
## push!(factors, f)
## end
## end
end
factors
end
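
For reference, the dropped Primes.factor fallback could be replicated without the dependency by plain trial division; a sketch under that assumption (not part of the PR, which instead asserts isone(m)):

# Hypothetical dependency-free stand-in for Primes.factor in the loop above.
function leftover_prime_factors(m::Integer)
    factors = Int[]
    d = 2
    while m > 1
        while m % d == 0        # strip each prime divisor completely
            push!(factors, d)
            m = div(m, d)
        end
        d += (d == 2 ? 1 : 2)   # after 2, only odd candidates can be prime
    end
    factors
end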
6 changes: 5 additions & 1 deletion ext/DFTKIntervalArithmeticExt.jl
@@ -11,6 +11,10 @@ import SpecialFunctions: erfc
# ... this is far from proper and a bit specific for our use case here
# (that's why it's not contributed upstream).
# should be done e.g. by changing the rounding mode ...
#
# Could be made more rigorous by using the core-math C library, which
# provides an interval-rounding version.
#
erfc(i::Interval) = Interval(prevfloat(erfc(i.lo)), nextfloat(erfc(i.hi)))
Base.nextfloat(x::Interval) = Interval(nextfloat(x.lo), nextfloat(x.hi))
Base.prevfloat(x::Interval) = Interval(prevfloat(x.lo), prevfloat(x.hi))
@@ -20,7 +24,7 @@ Base.prevfloat(x::Interval) = Interval(prevfloat(x.lo), prevfloat(x.hi))
# see issue #513 on IntervalArithmetic repository
DFTK.cis2pi(x::Interval) = exp(2 * (pi * (im * x)))

DFTK.value_type(::Type{<:Interval{T}}) where {T} = T
DFTK.value_type(::Type{<:Interval{T}}) where {T} = T # IntervalArithmetic.numtype(T)

function compute_Glims_fast(lattice::AbstractMatrix{<:Interval}, args...; kwargs...)
# This is done to avoid a call like ceil(Int, ::Interval)
10 changes: 6 additions & 4 deletions src/fft.jl
@@ -116,7 +118,8 @@ function ifft!(f_real::AbstractArray3, fft_grid::FFTGrid,
fill!(f_real, 0)
f_real[Gvec_mapping] = f_fourier

mul!(f_real, fft_grid.ipBFFT, f_real) # perform IFFT
# Perform iFFT, equivalent to mul!(f_real, fft_grid.ipBFFT, f_real)
f_real = fft_grid.ipBFFT * f_real
normalize && (f_real .*= fft_grid.ifft_normalization)
f_real
end
@@ -165,8 +166,8 @@ function fft!(f_fourier::AbstractVector, fft_grid::FFTGrid,
@assert size(f_real) == fft_grid.fft_size
@assert length(f_fourier) == length(Gvec_mapping)

# FFT
mul!(f_real, fft_grid.ipFFT, f_real)
# Perform FFT, equivalent to mul!(f_real, fft_grid.ipFFT, f_real)
f_real = fft_grid.ipFFT * f_real

# Truncate
f_fourier .= view(f_real, Gvec_mapping)
@@ -365,7 +366,8 @@ function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Union{Float3
end

# TODO Some grid sizes are broken in the generic FFT implementation
# in FourierTransforms, for more details see workarounds/fft_generic.jl
# in FourierTransforms, for more details see
# ext/DFTKGenericLinearAlgebraExt/DFTKGenericLinearAlgebraExt.jl
default_primes(::Type{Float32}) = (2, 3, 5)
default_primes(::Type{Float64}) = default_primes(Float32)
next_working_fft_size(::Type{Float32}, size::Int) = size
9 changes: 3 additions & 6 deletions src/workarounds/dummy_inplace_fft.jl
@@ -1,12 +1,9 @@
# A dummy wrapper around an out-of-place FFT plan to make it appear in-place
# This is needed for some generic FFT implementations, which do not have in-place plans
# This is needed for some FFT implementations, which do not have in-place plans
struct DummyInplace{opFFT}
fft::opFFT
end
LinearAlgebra.mul!(Y, p::DummyInplace, X) = copy!(Y, p * X)
LinearAlgebra.ldiv!(Y, p::DummyInplace, X) = copy!(Y, p \ X)

import Base: *, \, length
*(p::DummyInplace, X) = p.fft * X
\(p::DummyInplace, X) = p.fft \ X
import Base: *, length
Base.:*(p::DummyInplace, X) = copy!(X, p.fft * X)
length(p::DummyInplace) = length(p.fft)
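
To illustrate the new semantics, a self-contained sketch (assumes FFTW; the struct is re-declared locally since DummyInplace is internal to DFTK): the wrapper now overwrites its input, which is what the fft!/ifft! call sites in src/fft.jl rely on.

# Minimal sketch of the new behaviour; assumes FFTW is installed.
using FFTW

struct DummyInplace{opFFT}          # local copy of the struct above
    fft::opFFT
end
Base.:*(p::DummyInplace, X) = copy!(X, p.fft * X)

x = randn(ComplexF64, 8)
p = DummyInplace(plan_fft(similar(x)))
y = copy(x)
p * y                               # overwrites y in place
@assert y ≈ fft(x)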
4 changes: 2 additions & 2 deletions src/workarounds/forwarddiff_rules.jl
@@ -50,8 +50,8 @@ end
function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Dual}
opFFT = AbstractFFTs.plan_fft(tmp)
opBFFT = AbstractFFTs.plan_bfft(tmp)
ipFFT = DummyInplace{typeof(opFFT)}(opFFT)
ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
ipFFT = DummyInplace(opFFT)
ipBFFT = DummyInplace(opBFFT)
ipFFT, opFFT, ipBFFT, opBFFT
end

1 change: 0 additions & 1 deletion test/aqua.jl
@@ -13,6 +13,5 @@
DFTK.potential_terms, DFTK.kernel_terms]),
piracies=false,
deps_compat=(; check_extras=(; ignore=[:CUDA_Runtime_jll])),
stale_deps=(; ignore=[:Primes, ]),
)
end
77 changes: 49 additions & 28 deletions test/gpu.jl
@@ -1,9 +1,18 @@
# These are not yet the best tests, but just to ensure our GPU support
# does not just break randomly
#
# TODO Also test other cases on AMDGPU
# TODO Refactor this file to reduce code duplication
#
# TODO Test Hamiltonian application on GPU
# TODO Direct minimisation
# TODO Float32
# TODO meta GGA

@testitem "CUDA silicon functionality test" tags=[:gpu] setup=[TestCases] begin
@testitem "GPU silicon functionality test" tags=[:gpu] setup=[TestCases] begin
using DFTK
using CUDA
using AMDGPU
using LinearAlgebra
silicon = TestCases.silicon

@@ -14,12 +23,27 @@
self_consistent_field(basis; tol=1e-9, solver=scf_damping_solver(damping=1.0))
end

# Bad copy and paste for now ... think of something more clever later
scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-9
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-9
# Test that forces compute: symmetric structure, forces are zero
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_gpu)) < 1e-10
@testset "CUDA" begin
if CUDA.has_cuda() && CUDA.has_cuda_gpu()
scfres_cuda = run_problem(; architecture=DFTK.GPU(CuArray))
@test scfres_cuda.ρ isa CuArray
@test abs(scfres_cpu.energies.total - scfres_cuda.energies.total) < 1e-9
@test norm(scfres_cpu.ρ - Array(scfres_cuda.ρ)) < 1e-9
@test norm(compute_forces(scfres_cuda)) < 1e-6 # Symmetric structure
end
end

@testset "AMDGPU" begin
if AMDGPU.has_rocm_gpu()
scfres_rocm = run_problem(; architecture=DFTK.GPU(ROCArray))
@test scfres_rocm.ρ isa ROCArray
@test abs(scfres_cpu.energies.total - scfres_rocm.energies.total) < 1e-9
@test norm(scfres_cpu.ρ - Array(scfres_rocm.ρ)) < 1e-9
@test norm(compute_forces(scfres_rocm)) < 1e-6 # Symmetric structure
end
end
end

@testitem "CUDA iron functionality test" tags=[:gpu] setup=[TestCases] begin
@@ -37,24 +61,26 @@ end
ρ = guess_density(basis, magnetic_moments)

# TODO Bump tolerance a bit here ... still leads to NaNs unfortunately
self_consistent_field(basis; ρ, tol=1e-7, mixing=KerkerMixing(),
solver=scf_damping_solver(damping=1.0))
self_consistent_field(basis; ρ, tol=1e-7, mixing=KerkerMixing())
end

scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-7
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-6
# Test that forces compute: symmetric structure, forces are zero
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_gpu)) < 1e-9
if CUDA.has_cuda() && CUDA.has_cuda_gpu()
ArrayType = CuArray
scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_cuda = run_problem(; architecture=DFTK.GPU(ArrayType))
@test abs(scfres_cpu.energies.total - scfres_cuda.energies.total) < 1e-7
@test norm(scfres_cpu.ρ - Array(scfres_cuda.ρ)) < 1e-6
# Test that forces compute: symmetric structure, forces are zero
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_cuda)) < 1e-9
end
end

@testitem "CUDA aluminium forces test" tags=[:gpu] setup=[TestCases] begin
using DFTK
using CUDA
using LinearAlgebra
aluminium = TestCases.aluminium
positions = aluminium.positions
positions = copy(aluminium.positions)
# Perturb equilibrium position for non-zero forces
positions[1] += [0.01, 0.0, -0.01]

@@ -68,17 +94,12 @@ end
self_consistent_field(basis; tol=1e-10)
end

scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-10
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-8
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_gpu)) < 1e-7
if CUDA.has_cuda() && CUDA.has_cuda_gpu()
ArrayType = CuArray
scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_cuda = run_problem(; architecture=DFTK.GPU(ArrayType))
@test abs(scfres_cpu.energies.total - scfres_cuda.energies.total) < 1e-10
@test norm(scfres_cpu.ρ - Array(scfres_cuda.ρ)) < 1e-8
@test norm(compute_forces(scfres_cpu) - compute_forces(scfres_cuda)) < 1e-7
end
end


# TODO Test hamiltonian application on GPU
# TODO Direct minimisation
# TODO Float32
# TODO meta GGA
# TODO Aluminium with LdosMixing
# TODO Anderson acceleration
1 change: 1 addition & 0 deletions test/runtests.jl
@@ -16,6 +16,7 @@ runfile = joinpath(@__DIR__, "runtests_runner.jl")
# Trigger some precompilation and build steps
using ASEconvert
using CUDA
using AMDGPU
using DFTK
using Interpolations
