From cf52a942b55d3ee5230ae4327ba6c1b94bf37c26 Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Sat, 22 Apr 2023 22:34:22 +0200
Subject: [PATCH] Experimental AMDGPU implementation

---
 examples/amdgpu.jl            | 14 ++++++++++++++
 examples/{gpu.jl => cuda.jl}  |  0
 src/DFTK.jl                   |  9 ++++-----
 src/terms/xc.jl               | 22 +++++++++++-----------
 src/workarounds/gpu_arrays.jl | 10 ++++++++++
 src/workarounds/roc_arrays.jl |  1 +
 6 files changed, 40 insertions(+), 16 deletions(-)
 create mode 100644 examples/amdgpu.jl
 rename examples/{gpu.jl => cuda.jl} (100%)
 create mode 100644 src/workarounds/roc_arrays.jl

diff --git a/examples/amdgpu.jl b/examples/amdgpu.jl
new file mode 100644
index 0000000000..20161682e9
--- /dev/null
+++ b/examples/amdgpu.jl
@@ -0,0 +1,14 @@
+using DFTK
+using AMDGPU
+
+a = 10.26  # Silicon lattice constant in Bohr
+lattice = a / 2 * [[0 1 1.];
+                   [1 0 1.];
+                   [1 1 0.]]
+Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+atoms     = [Si, Si]
+positions = [ones(3)/8, -ones(3)/8]
+model = model_PBE(lattice, atoms, positions)
+
+basis  = PlaneWaveBasis(model; Ecut=30, kgrid=(5, 5, 5), architecture=DFTK.GPU(AMDGPU.ROCArray)
+scfres = self_consistent_field(basis; tol=1e-2, solver=scf_damping_solver())
diff --git a/examples/gpu.jl b/examples/cuda.jl
similarity index 100%
rename from examples/gpu.jl
rename to examples/cuda.jl
diff --git a/src/DFTK.jl b/src/DFTK.jl
index d1c4c048d4..211534c70b 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -230,15 +230,14 @@ function __init__()
     @require DoubleFloats="497a8b3b-efae-58df-a0af-a86822472b78" begin
         !isdefined(DFTK, :GENERIC_FFT_LOADED) && include("workarounds/fft_generic.jl")
     end
-    @require Plots="91a5bcdd-55d7-5caf-9e0b-520d859cae80" include("plotting.jl")
-    @require JLD2="033835bb-8acc-5ee8-8aae-3f567f8a3819"  include("external/jld2io.jl")
+    @require Plots="91a5bcdd-55d7-5caf-9e0b-520d859cae80"    include("plotting.jl")
+    @require JLD2="033835bb-8acc-5ee8-8aae-3f567f8a3819"     include("external/jld2io.jl")
     @require WriteVTK="64499a7a-5c06-52f2-abe2-ccb03c286192" include("external/vtkio.jl")
     @require wannier90_jll="c5400fa0-8d08-52c2-913f-1e3f656c1ce9" begin
         include("external/wannier90.jl")
     end
-    @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba"  begin
-        include("workarounds/cuda_arrays.jl")
-    end
+    @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba"   include("workarounds/cuda_arrays.jl")
+    @require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" include("workarounds/roc_arrays.jl")
 end
 
 # Precompilation block with a basic workflow
diff --git a/src/terms/xc.jl b/src/terms/xc.jl
index 06a9b2b92c..249e3c8fc0 100644
--- a/src/terms/xc.jl
+++ b/src/terms/xc.jl
@@ -79,9 +79,8 @@ function xc_potential_real(term::TermXc, basis::PlaneWaveBasis{T}, ψ, occupatio
     max_ρ_derivs = maximum(max_required_derivative, term.functionals)
     density = LibxcDensities(basis, max_ρ_derivs, ρ, τ)
 
-    # Evaluate terms and energy contribution (zk == energy per unit particle)
-    # It may happen that a functional does only provide a potenital and not an energy term
-    # Therefore skip_unsupported_derivatives=true to avoid an error.
+    # Evaluate terms and energy contribution
+    # If the XC functional is not supported for an architecture, terms is on the CPU
     terms = potential_terms(term.functionals, density)
     @assert haskey(terms, :Vρ) && haskey(terms, :e)
     E = term.scaling_factor * sum(terms.e) * basis.dvol
@@ -94,14 +93,14 @@ function xc_potential_real(term::TermXc, basis::PlaneWaveBasis{T}, ψ, occupatio
     # Potential contributions Vρ -2 ∇⋅(Vσ ∇ρ) + ΔVl
     potential = zero(ρ)
     @views for s in 1:n_spin
-        Vρ = reshape(terms.Vρ, n_spin, basis.fft_size...)
+        Vρ = to_device(basis.architecture, reshape(terms.Vρ, n_spin, basis.fft_size...))
 
         potential[:, :, :, s] .+= Vρ[s, :, :, :]
         if haskey(terms, :Vσ) && any(x -> abs(x) > potential_threshold, terms.Vσ)
             # Need gradient correction
             # TODO Drop do-block syntax here?
             potential[:, :, :, s] .+= -2divergence_real(basis) do α
-                Vσ = reshape(terms.Vσ, :, basis.fft_size...)
+                Vσ = to_device(basis.architecture, reshape(terms.Vσ, :, basis.fft_size...))
 
                 # Extra factor (1/2) for s != t is needed because libxc only keeps σ_{αβ}
                 # in the energy expression. See comment block below on spin-polarised XC.
@@ -113,7 +112,7 @@ function xc_potential_real(term::TermXc, basis::PlaneWaveBasis{T}, ψ, occupatio
         if haskey(terms, :Vl) && any(x -> abs(x) > potential_threshold, terms.Vl)
             @warn "Meta-GGAs with a Δρ term have not yet been thoroughly tested." maxlog=1
             mG² = .-norm2.(G_vectors_cart(basis))
-            Vl  = reshape(terms.Vl, n_spin, basis.fft_size...)
+            Vl  = to_device(basis.architecture, reshape(terms.Vl, n_spin, basis.fft_size...))
             Vl_fourier = fft(basis, Vl[s, :, :, :])
             potential[:, :, :, s] .+= irfft(basis, mG² .* Vl_fourier)  # ΔVl
         end
@@ -123,7 +122,7 @@ function xc_potential_real(term::TermXc, basis::PlaneWaveBasis{T}, ψ, occupatio
     Vτ = nothing
     if haskey(terms, :Vτ) && any(x -> abs(x) > potential_threshold, terms.Vτ)
         # Need meta-GGA non-local operator (Note: -½ part of the definition of DivAgrid)
-        Vτ = reshape(terms.Vτ, n_spin, basis.fft_size...)
+        Vτ = to_device(basis.architecture, reshape(terms.Vτ, n_spin, basis.fft_size...))
         Vτ = term.scaling_factor * permutedims(Vτ, (2, 3, 4, 1))
     end
 
@@ -379,10 +378,11 @@ function apply_kernel(term::TermXc, basis::PlaneWaveBasis{T}, δρ; ρ, kwargs..
         ]
     end
 
+    # If the XC functional is not supported for an architecture, terms is on the CPU
     terms = kernel_terms(term.functionals, density)
     δV = zero(ρ)  # [ix, iy, iz, iσ]
 
-    Vρρ = reshape(terms.Vρρ, n_spin, n_spin, basis.fft_size...)
+    Vρρ = to_device(basis.architecture, reshape(terms.Vρρ, n_spin, n_spin, basis.fft_size...))
     @views for s in 1:n_spin, t in 1:n_spin  # LDA term
         δV[:, :, :, s] .+= Vρρ[s, t, :, :, :] .* δρ[t, :, :, :]
     end
@@ -422,9 +422,9 @@ function add_kernel_gradient_correction!(δV, terms, density, perturbation, cros
     δρ  = perturbation.ρ_real
     ∇δρ = perturbation.∇ρ_real
     δσ  = cross_derivatives[:δσ]
-    Vρσ = reshape(terms.Vρσ, n_spin, spin_σ, basis.fft_size...)
-    Vσσ = reshape(terms.Vσσ, spin_σ, spin_σ, basis.fft_size...)
-    Vσ  = reshape(terms.Vσ,  spin_σ,         basis.fft_size...)
+    Vρσ = to_device(basis.architecture, reshape(terms.Vρσ, n_spin, spin_σ, basis.fft_size...))
+    Vσσ = to_device(basis.architecture, reshape(terms.Vσσ, spin_σ, spin_σ, basis.fft_size...))
+    Vσ  = to_device(basis.architecture, reshape(terms.Vσ,  spin_σ,         basis.fft_size...))
 
     T   = eltype(ρ)
     tσ  = DftFunctionals.spinindex_σ
diff --git a/src/workarounds/gpu_arrays.jl b/src/workarounds/gpu_arrays.jl
index 2bffa1f46a..ba32c8e2fa 100644
--- a/src/workarounds/gpu_arrays.jl
+++ b/src/workarounds/gpu_arrays.jl
@@ -11,3 +11,13 @@ function lowpass_for_symmetry!(ρ::AbstractGPUArray, basis; symmetries=basis.sym
     ρ_CPU = lowpass_for_symmetry!(to_cpu(ρ), basis; symmetries)
     ρ .= to_device(basis.architecture, ρ_CPU)
 end
+
+for fun in (:potential_terms, :kernel_terms)
+    @eval function DftFunctionals.$fun(fun::DispatchFunctional, ρ::AT,
+                                       args...) where {AT <: AbstractGPUArray{Float64}}
+        # Fallback implementation for the GPU: Transfer to the CPU and run computation there
+        cpuify(::Nothing) = nothing
+        cpuify(x::AbstractArray) = Array(x)
+        $fun(fun, Array(ρ), cpuify.(args)...)
+    end
+end
diff --git a/src/workarounds/roc_arrays.jl b/src/workarounds/roc_arrays.jl
new file mode 100644
index 0000000000..4c34875c7d
--- /dev/null
+++ b/src/workarounds/roc_arrays.jl
@@ -0,0 +1 @@
+synchronize_device(::GPU{<:AMDGPU.ROCArray}) = AMDGPU.Device.sync_workgroup()