Update the documentation for the generic API #26

Merged
merged 1 commit on Mar 28, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
 Manifest.toml
 /LocalPreferences.toml
+docs/build
9 changes: 9 additions & 0 deletions README.md
@@ -27,6 +27,15 @@ pkg> add CUDSS
 pkg> test CUDSS
 ```
 
+## Content
+
+CUDSS.jl provides a structured interface to the NVIDIA cuDSS library.
+It introduces the `CudssSolver` type along with three core routines: `cudss`, `cudss_set`, and `cudss_get`.
+Additionally, specialized methods of `cholesky`, `ldlt`, `lu` and `\` are provided for the `CuSparseMatrixCSR` type.
+In-place variants `cholesky!`, `ldlt!`, `lu!` and `ldiv!` are implemented as well.
+These variants reuse the symbolic factorization as well as the storage of the factors.
+This ensures efficient solution of sparse linear systems on GPUs.
+
 ## Examples
 
 ### Example 1: Sparse unsymmetric linear system with one right-hand side
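
As context for the `## Content` section added above: the three core routines drive the phases of cuDSS on a `CudssSolver` object. A minimal sketch of how they compose, assuming a CUDA-capable GPU (illustrative only, not part of this diff):

```julia
using CUDA, CUDA.CUSPARSE
using CUDSS
using SparseArrays

T = Float64
n = 100
A_gpu = CuSparseMatrixCSR(sprand(T, n, n, 0.05) + I)
x_gpu = CuVector{T}(undef, n)
b_gpu = CuVector(rand(T, n))

# "G": general (unsymmetric) matrix, 'F': the full matrix is stored.
solver = CudssSolver(A_gpu, "G", 'F')

# The three phases of cuDSS, all driven by the core routine `cudss`.
cudss("analysis", solver, x_gpu, b_gpu)       # symbolic analysis
cudss("factorization", solver, x_gpu, b_gpu)  # numerical factorization
cudss("solve", solver, x_gpu, b_gpu)          # x_gpu now holds the solution
```
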
2 changes: 2 additions & 0 deletions docs/Project.toml
@@ -1,6 +1,8 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [compat]
 Documenter = "1.0"
2 changes: 2 additions & 0 deletions docs/make.jl
@@ -1,4 +1,6 @@
 using Documenter, CUDSS
+using LinearAlgebra
+using CUDA, CUDA.CUSPARSE
 
 makedocs(
   modules = [CUDSS],
121 changes: 67 additions & 54 deletions docs/src/generic.md
@@ -1,41 +1,61 @@
-# Examples
+## LLᵀ and LLᴴ
 
-## Example 1: Sparse unsymmetric linear system with one right-hand side
+```@docs
+LinearAlgebra.cholesky(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: LinearAlgebra.BlasFloat
+LinearAlgebra.cholesky!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat
+```
 
 ```julia
 using CUDA, CUDA.CUSPARSE
 using CUDSS
 using LinearAlgebra
 using SparseArrays
 
-T = Float64
+T = ComplexF64
+R = real(T)
 n = 100
-A_cpu = sprand(T, n, n, 0.05) + I
-b_cpu = rand(T, n)
+p = 5
+A_cpu = sprand(T, n, n, 0.01)
+A_cpu = A_cpu * A_cpu' + I
+B_cpu = rand(T, n, p)
 
-A_gpu = CuSparseMatrixCSR(A_cpu)
-b_gpu = CuVector(b_cpu)
+A_gpu = CuSparseMatrixCSR(A_cpu |> triu)
+B_gpu = CuMatrix(B_cpu)
+X_gpu = similar(B_gpu)
 
-F = lu(A_gpu)
-x_gpu = F \ b_gpu
+F = cholesky(A_gpu, view='U')
+X_gpu = F \ B_gpu
 
-r_gpu = b_gpu - A_gpu * x_gpu
-norm(r_gpu)
+R_gpu = B_gpu - CuSparseMatrixCSR(A_cpu) * X_gpu
+norm(R_gpu)
 
-# In-place LU
-d_gpu = rand(T, n) |> CuVector
-B_gpu = A_gpu + Diagonal(d_gpu)
-lu!(F, B_gpu)
+# In-place LLᴴ
+d_gpu = rand(R, n) |> CuVector
+A_gpu = A_gpu + Diagonal(d_gpu)
+cholesky!(F, A_gpu)
 
-c_cpu = rand(T, n)
-c_gpu = CuVector(c_cpu)
-ldiv!(x_gpu, F, c_gpu)
+C_cpu = rand(T, n, p)
+C_gpu = CuMatrix(C_cpu)
+ldiv!(X_gpu, F, C_gpu)
 
-r_gpu = c_gpu - B_gpu * x_gpu
-norm(r_gpu)
+R_gpu = C_gpu - ( CuSparseMatrixCSR(A_cpu) + Diagonal(d_gpu) ) * X_gpu
+norm(R_gpu)
 ```
 
-## Example 2: Sparse symmetric linear system with multiple right-hand sides
+!!! note
+    If we only store one triangle of `A_gpu`, we can also use the wrappers `Symmetric` and `Hermitian` instead of using the keyword argument `view` in `cholesky`. For real matrices, both wrappers are allowed but only `Hermitian` can be used for complex matrices.
+
+```julia
+H_gpu = Hermitian(A_gpu, :U)
+F = cholesky(H_gpu)
+```
+
+## LDLᵀ and LDLᴴ
+
+```@docs
+LinearAlgebra.ldlt(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: LinearAlgebra.BlasFloat
+LinearAlgebra.ldlt!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat
+```
 
 ```julia
 using CUDA, CUDA.CUSPARSE
@@ -63,8 +83,8 @@ norm(R_gpu)
 
 # In-place LDLᵀ
 d_gpu = rand(R, n) |> CuVector
-B_gpu = A_gpu + Diagonal(d_gpu)
-ldlt!(F, B_gpu)
+A_gpu = A_gpu + Diagonal(d_gpu)
+ldlt!(F, A_gpu)
 
 C_cpu = rand(T, n, p)
 C_gpu = CuMatrix(C_cpu)
@@ -82,49 +102,42 @@ S_gpu = Symmetric(A_gpu, :L)
 F = ldlt(S_gpu)
 ```
 
-## Example 3: Sparse hermitian positive definite linear system with multiple right-hand sides
+## LU
 
+```@docs
+LinearAlgebra.lu(A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat
+LinearAlgebra.lu!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat
+```
+
 ```julia
 using CUDA, CUDA.CUSPARSE
 using CUDSS
 using LinearAlgebra
 using SparseArrays
 
-T = ComplexF64
-R = real(T)
+T = Float64
 n = 100
-p = 5
-A_cpu = sprand(T, n, n, 0.01)
-A_cpu = A_cpu * A_cpu' + I
-B_cpu = rand(T, n, p)
+A_cpu = sprand(T, n, n, 0.05) + I
+b_cpu = rand(T, n)
 
-A_gpu = CuSparseMatrixCSR(A_cpu |> triu)
-B_gpu = CuMatrix(B_cpu)
-X_gpu = similar(B_gpu)
+A_gpu = CuSparseMatrixCSR(A_cpu)
+b_gpu = CuVector(b_cpu)
 
-F = cholesky(A_gpu, view='U')
-X_gpu = F \ B_gpu
+F = lu(A_gpu)
+x_gpu = F \ b_gpu
 
-R_gpu = B_gpu - CuSparseMatrixCSR(A_cpu) * X_gpu
-norm(R_gpu)
+r_gpu = b_gpu - A_gpu * x_gpu
+norm(r_gpu)
 
-# In-place LLᴴ
-d_gpu = rand(R, n) |> CuVector
-B_gpu = A_gpu + Diagonal(d_gpu)
-cholesky!(F, B_gpu)
+# In-place LU
+d_gpu = rand(T, n) |> CuVector
+A_gpu = A_gpu + Diagonal(d_gpu)
+lu!(F, A_gpu)
 
-C_cpu = rand(T, n, p)
-C_gpu = CuMatrix(C_cpu)
-ldiv!(X_gpu, F, C_gpu)
+c_cpu = rand(T, n)
+c_gpu = CuVector(c_cpu)
+ldiv!(x_gpu, F, c_gpu)
 
-R_gpu = C_gpu - ( CuSparseMatrixCSR(A_cpu) + Diagonal(d_gpu) ) * X_gpu
-norm(R_gpu)
-```
-
-!!! note
-    If we only store one triangle of `A_gpu`, we can also use the wrappers `Symmetric` and `Hermitian` instead of using the keyword argument `view` in `cholesky`. For real matrices, both wrappers are allowed but only `Hermitian` can be used for complex matrices.
-
-```julia
-H_gpu = Hermitian(A_gpu, :U)
-F = cholesky(H_gpu)
+r_gpu = c_gpu - A_gpu * x_gpu
+norm(r_gpu)
 ```
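
The factorizations returned by these wrappers are plain `CudssSolver` objects, so the core routines still apply to them. A hedged sketch of adjusting a cuDSS parameter between factorizations: `"pivot_type"` is the only parameter name appearing in this PR (see `src/generic.jl` below), and updating it after construction is an assumption rather than documented behavior.

```julia
# F is the CudssSolver returned by the generic API (lower triangle stored).
F = ldlt(A_gpu, view='L')

# `cudss_set(solver, "pivot_type", 'N')` is used in src/generic.jl below;
# here we apply it to the same solver object before refactorizing.
cudss_set(F, "pivot_type", 'N')  # 'N' disables pivoting (assumption)
ldlt!(F, A_gpu)                  # refactorize with the new setting
X_gpu = F \ B_gpu
```
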
2 changes: 1 addition & 1 deletion src/CUDSS.jl
@@ -14,7 +14,7 @@ else
 end
 
 import CUDA: @checked, libraryPropertyType, cudaDataType, initialize_context, retry_reclaim, CUstream
-import LinearAlgebra: lu, lu!, ldlt, ldlt!, cholesky, cholesky!, ldiv!, BlasFloat, BlasReal
+import LinearAlgebra: lu, lu!, ldlt, ldlt!, cholesky, cholesky!, ldiv!, BlasFloat, BlasReal, checksquare
 import Base: \
 
 include("libcudss.jl")
110 changes: 96 additions & 14 deletions src/generic.jl
@@ -1,5 +1,19 @@
+"""
+    solver = lu(A::CuSparseMatrixCSR{T,Cint})
+
+Compute the LU factorization of a sparse matrix `A` on an NVIDIA GPU.
+The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.
+
+#### Input argument
+
+* `A`: a sparse square matrix stored in the `CuSparseMatrixCSR` format.
+
+#### Output argument
+
+* `solver`: an opaque structure [`CudssSolver`](@ref) that stores the factors of the LU decomposition.
+"""
 function LinearAlgebra.lu(A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
-  n = LinearAlgebra.checksquare(A)
+  n = checksquare(A)
   solver = CudssSolver(A, "G", 'F')
   x = CudssMatrix(T, n)
   b = CudssMatrix(T, n)
@@ -8,8 +22,41 @@ function LinearAlgebra.lu(A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
   return solver
 end
 
+"""
+    solver = lu!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint})
+
+Compute the LU factorization of a sparse matrix `A` on an NVIDIA GPU, reusing the symbolic factorization stored in `solver`.
+The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.
+"""
+function LinearAlgebra.lu!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
+  n = checksquare(A)
+  cudss_set(solver, A)
+  x = CudssMatrix(T, n)
+  b = CudssMatrix(T, n)
+  cudss("factorization", solver, x, b)
+  return solver
+end
+
+"""
+    solver = ldlt(A::CuSparseMatrixCSR{T,Cint}; view::Char='F')
+
+Compute the LDLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU.
+The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.
+
+#### Input argument
+
+* `A`: a sparse Hermitian matrix stored in the `CuSparseMatrixCSR` format.
+
+#### Keyword argument
+
+* `view`: a character that specifies which triangle of the sparse matrix is provided. Possible options are `'L'` for the lower triangle, `'U'` for the upper triangle, and `'F'` for the full matrix.
+
+#### Output argument
+
+* `solver`: an opaque structure [`CudssSolver`](@ref) that stores the factors of the LDLᴴ decomposition.
+"""
 function LinearAlgebra.ldlt(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: BlasFloat
-  n = LinearAlgebra.checksquare(A)
+  n = checksquare(A)
   structure = T <: Real ? "S" : "H"
   solver = CudssSolver(A, structure, view)
   (T <: Complex) && cudss_set(solver, "pivot_type", 'N')
@@ -23,8 +70,41 @@ end
 LinearAlgebra.ldlt(A::Symmetric{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasReal = LinearAlgebra.ldlt(A.data, view=A.uplo)
 LinearAlgebra.ldlt(A::Hermitian{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasFloat = LinearAlgebra.ldlt(A.data, view=A.uplo)
 
+"""
+    solver = ldlt!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint})
+
+Compute the LDLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU, reusing the symbolic factorization stored in `solver`.
+The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.
+"""
+function LinearAlgebra.ldlt!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
+  n = checksquare(A)
+  cudss_set(solver, A)
+  x = CudssMatrix(T, n)
+  b = CudssMatrix(T, n)
+  cudss("factorization", solver, x, b)
+  return solver
+end
+
+"""
+    solver = cholesky(A::CuSparseMatrixCSR{T,Cint}; view::Char='F')
+
+Compute the LLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU.
+The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.
+
+#### Input argument
+
+* `A`: a sparse Hermitian positive definite matrix stored in the `CuSparseMatrixCSR` format.
+
+#### Keyword argument
+
+* `view`: a character that specifies which triangle of the sparse matrix is provided. Possible options are `'L'` for the lower triangle, `'U'` for the upper triangle, and `'F'` for the full matrix.
+
+#### Output argument
+
+* `solver`: an opaque structure [`CudssSolver`](@ref) that stores the factors of the LLᴴ decomposition.
+"""
 function LinearAlgebra.cholesky(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: BlasFloat
-  n = LinearAlgebra.checksquare(A)
+  n = checksquare(A)
   structure = T <: Real ? "SPD" : "HPD"
   solver = CudssSolver(A, structure, view)
   x = CudssMatrix(T, n)
@@ -37,17 +117,19 @@ end
 LinearAlgebra.cholesky(A::Symmetric{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasReal = LinearAlgebra.cholesky(A.data, view=A.uplo)
 LinearAlgebra.cholesky(A::Hermitian{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasFloat = LinearAlgebra.cholesky(A.data, view=A.uplo)
 
-for fun in (:lu!, :ldlt!, :cholesky!)
-  @eval begin
-    function LinearAlgebra.$fun(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
-      n = LinearAlgebra.checksquare(A)
-      cudss_set(solver.matrix, A)
-      x = CudssMatrix(T, n)
-      b = CudssMatrix(T, n)
-      cudss("factorization", solver, x, b)
-      return solver
-    end
-  end
+"""
+    solver = cholesky!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint})
+
+Compute the LLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU, reusing the symbolic factorization stored in `solver`.
+The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.
+"""
+function LinearAlgebra.cholesky!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
+  n = checksquare(A)
+  cudss_set(solver, A)
+  x = CudssMatrix(T, n)
+  b = CudssMatrix(T, n)
+  cudss("factorization", solver, x, b)
+  return solver
 end
 
 for type in (:CuVector, :CuMatrix)
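
Taken together, this file's diff replaces the metaprogrammed `for fun in (:lu!, :ldlt!, :cholesky!)` loop with three explicit methods so that each can carry its own docstring. The workflow those docstrings describe, factorize once and refactorize cheaply, might look as follows; a minimal sketch, not part of this diff, assuming the successive matrices keep the sparsity pattern of `A_gpu`:

```julia
using CUDA, CUDA.CUSPARSE
using CUDSS
using LinearAlgebra
using SparseArrays

T = Float64
n = 100
A_cpu = sprand(T, n, n, 0.05) + I
A_gpu = CuSparseMatrixCSR(A_cpu)
b_gpu = CuVector(rand(T, n))
x_gpu = similar(b_gpu)

solver = lu(A_gpu)  # symbolic analysis + numerical factorization
ldiv!(x_gpu, solver, b_gpu)

# Later matrices share the sparsity pattern of A_gpu, so each lu!
# reuses the symbolic analysis already stored in `solver`.
D_gpu = Diagonal(CuVector(rand(T, n)))
for k in 1:5
    lu!(solver, A_gpu + k * D_gpu)
    ldiv!(x_gpu, solver, b_gpu)
end
```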