Update the documentation for the generic API

amontoison committed Mar 28, 2024
1 parent 6fc918e commit d7d020f
Showing 7 changed files with 178 additions and 69 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
Manifest.toml
/LocalPreferences.toml
docs/build
9 changes: 9 additions & 0 deletions README.md
@@ -27,6 +27,15 @@ pkg> add CUDSS
pkg> test CUDSS
```

## Content

CUDSS.jl provides a structured interface for using NVIDIA cuDSS from Julia.
It introduces the `CudssSolver` type along with three core routines: `cudss`, `cudss_set`, and `cudss_get`.
In addition, specialized methods for the `CuSparseMatrixCSR` type are provided for `cholesky`, `ldlt`, `lu` and `\`.
The in-place variants `cholesky!`, `ldlt!`, `lu!` and `ldiv!` improve performance by reusing the symbolic factorization as well as the allocated storage, so that sparse linear systems can be solved efficiently on GPUs.
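
For readers new to these routines, here is a minimal sketch of how they fit together for a general square system. It is an illustration rather than an excerpt from the package documentation: the arguments `"G"` (general matrix) and `'F'` (full matrix stored) of `CudssSolver`, and the phase names passed to `cudss`, follow the patterns used in the examples and source code below.

```julia
using CUDA, CUDA.CUSPARSE
using CUDSS
using LinearAlgebra, SparseArrays

T = Float64
n = 100
A_gpu = CuSparseMatrixCSR(sprand(T, n, n, 0.05) + I)
x_gpu = CUDA.zeros(T, n)
b_gpu = CuVector(rand(T, n))

solver = CudssSolver(A_gpu, "G", 'F')  # "G" = general matrix, 'F' = full matrix stored

cudss("analysis", solver, x_gpu, b_gpu)       # symbolic factorization
cudss("factorization", solver, x_gpu, b_gpu)  # numeric factorization
cudss("solve", solver, x_gpu, b_gpu)          # x_gpu now holds the solution

cudss_set(solver, A_gpu)                      # refresh the values of A (same sparsity pattern)
cudss("factorization", solver, x_gpu, b_gpu)  # refactorize without a new analysis
```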

## Examples

### Example 1: Sparse unsymmetric linear system with one right-hand side
2 changes: 2 additions & 0 deletions docs/Project.toml
@@ -1,6 +1,8 @@
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[compat]
Documenter = "1.0"
2 changes: 2 additions & 0 deletions docs/make.jl
@@ -1,4 +1,6 @@
using Documenter, CUDSS
using LinearAlgebra
using CUDA, CUDA.CUSPARSE

makedocs(
modules = [CUDSS],
121 changes: 67 additions & 54 deletions docs/src/generic.md
@@ -1,41 +1,61 @@
# Examples
## LLᵀ and LLᴴ

## Example 1: Sparse unsymmetric linear system with one right-hand side
```@docs
LinearAlgebra.cholesky(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: LinearAlgebra.BlasFloat
LinearAlgebra.cholesky!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat
```

```julia
using CUDA, CUDA.CUSPARSE
using CUDSS
using LinearAlgebra
using SparseArrays

T = Float64
T = ComplexF64
R = real(T)
n = 100
A_cpu = sprand(T, n, n, 0.05) + I
b_cpu = rand(T, n)
p = 5
A_cpu = sprand(T, n, n, 0.01)
A_cpu = A_cpu * A_cpu' + I
B_cpu = rand(T, n, p)

A_gpu = CuSparseMatrixCSR(A_cpu)
b_gpu = CuVector(b_cpu)
A_gpu = CuSparseMatrixCSR(A_cpu |> triu)
B_gpu = CuMatrix(B_cpu)
X_gpu = similar(B_gpu)

F = lu(A_gpu)
x_gpu = F \ b_gpu
F = cholesky(A_gpu, view='U')
X_gpu = F \ B_gpu

r_gpu = b_gpu - A_gpu * x_gpu
norm(r_gpu)
R_gpu = B_gpu - CuSparseMatrixCSR(A_cpu) * X_gpu
norm(R_gpu)

# In-place LU
d_gpu = rand(T, n) |> CuVector
B_gpu = A_gpu + Diagonal(d_gpu)
lu!(F, B_gpu)
# In-place LLᴴ
d_gpu = rand(R, n) |> CuVector
A_gpu = A_gpu + Diagonal(d_gpu)
cholesky!(F, A_gpu)

c_cpu = rand(T, n)
c_gpu = CuVector(c_cpu)
ldiv!(x_gpu, F, c_gpu)
C_cpu = rand(T, n, p)
C_gpu = CuMatrix(C_cpu)
ldiv!(X_gpu, F, C_gpu)

r_gpu = c_gpu - B_gpu * x_gpu
norm(r_gpu)
R_gpu = C_gpu - ( CuSparseMatrixCSR(A_cpu) + Diagonal(d_gpu) ) * X_gpu
norm(R_gpu)
```

## Example 2: Sparse symmetric linear system with multiple right-hand sides
!!! note
    If only one triangle of `A_gpu` is stored, the wrappers `Symmetric` and `Hermitian` can be used instead of the keyword argument `view` of `cholesky`. For real matrices both wrappers are allowed, but only `Hermitian` can be used for complex matrices.

```julia
H_gpu = Hermitian(A_gpu, :U)
F = cholesky(H_gpu)
```

## LDLᵀ and LDLᴴ

```@docs
LinearAlgebra.ldlt(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: LinearAlgebra.BlasFloat
LinearAlgebra.ldlt!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat
```

```julia
using CUDA, CUDA.CUSPARSE
@@ -63,8 +83,8 @@ norm(R_gpu)

# In-place LDLᵀ
d_gpu = rand(R, n) |> CuVector
B_gpu = A_gpu + Diagonal(d_gpu)
ldlt!(F, B_gpu)
A_gpu = A_gpu + Diagonal(d_gpu)
ldlt!(F, A_gpu)

C_cpu = rand(T, n, p)
C_gpu = CuMatrix(C_cpu)
@@ -82,49 +102,42 @@ S_gpu = Symmetric(A_gpu, :L)
F = ldlt(S_gpu)
```

## Example 3: Sparse hermitian positive definite linear system with multiple right-hand sides
## LU

```@docs
LinearAlgebra.lu(A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat
LinearAlgebra.lu!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: LinearAlgebra.BlasFloat
```

```julia
using CUDA, CUDA.CUSPARSE
using CUDSS
using LinearAlgebra
using SparseArrays

T = ComplexF64
R = real(T)
T = Float64
n = 100
p = 5
A_cpu = sprand(T, n, n, 0.01)
A_cpu = A_cpu * A_cpu' + I
B_cpu = rand(T, n, p)

A_gpu = CuSparseMatrixCSR(A_cpu |> triu)
B_gpu = CuMatrix(B_cpu)
X_gpu = similar(B_gpu)

F = cholesky(A_gpu, view='U')
X_gpu = F \ B_gpu
A_cpu = sprand(T, n, n, 0.05) + I
b_cpu = rand(T, n)

R_gpu = B_gpu - CuSparseMatrixCSR(A_cpu) * X_gpu
norm(R_gpu)
A_gpu = CuSparseMatrixCSR(A_cpu)
b_gpu = CuVector(b_cpu)

# In-place LLᴴ
d_gpu = rand(R, n) |> CuVector
B_gpu = A_gpu + Diagonal(d_gpu)
cholesky!(F, B_gpu)
F = lu(A_gpu)
x_gpu = F \ b_gpu

C_cpu = rand(T, n, p)
C_gpu = CuMatrix(C_cpu)
ldiv!(X_gpu, F, C_gpu)
r_gpu = b_gpu - A_gpu * x_gpu
norm(r_gpu)

R_gpu = C_gpu - ( CuSparseMatrixCSR(A_cpu) + Diagonal(d_gpu) ) * X_gpu
norm(R_gpu)
```
# In-place LU
d_gpu = rand(T, n) |> CuVector
A_gpu = A_gpu + Diagonal(d_gpu)
lu!(F, A_gpu)

!!! note
If we only store one triangle of `A_gpu`, we can also use the wrappers `Symmetric` and `Hermitian` instead of using the keyword argument `view` in `cholesky`. For real matrices, both wrappers are allowed but only `Hermitian` can be used for complex matrices.
c_cpu = rand(T, n)
c_gpu = CuVector(c_cpu)
ldiv!(x_gpu, F, c_gpu)

```julia
H_gpu = Hermitian(A_gpu, :U)
F = cholesky(H_gpu)
r_gpu = c_gpu - A_gpu * x_gpu
norm(r_gpu)
```
2 changes: 1 addition & 1 deletion src/CUDSS.jl
@@ -14,7 +14,7 @@ else
end

import CUDA: @checked, libraryPropertyType, cudaDataType, initialize_context, retry_reclaim, CUstream
import LinearAlgebra: lu, lu!, ldlt, ldlt!, cholesky, cholesky!, ldiv!, BlasFloat, BlasReal
import LinearAlgebra: lu, lu!, ldlt, ldlt!, cholesky, cholesky!, ldiv!, BlasFloat, BlasReal, checksquare
import Base: \

include("libcudss.jl")
110 changes: 96 additions & 14 deletions src/generic.jl
@@ -1,5 +1,19 @@
"""
    solver = lu(A::CuSparseMatrixCSR{T,Cint})

Compute the LU factorization of a sparse matrix `A` on an NVIDIA GPU.
The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.

#### Input argument

* `A`: a sparse square matrix stored in the `CuSparseMatrixCSR` format.

#### Output argument

* `solver`: an opaque structure [`CudssSolver`](@ref) that stores the factors of the LU decomposition.
"""
function LinearAlgebra.lu(A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
n = LinearAlgebra.checksquare(A)
n = checksquare(A)
solver = CudssSolver(A, "G", 'F')
x = CudssMatrix(T, n)
b = CudssMatrix(T, n)
@@ -8,8 +22,41 @@ function LinearAlgebra.lu(A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
return solver
end

"""
    solver = lu!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint})

Compute the LU factorization of a sparse matrix `A` on an NVIDIA GPU, reusing the symbolic factorization stored in `solver`.
The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.
"""
function LinearAlgebra.lu!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
n = checksquare(A)
cudss_set(solver, A)
x = CudssMatrix(T, n)
b = CudssMatrix(T, n)
cudss("factorization", solver, x, b)
return solver
end
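
# Usage sketch (mirroring the documented examples): factorize once with `lu`,
# update the values of `A` while keeping its sparsity pattern, then
# refactorize in place and solve with the updated factors.
#   solver = lu(A_gpu)
#   lu!(solver, A_gpu)
#   ldiv!(x_gpu, solver, b_gpu)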

"""
    solver = ldlt(A::CuSparseMatrixCSR{T,Cint}; view::Char='F')

Compute the LDLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU.
The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.

#### Input argument

* `A`: a sparse Hermitian matrix stored in the `CuSparseMatrixCSR` format.

#### Keyword argument

* `view`: a character that specifies which triangle of the sparse matrix is provided. Possible options are `'L'` for the lower triangle, `'U'` for the upper triangle, and `'F'` for the full matrix.

#### Output argument

* `solver`: an opaque structure [`CudssSolver`](@ref) that stores the factors of the LDLᴴ decomposition.
"""
function LinearAlgebra.ldlt(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: BlasFloat
n = LinearAlgebra.checksquare(A)
n = checksquare(A)
structure = T <: Real ? "S" : "H"
solver = CudssSolver(A, structure, view)
(T <: Complex) && cudss_set(solver, "pivot_type", 'N')
@@ -23,8 +70,41 @@ end
LinearAlgebra.ldlt(A::Symmetric{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasReal = LinearAlgebra.ldlt(A.data, view=A.uplo)
LinearAlgebra.ldlt(A::Hermitian{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasFloat = LinearAlgebra.ldlt(A.data, view=A.uplo)

"""
    solver = ldlt!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint})

Compute the LDLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU, reusing the symbolic factorization stored in `solver`.
The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.
"""
function LinearAlgebra.ldlt!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
n = checksquare(A)
cudss_set(solver, A)
x = CudssMatrix(T, n)
b = CudssMatrix(T, n)
cudss("factorization", solver, x, b)
return solver
end

"""
    solver = cholesky(A::CuSparseMatrixCSR{T,Cint}; view::Char='F')

Compute the LLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU.
The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.

#### Input argument

* `A`: a sparse Hermitian positive definite matrix stored in the `CuSparseMatrixCSR` format.

#### Keyword argument

* `view`: a character that specifies which triangle of the sparse matrix is provided. Possible options are `'L'` for the lower triangle, `'U'` for the upper triangle, and `'F'` for the full matrix.

#### Output argument

* `solver`: an opaque structure [`CudssSolver`](@ref) that stores the factors of the LLᴴ decomposition.
"""
function LinearAlgebra.cholesky(A::CuSparseMatrixCSR{T,Cint}; view::Char='F') where T <: BlasFloat
n = LinearAlgebra.checksquare(A)
n = checksquare(A)
structure = T <: Real ? "SPD" : "HPD"
solver = CudssSolver(A, structure, view)
x = CudssMatrix(T, n)
@@ -37,17 +117,19 @@ end
LinearAlgebra.cholesky(A::Symmetric{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasReal = LinearAlgebra.cholesky(A.data, view=A.uplo)
LinearAlgebra.cholesky(A::Hermitian{T,<:CuSparseMatrixCSR{T,Cint}}) where T <: BlasFloat = LinearAlgebra.cholesky(A.data, view=A.uplo)

for fun in (:lu!, :ldlt!, :cholesky!)
@eval begin
function LinearAlgebra.$fun(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
n = LinearAlgebra.checksquare(A)
cudss_set(solver.matrix, A)
x = CudssMatrix(T, n)
b = CudssMatrix(T, n)
cudss("factorization", solver, x, b)
return solver
end
end
"""
    solver = cholesky!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint})

Compute the LLᴴ factorization of a sparse matrix `A` on an NVIDIA GPU, reusing the symbolic factorization stored in `solver`.
The type `T` can be `Float32`, `Float64`, `ComplexF32` or `ComplexF64`.
"""
function LinearAlgebra.cholesky!(solver::CudssSolver{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
n = checksquare(A)
cudss_set(solver, A)
x = CudssMatrix(T, n)
b = CudssMatrix(T, n)
cudss("factorization", solver, x, b)
return solver
end

for type in (:CuVector, :CuMatrix)
