Skip to content

Commit

Permalink
Update CUDA
Browse files Browse the repository at this point in the history
  • Loading branch information
cfranken committed Apr 16, 2024
1 parent f7b8c99 commit b559db7
Show file tree
Hide file tree
Showing 13 changed files with 35 additions and 43 deletions.
18 changes: 5 additions & 13 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
name = "vSmartMOM"
uuid = "7ba11eeb-0a61-4a04-a413-bf612cc2007e"
authors = ["Rupesh Jeyaram <rjeyaram@caltech.edu> and contributors"]
version = "1.0.1"
authors = ["Christian Frankenberg <cfranken@caltech.edu>, Suniti Sanghavi ([email protected]), Rupesh Jeyaram and contributors"]
version = "1.0.2"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
CanopyOptics = "a18e34a6-5dbe-4f38-a44b-e5141852e7a7"
ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0"
Expand Down Expand Up @@ -45,25 +44,18 @@ WignerSymbols = "9f57e263-0b3d-5e2e-b1be-24f2bb48858b"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"

[compat]

CUDAKernels = "0.2, 0.3, 0.4"
CUDA = "3, 4"

CUDA = "4, 5"
DataInterpolations = "3.6, 4"
DelimitedFiles = "1"

DiffResults = "1.0"
Distributions = "0.23, 0.24, 0.25"
DocStringExtensions = "0.8, 0.9"
ForwardDiff = "0.10"
FastGaussQuadrature = "0.4, 0.5"

ForwardDiff = "0.10"
InstrumentOperator = "0.1"
Interpolations = "0.12, 0.13, 0.14"
JLD2 = "0.1, 0.2, 0.3, 0.4"
JSON = "0.21"


KernelAbstractions = "0.8, 0.9"
NCDatasets = "0.11, 0.12"
NNlib = "0.8, 0.9"
Expand All @@ -76,7 +68,7 @@ StaticArrays = "1.2"
StatsBase = "0.33, 0.34"
TimerOutputs = "0.5"
YAML = "0.4"
julia = "1.7, 1.8"
julia = "1.8,1.9,1.10"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
2 changes: 1 addition & 1 deletion src/Absorption/Absorption.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ using Interpolations # For interpolating in lookup tables and interpo
using JLD2 # For saving and loading the interpolator
using ProgressMeter # For showing progress, especially in creating interpolator
using KernelAbstractions # For heterogeneous (GPU+CPU) programming
using CUDAKernels # Access to CUDADevice
using CUDA.CUDAKernels # Access to CUDADevice
using CUDA # For GPU programming
using ForwardDiff, DiffResults # For auto-differentiation
using NetCDF # For loading NetCDF files with constants
Expand Down
2 changes: 1 addition & 1 deletion src/Absorption/compute_absorption_cross_section.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ function compute_absorption_cross_section(
# Run the event on the kernel
# That is, this function adds the contribution from this transition to each element in result
event = kernel!(result_view, array_type(architecture)(grid_view), ν, γ_d, γ_l, y, S, broadening, CEF, ndrange=length(grid_view))
wait(device, event)
#wait(device, event)
synchronize_if_gpu()
end
end
Expand Down
2 changes: 1 addition & 1 deletion src/Architectures.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ export
using CUDA

using KernelAbstractions
using CUDAKernels
using CUDA.CUDAKernels

"""
AbstractArchitecture
Expand Down
4 changes: 2 additions & 2 deletions src/CoreRT/CoreKernel/doubling.jl
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ function apply_D_matrix!(n_stokes::Int, r⁻⁺::AbstractArray{FT,3}, t⁺⁺::A
device = devi(architecture(r⁻⁺))
applyD_kernel! = apply_D!(device)
event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
return nothing
end
Expand All @@ -134,7 +134,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::AbstractArray{FT,3}) where
device = devi(architecture(J₀⁻))
applyD_kernel! = apply_D_SFI!(device)
event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
nothing
end
8 changes: 4 additions & 4 deletions src/CoreRT/CoreKernel/doubling_inelastic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ function apply_D_matrix_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_stokes
applyD_kernel_IE! = apply_D_IE_VS!(device)
event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀_all), n_stokes,
ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺));
wait(device, event);
#wait(device, event);
synchronize();
return nothing
end
Expand All @@ -414,7 +414,7 @@ function apply_D_matrix_IE!(RS_type::RRS, n_stokes::Int, ier⁻⁺::AbstractArra
applyD_kernel_IE! = apply_D_IE_RRS!(device)
event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀), n_stokes,
ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺));
wait(device, event);
#wait(device, event);
synchronize();
return nothing
end
Expand All @@ -440,7 +440,7 @@ function apply_D_matrix_SFI_IE!(RS_type::RRS, n_stokes::Int, ieJ₀⁻::Abstract
applyD_kernel_IE! = apply_D_SFI_IE_RRS!(device)
event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀),n_stokes,
ieJ₀⁻, ndrange=(size(ieJ₀⁻,1), size(ieJ₀⁻,3), size(ieJ₀⁻,4)));
wait(device, event);
#wait(device, event);
synchronize_if_gpu()
return nothing
end
Expand All @@ -453,7 +453,7 @@ function apply_D_matrix_SFI_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_st
applyD_kernel_IE! = apply_D_SFI_IE_VS!(device)
event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀_all), n_stokes,
ieJ₀⁻, ndrange=getKernelDimSFI(RS_type, ieJ₀⁻));
wait(device, event);
# wait(device, event);
synchronize_if_gpu()
return nothing
end
Expand Down
12 changes: 6 additions & 6 deletions src/CoreRT/CoreKernel/elemental.jl
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,13 @@ function elemental!(pol_type, SFI::Bool,
kernel! = get_elem_rt!(device)
event = kernel!(r⁻⁺, t⁺⁺, ϖ_λ, dτ_λ, Z⁻⁺, Z⁺⁺,
qp_μN, wct2, ndrange=size(r⁻⁺));
wait(device, event)
#wait(device, event)
synchronize_if_gpu()

if SFI
kernel! = get_elem_rt_SFI!(device)
event = kernel!(J₀⁺, J₀⁻, ϖ_λ, dτ_λ, τ_sum, Z⁻⁺, Z⁺⁺, qp_μN, ndoubl, wct02, pol_type.n, arr_type(pol_type.I₀), iμ₀, D, ndrange=size(J₀⁺))
wait(device, event)
#wait(device, event)
synchronize_if_gpu()
end
end
Expand Down Expand Up @@ -140,13 +140,13 @@ function elemental!(pol_type, SFI::Bool,
# with absorption in batch mode, low tau_scatt but higher tau_total, needs exact equations
kernel! = get_elem_rt!(device)
event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺));
wait(device, event)
#wait(device, event)
synchronize_if_gpu()

# SFI part
kernel! = get_elem_rt_SFI!(device)
event = kernel!(j₀⁺, j₀⁻, ϖ, dτ, arr_type(τ_sum), Z⁻⁺, Z⁺⁺, qp_μN, ndoubl, wct02, pol_type.n, I₀, iμ₀, D, ndrange=size(j₀⁺))
wait(device, event)
#wait(device, event)
synchronize_if_gpu()

# Apply D Matrix
Expand Down Expand Up @@ -288,7 +288,7 @@ function apply_D_matrix_elemental!(ndoubl::Int, n_stokes::Int, r⁻⁺::Abstract
device = devi(architecture(r⁻⁺))
applyD_kernel! = apply_D_elemental!(device)
event = applyD_kernel!(ndoubl,n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
return nothing
end
Expand All @@ -300,7 +300,7 @@ function apply_D_matrix_elemental_SFI!(ndoubl::Int, n_stokes::Int, J₀⁻::Abst
device = devi(architecture(J₀⁻))
applyD_kernel! = apply_D_elemental_SFI!(device)
event = applyD_kernel!(ndoubl,n_stokes, J₀⁻, ndrange=size(J₀⁻));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
return nothing
end
Expand Down
4 changes: 2 additions & 2 deletions src/CoreRT/CoreKernel/elemental_canopy.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ function elemental!(pol_type, SFI::Bool,
# with absorption in batch mode, low tau_scatt but higher tau_total, needs exact equations
kernel! = get_canopy_elem_rt!(device)
event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, G, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺));
wait(device, event)
#wait(device, event)
synchronize_if_gpu()
#@show G
# SFI part
kernel! = get_canopy_elem_rt_SFI!(device)
event = kernel!(j₀⁺, j₀⁻, ϖ, dτ, arr_type(τ_sum), G, Z⁻⁺, Z⁺⁺, qp_μN, ndoubl, wct02, pol_type.n, I₀, iμ₀, D, ndrange=size(j₀⁺))
wait(device, event)
#wait(device, event)
synchronize_if_gpu()

# Apply D Matrix
Expand Down
16 changes: 8 additions & 8 deletions src/CoreRT/CoreKernel/elemental_inelastic_plus.jl
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ function get_elem_rt!(RS_type::RRS_plus,
aType(Z⁺⁺_λ₁λ₀[:,:,bandSpecLim[iB]]),
qp_μN, wct2,
ndrange=getKernelDim(RS_type,ier⁻⁺[:,:,RS_type.bandSpecLim[iB],:]));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
end
end
Expand Down Expand Up @@ -215,7 +215,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀),
qp_μN, wct2,
ndrange=getKernelDim(RS_type,ier⁻⁺,i_λ₁λ₀));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();

t_ier⁻⁺ = similar(ier⁻⁺)
Expand All @@ -229,7 +229,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
aType(Z⁻⁺_λ₁λ₀_VS_n2), aType(Z⁺⁺_λ₁λ₀_VS_n2),
qp_μN, wct2,
ndrange=getKernelDim(RS_type,ier⁻⁺,i_λ₁λ₀_VS_n2));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();

ier⁻⁺ += t_ier⁻⁺
Expand All @@ -245,7 +245,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
aType(Z⁻⁺_λ₁λ₀_VS_o2), aType(Z⁺⁺_λ₁λ₀_VS_o2),
qp_μN, wct2,
ndrange=getKernelDim(RS_type,ier⁻⁺, i_λ₁λ₀_VS_o2));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
ier⁻⁺ += t_ier⁻⁺
iet⁺⁺ += t_iet⁺⁺
Expand Down Expand Up @@ -354,7 +354,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
qp_μN, ndoubl, wct02, nStokes,
I₀, iμ0, D,
ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀)); #change this
wait(device, event)
#wait(device, event)
synchronize_if_gpu();

t_ieJ₀⁺ = similar(ieJ₀⁻)
Expand All @@ -369,7 +369,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
qp_μN, ndoubl, wct02, nStokes,
I₀, iμ0, D,
ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀_VS_n2)); #change this
wait(device, event)
#wait(device, event)
synchronize_if_gpu();

ieJ₀⁺ += t_ieJ₀⁺
Expand All @@ -384,7 +384,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
qp_μN, ndoubl, wct02, nStokes,
I₀, iμ0, D,
ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀_VS_o2)); #change this
wait(device, event)
#wait(device, event)
synchronize_if_gpu();

ieJ₀⁺ += t_ieJ₀⁺
Expand Down Expand Up @@ -482,7 +482,7 @@ function get_elem_rt_SFI!(RS_type::RRS_plus,
qp_μN, ndoubl, wct02, nStokes,
I₀, iμ0, D,
ndrange=getKernelDimSFI(RS_type,ieJ₀⁻));
wait(device, event)
#wait(device, event)
synchronize_if_gpu();
end

Expand Down
2 changes: 1 addition & 1 deletion src/CoreRT/CoreKernel/raman_kernel_test.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
using KernelAbstractions
using CUDAKernels
using CUDA.CUDAKernels
using CUDA

nij = 14
Expand Down
2 changes: 1 addition & 1 deletion src/CoreRT/CoreRT.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ using ...Architectures # Use Architectures module
using CUDA # GPU CuArrays and functions
using KernelAbstractions # Abstracting code for CPU/GPU
using KernelAbstractions.Extras
using CUDAKernels
using CUDA.CUDAKernels

using Unitful # For parsing
using UnitfulEquivalences # For converting between wavenumber / wavelength
Expand Down
2 changes: 1 addition & 1 deletion src/CoreRT/Surfaces/rpv_surface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ function expandSurface!(Rsurf::AbstractArray{FT,2}, n_stokes::Int, v) where {FT}
device = devi(architecture(Rsurf))
applyExpansion_! = applyExpansion!(device)
event = applyExpansion_!(Rsurf, n_stokes, v, ndrange=size(v));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
return nothing
end
Expand Down
4 changes: 2 additions & 2 deletions test/gpu_tests/elemental_test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ kernel!(r⁻⁺_CPU, t⁺⁺_CPU, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w,0, ndrange=si

function test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w, 0, ndrange=size(r⁻⁺));
wait(device, event)
#wait(device, event)
synchronize();
end
@time test2(r⁻⁺_CPU, t⁺⁺_CPU, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
Expand All @@ -59,7 +59,7 @@ kernel! = get_r!(device)
kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w,0, ndrange=size(r⁻⁺));
function test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w, 0, ndrange=size(r⁻⁺));
wait(device, event)
#wait(device, event)
synchronize();
end
@time test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
Expand Down

0 comments on commit b559db7

Please sign in to comment.