From b559db77ed586f143444159a038668f56cff446d Mon Sep 17 00:00:00 2001 From: Christian Frankenberg Date: Tue, 16 Apr 2024 16:12:09 -0700 Subject: [PATCH] Update CUDA --- Project.toml | 18 +++++------------- src/Absorption/Absorption.jl | 2 +- .../compute_absorption_cross_section.jl | 2 +- src/Architectures.jl | 2 +- src/CoreRT/CoreKernel/doubling.jl | 4 ++-- src/CoreRT/CoreKernel/doubling_inelastic.jl | 8 ++++---- src/CoreRT/CoreKernel/elemental.jl | 12 ++++++------ src/CoreRT/CoreKernel/elemental_canopy.jl | 4 ++-- .../CoreKernel/elemental_inelastic_plus.jl | 16 ++++++++-------- src/CoreRT/CoreKernel/raman_kernel_test.jl | 2 +- src/CoreRT/CoreRT.jl | 2 +- src/CoreRT/Surfaces/rpv_surface.jl | 2 +- test/gpu_tests/elemental_test.jl | 4 ++-- 13 files changed, 35 insertions(+), 43 deletions(-) diff --git a/Project.toml b/Project.toml index dbc295dd..b8c35210 100644 --- a/Project.toml +++ b/Project.toml @@ -1,11 +1,10 @@ name = "vSmartMOM" uuid = "7ba11eeb-0a61-4a04-a413-bf612cc2007e" -authors = ["Rupesh Jeyaram and contributors"] -version = "1.0.1" +authors = ["Christian Frankenberg , Suniti Sanghavi (suniti.sanghavi@gmail.com), Rupesh Jeyaram and contributors"] +version = "1.0.2" [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" CanopyOptics = "a18e34a6-5dbe-4f38-a44b-e5141852e7a7" ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4" DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0" @@ -45,25 +44,18 @@ WignerSymbols = "9f57e263-0b3d-5e2e-b1be-24f2bb48858b" YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] - -CUDAKernels = "0.2, 0.3, 0.4" -CUDA = "3, 4" - +CUDA = "4, 5" DataInterpolations = "3.6, 4" DelimitedFiles = "1" - DiffResults = "1.0" Distributions = "0.23, 0.24, 0.25" DocStringExtensions = "0.8, 0.9" -ForwardDiff = "0.10" FastGaussQuadrature = "0.4, 0.5" - +ForwardDiff = "0.10" InstrumentOperator = "0.1" Interpolations = "0.12, 0.13, 0.14" JLD2 = "0.1, 0.2, 0.3, 0.4" JSON = "0.21" - - 
KernelAbstractions = "0.8, 0.9" NCDatasets = "0.11, 0.12" NNlib = "0.8, 0.9" @@ -76,7 +68,7 @@ StaticArrays = "1.2" StatsBase = "0.33, 0.34" TimerOutputs = "0.5" YAML = "0.4" -julia = "1.7, 1.8" +julia = "1.8,1.9,1.10" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/Absorption/Absorption.jl b/src/Absorption/Absorption.jl index e7642941..512cff29 100644 --- a/src/Absorption/Absorption.jl +++ b/src/Absorption/Absorption.jl @@ -15,7 +15,7 @@ using Interpolations # For interpolating in lookup tables and interpo using JLD2 # For saving and loading the interpolator using ProgressMeter # For showing progress, especially in creating interpolator using KernelAbstractions # For heterogeneous (GPU+CPU) programming -using CUDAKernels # Access to CUDADevice +using CUDA.CUDAKernels # Access to CUDADevice using CUDA # For GPU programming using ForwardDiff, DiffResults # For auto-differentiation using NetCDF # For loading NetCDF files with constants diff --git a/src/Absorption/compute_absorption_cross_section.jl b/src/Absorption/compute_absorption_cross_section.jl index c9c5b9a8..f316ca70 100644 --- a/src/Absorption/compute_absorption_cross_section.jl +++ b/src/Absorption/compute_absorption_cross_section.jl @@ -120,7 +120,7 @@ function compute_absorption_cross_section( # Run the event on the kernel # That this, this function adds to each element in result, the contribution from this transition event = kernel!(result_view, array_type(architecture)(grid_view), ν, γ_d, γ_l, y, S, broadening, CEF, ndrange=length(grid_view)) - wait(device, event) + #wait(device, event) synchronize_if_gpu() end end diff --git a/src/Architectures.jl b/src/Architectures.jl index 90685f04..55b50d3d 100644 --- a/src/Architectures.jl +++ b/src/Architectures.jl @@ -12,7 +12,7 @@ export using CUDA using KernelAbstractions -using CUDAKernels +using CUDA.CUDAKernels """ AbstractArchitecture diff --git a/src/CoreRT/CoreKernel/doubling.jl b/src/CoreRT/CoreKernel/doubling.jl index 
2a51dd7f..1294e3ab 100644 --- a/src/CoreRT/CoreKernel/doubling.jl +++ b/src/CoreRT/CoreKernel/doubling.jl @@ -122,7 +122,7 @@ function apply_D_matrix!(n_stokes::Int, r⁻⁺::AbstractArray{FT,3}, t⁺⁺::A device = devi(architecture(r⁻⁺)) applyD_kernel! = apply_D!(device) event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); return nothing end @@ -134,7 +134,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::AbstractArray{FT,3}) where device = devi(architecture(J₀⁻)) applyD_kernel! = apply_D_SFI!(device) event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); nothing end diff --git a/src/CoreRT/CoreKernel/doubling_inelastic.jl b/src/CoreRT/CoreKernel/doubling_inelastic.jl index fc1ae07c..a2f7826d 100644 --- a/src/CoreRT/CoreKernel/doubling_inelastic.jl +++ b/src/CoreRT/CoreKernel/doubling_inelastic.jl @@ -397,7 +397,7 @@ function apply_D_matrix_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_stokes applyD_kernel_IE! = apply_D_IE_VS!(device) event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀_all), n_stokes, ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺)); - wait(device, event); + #wait(device, event); synchronize(); return nothing end @@ -414,7 +414,7 @@ function apply_D_matrix_IE!(RS_type::RRS, n_stokes::Int, ier⁻⁺::AbstractArra applyD_kernel_IE! = apply_D_IE_RRS!(device) event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀), n_stokes, ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺)); - wait(device, event); + #wait(device, event); synchronize(); return nothing end @@ -440,7 +440,7 @@ function apply_D_matrix_SFI_IE!(RS_type::RRS, n_stokes::Int, ieJ₀⁻::Abstract applyD_kernel_IE! 
= apply_D_SFI_IE_RRS!(device) event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀),n_stokes, ieJ₀⁻, ndrange=(size(ieJ₀⁻,1), size(ieJ₀⁻,3), size(ieJ₀⁻,4))); - wait(device, event); + #wait(device, event); synchronize_if_gpu() return nothing end @@ -453,7 +453,7 @@ function apply_D_matrix_SFI_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_st applyD_kernel_IE! = apply_D_SFI_IE_VS!(device) event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀_all), n_stokes, ieJ₀⁻, ndrange=getKernelDimSFI(RS_type, ieJ₀⁻)); - wait(device, event); + # wait(device, event); synchronize_if_gpu() return nothing end diff --git a/src/CoreRT/CoreKernel/elemental.jl b/src/CoreRT/CoreKernel/elemental.jl index be88caa9..d0548eb8 100644 --- a/src/CoreRT/CoreKernel/elemental.jl +++ b/src/CoreRT/CoreKernel/elemental.jl @@ -80,13 +80,13 @@ function elemental!(pol_type, SFI::Bool, kernel! = get_elem_rt!(device) event = kernel!(r⁻⁺, t⁺⁺, ϖ_λ, dτ_λ, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺)); - wait(device, event) + #wait(device, event) synchronize_if_gpu() if SFI kernel! = get_elem_rt_SFI!(device) event = kernel!(J₀⁺, J₀⁻, ϖ_λ, dτ_λ, τ_sum, Z⁻⁺, Z⁺⁺, qp_μN, ndoubl, wct02, pol_type.n, arr_type(pol_type.I₀), iμ₀, D, ndrange=size(J₀⁺)) - wait(device, event) + #wait(device, event) synchronize_if_gpu() end end @@ -140,13 +140,13 @@ function elemental!(pol_type, SFI::Bool, # with absorption in batch mode, low tau_scatt but higher tau_total, needs exact equations kernel! = get_elem_rt!(device) event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺)); - wait(device, event) + #wait(device, event) synchronize_if_gpu() # SFI part kernel! 
= get_elem_rt_SFI!(device) event = kernel!(j₀⁺, j₀⁻, ϖ, dτ, arr_type(τ_sum), Z⁻⁺, Z⁺⁺, qp_μN, ndoubl, wct02, pol_type.n, I₀, iμ₀, D, ndrange=size(j₀⁺)) - wait(device, event) + #wait(device, event) synchronize_if_gpu() # Apply D Matrix @@ -288,7 +288,7 @@ function apply_D_matrix_elemental!(ndoubl::Int, n_stokes::Int, r⁻⁺::Abstract device = devi(architecture(r⁻⁺)) applyD_kernel! = apply_D_elemental!(device) event = applyD_kernel!(ndoubl,n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); return nothing end @@ -300,7 +300,7 @@ function apply_D_matrix_elemental_SFI!(ndoubl::Int, n_stokes::Int, J₀⁻::Abst device = devi(architecture(J₀⁻)) applyD_kernel! = apply_D_elemental_SFI!(device) event = applyD_kernel!(ndoubl,n_stokes, J₀⁻, ndrange=size(J₀⁻)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); return nothing end diff --git a/src/CoreRT/CoreKernel/elemental_canopy.jl b/src/CoreRT/CoreKernel/elemental_canopy.jl index fe1ab03d..b58c188c 100644 --- a/src/CoreRT/CoreKernel/elemental_canopy.jl +++ b/src/CoreRT/CoreKernel/elemental_canopy.jl @@ -38,13 +38,13 @@ function elemental!(pol_type, SFI::Bool, # with absorption in batch mode, low tau_scatt but higher tau_total, needs exact equations kernel! = get_canopy_elem_rt!(device) event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, G, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺)); - wait(device, event) + #wait(device, event) synchronize_if_gpu() #@show G # SFI part kernel! 
= get_canopy_elem_rt_SFI!(device) event = kernel!(j₀⁺, j₀⁻, ϖ, dτ, arr_type(τ_sum), G, Z⁻⁺, Z⁺⁺, qp_μN, ndoubl, wct02, pol_type.n, I₀, iμ₀, D, ndrange=size(j₀⁺)) - wait(device, event) + #wait(device, event) synchronize_if_gpu() # Apply D Matrix diff --git a/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl b/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl index 73ba0579..0b5e4f38 100644 --- a/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl +++ b/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl @@ -186,7 +186,7 @@ function get_elem_rt!(RS_type::RRS_plus, aType(Z⁺⁺_λ₁λ₀[:,:,bandSpecLim[iB]]), qp_μN, wct2, ndrange=getKernelDim(RS_type,ier⁻⁺[:,:,RS_type.bandSpecLim[iB],:])); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); end end @@ -215,7 +215,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀), qp_μN, wct2, ndrange=getKernelDim(RS_type,ier⁻⁺,i_λ₁λ₀)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); t_ier⁻⁺ = similar(ier⁻⁺) @@ -229,7 +229,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, aType(Z⁻⁺_λ₁λ₀_VS_n2), aType(Z⁺⁺_λ₁λ₀_VS_n2), qp_μN, wct2, ndrange=getKernelDim(RS_type,ier⁻⁺,i_λ₁λ₀_VS_n2)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); ier⁻⁺ += t_ier⁻⁺ @@ -245,7 +245,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, aType(Z⁻⁺_λ₁λ₀_VS_o2), aType(Z⁺⁺_λ₁λ₀_VS_o2), qp_μN, wct2, ndrange=getKernelDim(RS_type,ier⁻⁺, i_λ₁λ₀_VS_o2)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); ier⁻⁺ += t_ier⁻⁺ iet⁺⁺ += t_iet⁺⁺ @@ -354,7 +354,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, qp_μN, ndoubl, wct02, nStokes, I₀, iμ0, D, ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀)); #change this - wait(device, event) + #wait(device, event) synchronize_if_gpu(); t_ieJ₀⁺ = similar(ieJ₀⁻) @@ -369,7 +369,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, 
qp_μN, ndoubl, wct02, nStokes, I₀, iμ0, D, ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀_VS_n2)); #change this - wait(device, event) + #wait(device, event) synchronize_if_gpu(); ieJ₀⁺ += t_ieJ₀⁺ @@ -384,7 +384,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, qp_μN, ndoubl, wct02, nStokes, I₀, iμ0, D, ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀_VS_o2)); #change this - wait(device, event) + #wait(device, event) synchronize_if_gpu(); ieJ₀⁺ += t_ieJ₀⁺ @@ -482,7 +482,7 @@ function get_elem_rt_SFI!(RS_type::RRS_plus, qp_μN, ndoubl, wct02, nStokes, I₀, iμ0, D, ndrange=getKernelDimSFI(RS_type,ieJ₀⁻)); - wait(device, event) + #wait(device, event) synchronize_if_gpu(); end diff --git a/src/CoreRT/CoreKernel/raman_kernel_test.jl b/src/CoreRT/CoreKernel/raman_kernel_test.jl index 4fe34b8a..3fc22ed3 100644 --- a/src/CoreRT/CoreKernel/raman_kernel_test.jl +++ b/src/CoreRT/CoreKernel/raman_kernel_test.jl @@ -1,5 +1,5 @@ using KernelAbstractions -using CUDAKernels +using CUDA.CUDAKernels using CUDA nij = 14 diff --git a/src/CoreRT/CoreRT.jl b/src/CoreRT/CoreRT.jl index 1734b49d..19b7decc 100644 --- a/src/CoreRT/CoreRT.jl +++ b/src/CoreRT/CoreRT.jl @@ -24,7 +24,7 @@ using ...Architectures # Use Architectures module using CUDA # GPU CuArrays and functions using KernelAbstractions # Abstracting code for CPU/GPU using KernelAbstractions.Extras -using CUDAKernels +using CUDA.CUDAKernels using Unitful # For parsing using UnitfulEquivalences # For converting between wavenumber / wavelength diff --git a/src/CoreRT/Surfaces/rpv_surface.jl b/src/CoreRT/Surfaces/rpv_surface.jl index 55638f9b..6b3ede58 100644 --- a/src/CoreRT/Surfaces/rpv_surface.jl +++ b/src/CoreRT/Surfaces/rpv_surface.jl @@ -148,7 +148,7 @@ function expandSurface!(Rsurf::AbstractArray{FT,2}, n_stokes::Int, v) where {FT} device = devi(architecture(Rsurf)) applyExpansion_! 
= applyExpansion!(device) event = applyExpansion_!(Rsurf, n_stokes, v, ndrange=size(v)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); return nothing end diff --git a/test/gpu_tests/elemental_test.jl b/test/gpu_tests/elemental_test.jl index c0f5b7fa..6f9f188c 100644 --- a/test/gpu_tests/elemental_test.jl +++ b/test/gpu_tests/elemental_test.jl @@ -43,7 +43,7 @@ kernel!(r⁻⁺_CPU, t⁺⁺_CPU, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w,0, ndrange=si function test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w) event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w, 0, ndrange=size(r⁻⁺)); - wait(device, event) + #wait(device, event) synchronize(); end @time test2(r⁻⁺_CPU, t⁺⁺_CPU, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w) @@ -59,7 +59,7 @@ kernel! = get_r!(device) kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w,0, ndrange=size(r⁻⁺)); function test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w) event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w, 0, ndrange=size(r⁻⁺)); - wait(device, event) + #wait(device, event) synchronize(); end @time test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)