Update CUDA

RemoteSensingTools · Apr 16, 2024 · b559db7 · b559db7
1 parent f7b8c99
commit b559db7
Show file tree

Hide file tree

Showing 13 changed files with 35 additions and 43 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,11 +1,10 @@
 name = "vSmartMOM"
 uuid = "7ba11eeb-0a61-4a04-a413-bf612cc2007e"
-authors = ["Rupesh Jeyaram <rjeyaram@caltech.edu> and contributors"]
-version = "1.0.1"
+authors = ["Christian Frankenberg <cfranken@caltech.edu>, Suniti Sanghavi ([email protected]), Rupesh Jeyaram and contributors"]
+version = "1.0.2"
 
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 CanopyOptics = "a18e34a6-5dbe-4f38-a44b-e5141852e7a7"
 ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
 DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0"
@@ -45,25 +44,18 @@ WignerSymbols = "9f57e263-0b3d-5e2e-b1be-24f2bb48858b"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
 
 [compat]
-
-CUDAKernels = "0.2, 0.3, 0.4"
-CUDA = "3, 4"
-
+CUDA = "4, 5"
 DataInterpolations = "3.6, 4"
 DelimitedFiles = "1"
-
 DiffResults = "1.0"
 Distributions = "0.23, 0.24, 0.25"
 DocStringExtensions = "0.8, 0.9"
-ForwardDiff = "0.10"
 FastGaussQuadrature = "0.4, 0.5"
-
+ForwardDiff = "0.10"
 InstrumentOperator = "0.1"
 Interpolations = "0.12, 0.13, 0.14"
 JLD2 = "0.1, 0.2, 0.3, 0.4"
 JSON = "0.21"
-
-
 KernelAbstractions = "0.8, 0.9"
 NCDatasets = "0.11, 0.12"
 NNlib = "0.8, 0.9"
@@ -76,7 +68,7 @@ StaticArrays = "1.2"
 StatsBase = "0.33, 0.34"
 TimerOutputs = "0.5"
 YAML = "0.4"
-julia = "1.7, 1.8"
+julia = "1.8,1.9,1.10"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

diff --git a/src/Absorption/Absorption.jl b/src/Absorption/Absorption.jl
@@ -15,7 +15,7 @@ using Interpolations            # For interpolating in lookup tables and interpo
 using JLD2                      # For saving and loading the interpolator
 using ProgressMeter             # For showing progress, especially in creating interpolator
 using KernelAbstractions        # For heterogeneous (GPU+CPU) programming
-using CUDAKernels               # Access to CUDADevice
+using CUDA.CUDAKernels               # Access to CUDADevice
 using CUDA                      # For GPU programming
 using ForwardDiff, DiffResults  # For auto-differentiation
 using NetCDF                    # For loading NetCDF files with constants

diff --git a/src/Absorption/compute_absorption_cross_section.jl b/src/Absorption/compute_absorption_cross_section.jl
@@ -120,7 +120,7 @@ function compute_absorption_cross_section(
             # Run the event on the kernel 
             # That is, this function adds to each element in result the contribution from this transition
             event = kernel!(result_view, array_type(architecture)(grid_view), ν, γ_d, γ_l, y, S, broadening, CEF, ndrange=length(grid_view))
-            wait(device, event)
+            #wait(device, event)
             synchronize_if_gpu()
         end
     end

diff --git a/src/Architectures.jl b/src/Architectures.jl
@@ -12,7 +12,7 @@ export
 using CUDA
 
 using KernelAbstractions
-using CUDAKernels
+using CUDA.CUDAKernels
 
 """
     AbstractArchitecture

diff --git a/src/CoreRT/CoreKernel/doubling.jl b/src/CoreRT/CoreKernel/doubling.jl
@@ -122,7 +122,7 @@ function apply_D_matrix!(n_stokes::Int, r⁻⁺::AbstractArray{FT,3}, t⁺⁺::A
         device = devi(architecture(r⁻⁺))
         applyD_kernel! = apply_D!(device)
         event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
-        wait(device, event);
+        #wait(device, event);
         synchronize_if_gpu();
         return nothing
     end
@@ -134,7 +134,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::AbstractArray{FT,3}) where
     device = devi(architecture(J₀⁻))
     applyD_kernel! = apply_D_SFI!(device)
     event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     nothing
 end
diff --git a/src/CoreRT/CoreKernel/doubling_inelastic.jl b/src/CoreRT/CoreKernel/doubling_inelastic.jl
@@ -397,7 +397,7 @@ function apply_D_matrix_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_stokes
         applyD_kernel_IE! = apply_D_IE_VS!(device)
         event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀_all), n_stokes, 
             ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺));
-        wait(device, event);
+        #wait(device, event);
         synchronize();
         return nothing
     end
@@ -414,7 +414,7 @@ function apply_D_matrix_IE!(RS_type::RRS, n_stokes::Int, ier⁻⁺::AbstractArra
         applyD_kernel_IE! = apply_D_IE_RRS!(device)
         event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀), n_stokes, 
             ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺));
-        wait(device, event);
+        #wait(device, event);
         synchronize();
         return nothing
     end
@@ -440,7 +440,7 @@ function apply_D_matrix_SFI_IE!(RS_type::RRS, n_stokes::Int, ieJ₀⁻::Abstract
     applyD_kernel_IE! = apply_D_SFI_IE_RRS!(device)
     event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀),n_stokes, 
                     ieJ₀⁻, ndrange=(size(ieJ₀⁻,1), size(ieJ₀⁻,3), size(ieJ₀⁻,4)));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu()
     return nothing
 end
@@ -453,7 +453,7 @@ function apply_D_matrix_SFI_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_st
     applyD_kernel_IE! = apply_D_SFI_IE_VS!(device)
     event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀_all), n_stokes, 
                     ieJ₀⁻, ndrange=getKernelDimSFI(RS_type, ieJ₀⁻));
-    wait(device, event);
+   # wait(device, event);
     synchronize_if_gpu()
     return nothing
 end

diff --git a/src/CoreRT/CoreKernel/elemental.jl b/src/CoreRT/CoreKernel/elemental.jl
@@ -80,13 +80,13 @@ function elemental!(pol_type, SFI::Bool,
             kernel! = get_elem_rt!(device)
             event = kernel!(r⁻⁺, t⁺⁺, ϖ_λ, dτ_λ, Z⁻⁺, Z⁺⁺, 
                 qp_μN, wct2, ndrange=size(r⁻⁺)); 
-            wait(device, event)
+            #wait(device, event)
             synchronize_if_gpu()
 
             if SFI
                 kernel! = get_elem_rt_SFI!(device)
                 event = kernel!(J₀⁺, J₀⁻, ϖ_λ, dτ_λ, τ_sum, Z⁻⁺, Z⁺⁺, qp_μN, ndoubl, wct02, pol_type.n, arr_type(pol_type.I₀), iμ₀, D, ndrange=size(J₀⁺))
-                wait(device, event)
+                #wait(device, event)
                 synchronize_if_gpu()
             end
         end
@@ -140,13 +140,13 @@ function elemental!(pol_type, SFI::Bool,
         # with absorption in batch mode, low tau_scatt but higher tau_total, needs exact equations
         kernel! = get_elem_rt!(device)
         event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺)); 
-        wait(device, event)
+        #wait(device, event)
         synchronize_if_gpu()
 
         # SFI part
         kernel! = get_elem_rt_SFI!(device)
         event = kernel!(j₀⁺, j₀⁻, ϖ, dτ, arr_type(τ_sum), Z⁻⁺, Z⁺⁺, qp_μN, ndoubl, wct02, pol_type.n, I₀, iμ₀, D, ndrange=size(j₀⁺))
-        wait(device, event)
+        #wait(device, event)
         synchronize_if_gpu()
 
         # Apply D Matrix
@@ -288,7 +288,7 @@ function apply_D_matrix_elemental!(ndoubl::Int, n_stokes::Int, r⁻⁺::Abstract
     device = devi(architecture(r⁻⁺))
     applyD_kernel! = apply_D_elemental!(device)
     event = applyD_kernel!(ndoubl,n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     return nothing
 end
@@ -300,7 +300,7 @@ function apply_D_matrix_elemental_SFI!(ndoubl::Int, n_stokes::Int, J₀⁻::Abst
         device = devi(architecture(J₀⁻))
         applyD_kernel! = apply_D_elemental_SFI!(device)
         event = applyD_kernel!(ndoubl,n_stokes, J₀⁻, ndrange=size(J₀⁻));
-        wait(device, event);
+        #wait(device, event);
         synchronize_if_gpu();
         return nothing
     end

diff --git a/src/CoreRT/CoreKernel/elemental_canopy.jl b/src/CoreRT/CoreKernel/elemental_canopy.jl
@@ -38,13 +38,13 @@ function elemental!(pol_type, SFI::Bool,
         # with absorption in batch mode, low tau_scatt but higher tau_total, needs exact equations
         kernel! = get_canopy_elem_rt!(device)
         event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, G, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺)); 
-        wait(device, event)
+        #wait(device, event)
         synchronize_if_gpu()
         #@show G
         # SFI part
         kernel! = get_canopy_elem_rt_SFI!(device)
         event = kernel!(j₀⁺, j₀⁻, ϖ, dτ, arr_type(τ_sum), G, Z⁻⁺, Z⁺⁺, qp_μN, ndoubl, wct02, pol_type.n, I₀, iμ₀, D, ndrange=size(j₀⁺))
-        wait(device, event)
+        #wait(device, event)
         synchronize_if_gpu()
 
         # Apply D Matrix

diff --git a/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl b/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl
@@ -186,7 +186,7 @@ function get_elem_rt!(RS_type::RRS_plus,
                     aType(Z⁺⁺_λ₁λ₀[:,:,bandSpecLim[iB]]), 
                     qp_μN, wct2, 
                     ndrange=getKernelDim(RS_type,ier⁻⁺[:,:,RS_type.bandSpecLim[iB],:])); 
-        wait(device, event);
+        #wait(device, event);
         synchronize_if_gpu();
     end
 end
@@ -215,7 +215,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀), 
         qp_μN, wct2, 
         ndrange=getKernelDim(RS_type,ier⁻⁺,i_λ₁λ₀)); 
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
 
     t_ier⁻⁺  = similar(ier⁻⁺)
@@ -229,7 +229,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         aType(Z⁻⁺_λ₁λ₀_VS_n2), aType(Z⁺⁺_λ₁λ₀_VS_n2), 
         qp_μN, wct2, 
         ndrange=getKernelDim(RS_type,ier⁻⁺,i_λ₁λ₀_VS_n2)); 
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
 
     ier⁻⁺ += t_ier⁻⁺
@@ -245,7 +245,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         aType(Z⁻⁺_λ₁λ₀_VS_o2), aType(Z⁺⁺_λ₁λ₀_VS_o2), 
         qp_μN, wct2, 
         ndrange=getKernelDim(RS_type,ier⁻⁺, i_λ₁λ₀_VS_o2)); 
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     ier⁻⁺ += t_ier⁻⁺
     iet⁺⁺ += t_iet⁺⁺
@@ -354,7 +354,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         qp_μN, ndoubl, wct02, nStokes, 
         I₀, iμ0, D, 
         ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀)); #change this
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
 
     t_ieJ₀⁺ = similar(ieJ₀⁻)
@@ -369,7 +369,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         qp_μN, ndoubl, wct02, nStokes, 
         I₀, iμ0, D, 
         ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀_VS_n2)); #change this
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
 
     ieJ₀⁺ += t_ieJ₀⁺
@@ -384,7 +384,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         qp_μN, ndoubl, wct02, nStokes, 
         I₀, iμ0, D, 
         ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀_VS_o2)); #change this
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
 
     ieJ₀⁺ += t_ieJ₀⁺
@@ -482,7 +482,7 @@ function get_elem_rt_SFI!(RS_type::RRS_plus,
                 qp_μN, ndoubl, wct02, nStokes, 
                 I₀, iμ0, D, 
                 ndrange=getKernelDimSFI(RS_type,ieJ₀⁻));
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
 end
 

diff --git a/src/CoreRT/CoreKernel/raman_kernel_test.jl b/src/CoreRT/CoreKernel/raman_kernel_test.jl
@@ -1,5 +1,5 @@
 using KernelAbstractions
-using CUDAKernels
+using CUDA.CUDAKernels
 using CUDA
 
 nij = 14

diff --git a/src/CoreRT/CoreRT.jl b/src/CoreRT/CoreRT.jl
@@ -24,7 +24,7 @@ using ...Architectures             # Use Architectures module
 using CUDA                         # GPU CuArrays and functions
 using KernelAbstractions           # Abstracting code for CPU/GPU
 using KernelAbstractions.Extras
-using CUDAKernels
+using CUDA.CUDAKernels
 
 using Unitful                      # For parsing 
 using UnitfulEquivalences          # For converting between wavenumber / wavelength

diff --git a/src/CoreRT/Surfaces/rpv_surface.jl b/src/CoreRT/Surfaces/rpv_surface.jl
@@ -148,7 +148,7 @@ function expandSurface!(Rsurf::AbstractArray{FT,2}, n_stokes::Int, v) where {FT}
     device = devi(architecture(Rsurf))
     applyExpansion_! = applyExpansion!(device)
     event = applyExpansion_!(Rsurf, n_stokes, v, ndrange=size(v));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     return nothing
 end

diff --git a/test/gpu_tests/elemental_test.jl b/test/gpu_tests/elemental_test.jl
@@ -43,7 +43,7 @@ kernel!(r⁻⁺_CPU, t⁺⁺_CPU, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w,0, ndrange=si
 
 function test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
     event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w, 0, ndrange=size(r⁻⁺));
-    wait(device, event)
+    #wait(device, event)
     synchronize();
 end
 @time test2(r⁻⁺_CPU, t⁺⁺_CPU, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
@@ -59,7 +59,7 @@ kernel! = get_r!(device)
 kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w,0, ndrange=size(r⁻⁺));
 function test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
     event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w, 0, ndrange=size(r⁻⁺));
-    wait(device, event)
+    #wait(device, event)
     synchronize();
 end
 @time test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)