update vSmartMOM

RemoteSensingTools · Dec 1, 2024 · dc0bf7c · dc0bf7c
1 parent bbb577a
commit dc0bf7c
Show file tree

Hide file tree

Showing 16 changed files with 64 additions and 99 deletions.
diff --git a/Project.toml b/Project.toml
@@ -4,67 +4,34 @@ authors = ["Rupesh Jeyaram <[email protected]> and contributors"]
 version = "0.5.0"
 
 [deps]
-AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc"
-AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95"
-ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
 Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
-BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
-CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 CanopyOptics = "a18e34a6-5dbe-4f38-a44b-e5141852e7a7"
-ClimaParams = "5c42b081-d73a-476f-9059-fd94b934656c"
-ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
-Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
-DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0"
-Decimals = "abce61dc-4473-55a0-ba07-351d65e31d42"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
-Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838"
-Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
-GR = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71"
-GeoMakie = "db073c08-6b98-4ee5-b6a4-5efafb3259c6"
-GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
-Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
-GridLayoutBase = "3955a311-db13-416c-9275-1d80ed98e5e9"
-HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
-HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 ImageFiltering = "6a3955dd-da59-5b1f-98d4-e7296123deb5"
-Insolation = "e98cc03f-d57e-4e3c-b70c-8d51efe9e0d8"
 InstrumentOperator = "9e589c1b-9e01-4e00-831a-aa39ce86e3ef"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
-JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
-LegendrePolynomials = "3db4a2ba-fc88-11e8-3e01-49c72059a882"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
 NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 NaNMath = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
-NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
 NetCDF = "30363a11-5582-574a-97bb-aa9a979735b9"
 NetCDF_jll = "7243133f-43d8-5620-bbf4-c2c921802cf3"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-PlotUtils = "995b91a9-d308-5afd-9ec6-746e21dbc043"
-PlotlyBase = "a03496cd-edff-5a9b-9e67-9cda94a718b5"
-PlotlyJS = "f0f68f2c-4968-5e81-91da-67840de0976a"
-Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
-PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
-PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee"
 Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
@@ -76,26 +43,25 @@ UnitfulEquivalences = "da9c4bc3-91c8-4f02-8a40-6b990d2a7e0c"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
 
 [compat]
-CUDA = "3,  4"
-CUDAKernels = "0.2, 0.3, 0.4"
+CUDA = "4, 5"
 DataInterpolations = "3.6,4"
 DiffResults = "1.0"
 Distributions = "0.23, 0.24, 0.25"
 DocStringExtensions = "0.8,0.9"
 FastGaussQuadrature = "0.4,0.5"
 ForwardDiff = "0.10"
-Interpolations = "0.12, 0.13, 0.14"
+Interpolations = "0.14.0"
 JLD2 = "0.1, 0.2, 0.3, 0.4"
 JSON = "0.21"
 KernelAbstractions = "0.8,0.9"
-NNlib = "0.8"
+NNlib = "0.8,0.9"
 NetCDF = "0.10, 0.11"
 Parameters = "0.12"
 Polynomials = "2,3"
 ProgressMeter = "1.3"
 SpecialFunctions = "2"
 StaticArrays = "1.2"
-StatsBase = "0.33"
+StatsBase = "0.34.3"
 TimerOutputs = "0.5"
 YAML = "0.4"
 julia = "1.7,1.8,1.9"

diff --git a/src/Absorption/Absorption.jl b/src/Absorption/Absorption.jl
@@ -15,7 +15,7 @@ using Interpolations            # For interpolating in lookup tables and interpo
 using JLD2                      # For saving and loading the interpolator
 using ProgressMeter             # For showing progress, especially in creating interpolator
 using KernelAbstractions        # For heterogeneous (GPU+CPU) programming
-using CUDAKernels               # Access to CUDADevice
+using CUDA.CUDAKernels               # Access to CUDADevice
 using CUDA                      # For GPU programming
 using ForwardDiff, DiffResults  # For auto-differentiation
 using NetCDF                    # For loading NetCDF files with constants

diff --git a/src/Absorption/compute_absorption_cross_section.jl b/src/Absorption/compute_absorption_cross_section.jl
@@ -120,7 +120,7 @@ function compute_absorption_cross_section(
             # Run the event on the kernel 
             # That this, this function adds to each element in result, the contribution from this transition
             event = kernel!(result_view, array_type(architecture)(grid_view), ν, γ_d, γ_l, y, S, broadening, CEF, ndrange=length(grid_view))
-            wait(device, event)
+            ##wait(device, event)
             synchronize_if_gpu()
         end
     end

diff --git a/src/Architectures.jl b/src/Architectures.jl
@@ -12,7 +12,7 @@ export
 using CUDA
 
 using KernelAbstractions
-using CUDAKernels
+using CUDA.CUDAKernels
 
 """
     AbstractArchitecture
@@ -42,18 +42,17 @@ macro hascuda(expr)
 end
 
 devi(::CPU) = KernelAbstractions.CPU()
-devi(::GPU) = CUDAKernels.CUDADevice()
+devi(::GPU) = CUDA.CUDABackend(; always_inline=true)
 
-         architecture(::Array)   = CPU()
-#@hascuda 
-        architecture(::CuArray) = GPU()
+architecture(::Array)   = CPU()
+@hascuda architecture(::CuArray) = GPU()
 
-         array_type(::CPU) = Array
-#@hascuda 
-        array_type(::GPU) = CuArray
+array_type(::CPU) = Array
+@hascuda array_type(::GPU) = CuArray
 
 default_architecture = has_cuda() ? GPU() : CPU()
 
-synchronize_if_gpu() = has_cuda() ? synchronize() : nothing
+synchronize_if_gpu() = has_cuda() ? CUDA.synchronize() : nothing
+
 
 end
diff --git a/src/CoreRT/CoreKernel/doubling.jl b/src/CoreRT/CoreKernel/doubling.jl
@@ -128,7 +128,7 @@ function apply_D_matrix!(n_stokes::Int, r⁻⁺::AbstractArray{FT,3}, t⁺⁺::A
         device = devi(architecture(r⁻⁺))
         applyD_kernel! = apply_D!(device)
         event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
-        wait(device, event);
+        ##wait(device, event);
         synchronize_if_gpu();
         return nothing
     end
@@ -144,7 +144,7 @@ end
         device = devi(Architectures.CPU())
         applyD_kernel! = apply_D!(device)
         event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
-        wait(device, event);
+        #wait(device, event);
         return nothing
     end
 end=#
@@ -154,7 +154,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::AbstractArray{FT,3}) where
     device = devi(architecture(J₀⁻))
     applyD_kernel! = apply_D_SFI!(device)
     event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
-    wait(device, event);
+    ##wait(device, event);
     synchronize_if_gpu();
     nothing
 end
@@ -167,7 +167,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::Array{FT,3}) where {FT}
     device = devi(architecture(J₀⁻))
     applyD_kernel! = apply_D_SFI!(device)
     event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
-    wait(device, event);
+    #wait(device, event);
     
     return nothing
 end=#
diff --git a/src/CoreRT/CoreKernel/doubling_inelastic.jl b/src/CoreRT/CoreKernel/doubling_inelastic.jl
@@ -410,7 +410,7 @@ end
 #        device = devi(architecture(r⁻⁺))
 #        applyD_kernel! = apply_D!(device)
 #        event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺)); #Suniti: is it possible to  use the same kernel for the 3D elastic and 4D inelastic terms or do we need to call two different kernels separately? 
-#        wait(device, event);
+#        #wait(device, event);
 #        synchronize_if_gpu();
 #        return nothing
 #    end
@@ -427,7 +427,7 @@ function apply_D_matrix_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_stokes
         applyD_kernel_IE! = apply_D_IE_VS!(device)
         event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀_all), n_stokes, 
             ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺,(RS_type.i_λ₁λ₀_all)));
-        wait(device, event);
+        ##wait(device, event);
         synchronize();
         return nothing
     end
@@ -444,7 +444,7 @@ function apply_D_matrix_IE!(RS_type::RRS, n_stokes::Int, ier⁻⁺::AbstractArra
         applyD_kernel_IE! = apply_D_IE_RRS!(device)
         event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀), n_stokes, 
             ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺));
-        wait(device, event);
+        ##wait(device, event);
         synchronize();
         return nothing
     end
@@ -456,7 +456,7 @@ end
 #    device = devi(architecture(J₀⁻)) #Suniti: how to do this so that ieJ₀⁻ can also be included?
 #    applyD_kernel! = apply_D_SFI!(device)
 #    event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
-#    wait(device, event);
+#    #wait(device, event);
 #    synchronize();
 #    
 #    return nothing
@@ -470,7 +470,7 @@ function apply_D_matrix_SFI_IE!(RS_type::RRS, n_stokes::Int, ieJ₀⁻::Abstract
     applyD_kernel_IE! = apply_D_SFI_IE_RRS!(device)
     event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀),n_stokes, 
                     ieJ₀⁻, ndrange=(size(ieJ₀⁻,1), size(ieJ₀⁻,3), size(ieJ₀⁻,4)));
-    wait(device, event);
+    ##wait(device, event);
     synchronize_if_gpu()
     return nothing
 end
@@ -490,7 +490,7 @@ function apply_D_matrix_SFI_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_st
                             ieJ₀⁻, 
                             aType(RS_type.i_λ₁λ₀_all)));
     #@show "here 3"
-    wait(device, event);
+    ##wait(device, event);
     #@show "here 4"
     synchronize_if_gpu()
     return nothing

diff --git a/src/CoreRT/CoreKernel/elemental.jl b/src/CoreRT/CoreKernel/elemental.jl
@@ -81,13 +81,13 @@ function elemental!(pol_type, SFI::Bool,
             kernel! = get_elem_rt!(device)
             event = kernel!(r⁻⁺, t⁺⁺, ϖ_λ, dτ_λ, Z⁻⁺, Z⁺⁺, F₀,
                 qp_μN, wct2, ndrange=size(r⁻⁺)); 
-            wait(device, event)
+            #wait(device, event)
             synchronize_if_gpu()
 
             if SFI
                 kernel! = get_elem_rt_SFI!(device)
                 event = kernel!(J₀⁺, J₀⁻, ϖ_λ, dτ_λ, τ_sum, Z⁻⁺, Z⁺⁺, F₀, qp_μN, ndoubl, wct02, pol_type.n, arr_type(pol_type.I₀), iμ₀, D, ndrange=size(J₀⁺))
-                wait(device, event)
+                #wait(device, event)
                 synchronize_if_gpu()
             end
         end
@@ -153,15 +153,15 @@ function elemental!(pol_type, SFI::Bool,
         #@show "Start event",   typeof(wct2)
         event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺)); 
         #@show "Stop event"
-        wait(device, event)
+        #wait(device, event)
         synchronize_if_gpu()
 
         if SFI
             kernel! = get_elem_rt_SFI!(device)
             #@show size(F₀)
             event = kernel!(J₀⁺, J₀⁻, ϖ, dτ, arr_type(τ_sum), Z⁻⁺, Z⁺⁺, 
             arr_type(F₀), qp_μN, ndoubl, wct02, pol_type.n, I₀, iμ₀, D, ndrange=size(J₀⁺))
-            wait(device, event)
+            #wait(device, event)
         end
         #ii = pol_type.n*(iμ0-1)+1
         #@show 'B',iμ0,  r⁻⁺[1,ii,1]/(J₀⁻[1,1,1]*wt_μ[iμ0]), r⁻⁺[1,ii,1], J₀⁻[1,1,1]*wt_μ[iμ0], J₀⁺[1,1,1]*wt_μ[iμ0]
@@ -317,7 +317,7 @@ function apply_D_matrix_elemental!(ndoubl::Int, n_stokes::Int, r⁻⁺::Abstract
     device = devi(architecture(r⁻⁺))
     applyD_kernel! = apply_D_elemental!(device)
     event = applyD_kernel!(ndoubl,n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     return nothing
 end
@@ -329,7 +329,7 @@ function apply_D_matrix_elemental_SFI!(ndoubl::Int, n_stokes::Int, J₀⁻::Abst
         device = devi(architecture(J₀⁻))
         applyD_kernel! = apply_D_elemental_SFI!(device)
         event = applyD_kernel!(ndoubl,n_stokes, J₀⁻, ndrange=size(J₀⁻));
-        wait(device, event);
+        #wait(device, event);
         synchronize_if_gpu();
         return nothing
     end

diff --git a/src/CoreRT/CoreKernel/elemental_inelastic.jl b/src/CoreRT/CoreKernel/elemental_inelastic.jl
@@ -224,7 +224,7 @@ function get_elem_rt!(RS_type::RRS,
                     aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀), 
                     qp_μN, wct2, 
                     ndrange=getKernelDim(RS_type,ier⁻⁺)); 
-        wait(device, event);
+        #wait(device, event);
         synchronize_if_gpu();
         #for j=1:1:length(qp_μN)
         #    @show minimum(iet⁺⁺[1:3:end,j,200,50]), minimum(ier⁻⁺[1:3:end,j,200,50]) 
@@ -254,7 +254,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1, VS_1to0},
         aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀), 
         qp_μN, wct2, 
         ndrange=getKernelDim(RS_type,ier⁻⁺)); 
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
 end
 
@@ -342,7 +342,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1, VS_1to0},
     qp_μN, ndoubl, wct02, nStokes, 
     I₀, iμ0, D, 
     ndrange=getKernelDimSFI(RS_type,ieJ₀⁻));
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
 end
 
@@ -441,7 +441,7 @@ function get_elem_rt_SFI!(RS_type::RRS,
                 I₀, iμ0, D, 
                 ndrange=getKernelDimSFI(RS_type,ieJ₀⁻));
 
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
     #@show minimum(ieJ₀⁺[1:3:end,1,200,50]), minimum(ieJ₀⁻[1:3:end,1,200,50]) 
     #@show maximum(ieJ₀⁺[1:3:end,1,200,50]), maximum(ieJ₀⁻[1:3:end,1,200,50]) 
@@ -661,7 +661,7 @@ function apply_D_matrix_elemental!(RS_type::Union{RRS, RRS_plus}, ndoubl::Int, n
         n_stokes, 
         ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, 
         ndrange=size(ier⁻⁺));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     return nothing
 end
@@ -679,7 +679,7 @@ function apply_D_matrix_elemental!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
                     n_stokes, RS_type.i_λ₁λ₀_all,
                     ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, 
                     ndrange=getKernelDim(RS_type,ier⁻⁺,RS_type.i_λ₁λ₀_all));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     return nothing
 end
@@ -699,7 +699,7 @@ function apply_D_matrix_elemental_SFI!(RS_type::Union{RRS, RRS_plus},
                                 ieJ₀⁻, 
                                 ndrange=size(ieJ₀⁻));
         #@show "here 1.4"
-        wait(device, event);
+        #wait(device, event);
         synchronize();
         return nothing
     end
@@ -722,7 +722,7 @@ function apply_D_matrix_elemental_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus
                             ndrange = getKernelDimSFI(RS_type,ieJ₀⁻,RS_type.i_λ₁λ₀_all));
                             #ndrange=size(ieJ₀⁻));
         #@show "here 1.4"
-        wait(device, event);
+        #wait(device, event);
         synchronize();
         return nothing
     end