From dc0bf7c41047ddd3169b8e5e2d6d0b4b15f66982 Mon Sep 17 00:00:00 2001 From: sunitisanghavi Date: Sun, 1 Dec 2024 14:14:19 -0800 Subject: [PATCH] update vSmartMOM --- Project.toml | 42 ++----------------- src/Absorption/Absorption.jl | 2 +- .../compute_absorption_cross_section.jl | 2 +- src/Architectures.jl | 17 ++++---- src/CoreRT/CoreKernel/doubling.jl | 8 ++-- src/CoreRT/CoreKernel/doubling_inelastic.jl | 12 +++--- src/CoreRT/CoreKernel/elemental.jl | 12 +++--- src/CoreRT/CoreKernel/elemental_inelastic.jl | 16 +++---- .../CoreKernel/elemental_inelastic_plus.jl | 20 ++++----- src/CoreRT/CoreKernel/interaction_ss.jl | 6 +-- src/CoreRT/CoreKernel/raman_kernel_test.jl | 4 +- src/CoreRT/CoreRT.jl | 2 +- src/CoreRT/gpu_batched.jl | 2 +- src/CoreRT/model_from_parameters.jl | 12 +++--- test/gpu_tests/elemental_test.jl | 4 +- .../O2_parameters2_SIF_grid.yaml | 2 +- 16 files changed, 64 insertions(+), 99 deletions(-) diff --git a/Project.toml b/Project.toml index 907e1c19..1c3e969b 100644 --- a/Project.toml +++ b/Project.toml @@ -4,67 +4,34 @@ authors = ["Rupesh Jeyaram and contributors"] version = "0.5.0" [deps] -AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" -AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" -ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" -CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CanopyOptics = "a18e34a6-5dbe-4f38-a44b-e5141852e7a7" -ClimaParams = "5c42b081-d73a-476f-9059-fd94b934656c" -ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4" -Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0" -Decimals = "abce61dc-4473-55a0-ba07-351d65e31d42" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" -Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" -Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" -GR = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71" -GeoMakie = "db073c08-6b98-4ee5-b6a4-5efafb3259c6" -GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326" -Glob = "c27321d9-0574-5035-807b-f59d2c89b15c" -GridLayoutBase = "3955a311-db13-416c-9275-1d80ed98e5e9" -HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" -HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" ImageFiltering = "6a3955dd-da59-5b1f-98d4-e7296123deb5" -Insolation = "e98cc03f-d57e-4e3c-b70c-8d51efe9e0d8" InstrumentOperator = "9e589c1b-9e01-4e00-831a-aa39ce86e3ef" Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" -LegendrePolynomials = "3db4a2ba-fc88-11e8-3e01-49c72059a882" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" NaNMath = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" NetCDF = "30363a11-5582-574a-97bb-aa9a979735b9" NetCDF_jll = "7243133f-43d8-5620-bbf4-c2c921802cf3" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -PlotUtils = "995b91a9-d308-5afd-9ec6-746e21dbc043" -PlotlyBase = "a03496cd-edff-5a9b-9e67-9cda94a718b5" -PlotlyJS = "f0f68f2c-4968-5e81-91da-67840de0976a" -Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" -PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" -PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" @@ -76,26 +43,25 @@ UnitfulEquivalences = "da9c4bc3-91c8-4f02-8a40-6b990d2a7e0c" YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] -CUDA = "3, 4" -CUDAKernels = "0.2, 0.3, 0.4" +CUDA = "4, 5" DataInterpolations = "3.6,4" DiffResults = "1.0" Distributions = "0.23, 0.24, 0.25" DocStringExtensions = "0.8,0.9" FastGaussQuadrature = "0.4,0.5" ForwardDiff = "0.10" -Interpolations = "0.12, 0.13, 0.14" +Interpolations = "0.14.0" JLD2 = "0.1, 0.2, 0.3, 0.4" JSON = "0.21" KernelAbstractions = "0.8,0.9" -NNlib = "0.8" +NNlib = "0.8,0.9" NetCDF = "0.10, 0.11" Parameters = "0.12" Polynomials = "2,3" ProgressMeter = "1.3" SpecialFunctions = "2" StaticArrays = "1.2" -StatsBase = "0.33" +StatsBase = "0.34.3" TimerOutputs = "0.5" YAML = "0.4" julia = "1.7,1.8,1.9" diff --git a/src/Absorption/Absorption.jl b/src/Absorption/Absorption.jl index e7642941..512cff29 100644 --- a/src/Absorption/Absorption.jl +++ b/src/Absorption/Absorption.jl @@ -15,7 +15,7 @@ using Interpolations # For interpolating in lookup tables and interpo using JLD2 # For saving and loading the interpolator using ProgressMeter # For showing progress, especially in creating interpolator using KernelAbstractions # For heterogeneous (GPU+CPU) programming -using CUDAKernels # Access to CUDADevice +using CUDA.CUDAKernels # Access to CUDADevice using CUDA # For GPU programming using ForwardDiff, DiffResults # For auto-differentiation using NetCDF # For loading NetCDF files with constants diff --git a/src/Absorption/compute_absorption_cross_section.jl b/src/Absorption/compute_absorption_cross_section.jl index c9c5b9a8..a5ea2518 100644 --- a/src/Absorption/compute_absorption_cross_section.jl +++ b/src/Absorption/compute_absorption_cross_section.jl @@ -120,7 +120,7 @@ function compute_absorption_cross_section( # Run the event on the kernel # That this, this function adds to each element in result, the contribution from this transition event = kernel!(result_view, array_type(architecture)(grid_view), ν, γ_d, γ_l, y, S, broadening, CEF, ndrange=length(grid_view)) - wait(device, event) + ##wait(device, event) synchronize_if_gpu() end end diff --git a/src/Architectures.jl b/src/Architectures.jl index 57da898c..6f1f6b3e 100644 --- a/src/Architectures.jl +++ b/src/Architectures.jl @@ -12,7 +12,7 @@ export using CUDA using KernelAbstractions -using CUDAKernels +using CUDA.CUDAKernels """ AbstractArchitecture @@ -42,18 +42,17 @@ macro hascuda(expr) end devi(::CPU) = KernelAbstractions.CPU() -devi(::GPU) = CUDAKernels.CUDADevice() +devi(::GPU) = CUDA.CUDABackend(; always_inline=true) - architecture(::Array) = CPU() -#@hascuda - architecture(::CuArray) = GPU() +architecture(::Array) = CPU() +@hascuda architecture(::CuArray) = GPU() - array_type(::CPU) = Array -#@hascuda - array_type(::GPU) = CuArray +array_type(::CPU) = Array +@hascuda array_type(::GPU) = CuArray default_architecture = has_cuda() ? GPU() : CPU() -synchronize_if_gpu() = has_cuda() ? synchronize() : nothing +synchronize_if_gpu() = has_cuda() ? CUDA.synchronize() : nothing + end \ No newline at end of file diff --git a/src/CoreRT/CoreKernel/doubling.jl b/src/CoreRT/CoreKernel/doubling.jl index 54944ef6..7974bf54 100644 --- a/src/CoreRT/CoreKernel/doubling.jl +++ b/src/CoreRT/CoreKernel/doubling.jl @@ -128,7 +128,7 @@ function apply_D_matrix!(n_stokes::Int, r⁻⁺::AbstractArray{FT,3}, t⁺⁺::A device = devi(architecture(r⁻⁺)) applyD_kernel! = apply_D!(device) event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺)); - wait(device, event); + ##wait(device, event); synchronize_if_gpu(); return nothing end @@ -144,7 +144,7 @@ end device = devi(Architectures.CPU()) applyD_kernel! = apply_D!(device) event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺)); - wait(device, event); + #wait(device, event); return nothing end end=# @@ -154,7 +154,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::AbstractArray{FT,3}) where device = devi(architecture(J₀⁻)) applyD_kernel! = apply_D_SFI!(device) event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻)); - wait(device, event); + ##wait(device, event); synchronize_if_gpu(); nothing end @@ -167,7 +167,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::Array{FT,3}) where {FT} device = devi(architecture(J₀⁻)) applyD_kernel! = apply_D_SFI!(device) event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻)); - wait(device, event); + #wait(device, event); return nothing end=# \ No newline at end of file diff --git a/src/CoreRT/CoreKernel/doubling_inelastic.jl b/src/CoreRT/CoreKernel/doubling_inelastic.jl index 436fd896..134b733c 100644 --- a/src/CoreRT/CoreKernel/doubling_inelastic.jl +++ b/src/CoreRT/CoreKernel/doubling_inelastic.jl @@ -410,7 +410,7 @@ end # device = devi(architecture(r⁻⁺)) # applyD_kernel! = apply_D!(device) # event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺)); #Suniti: is it possible to use the same kernel for the 3D elastic and 4D inelastic terms or do we need to call two different kernels separately? -# wait(device, event); +# #wait(device, event); # synchronize_if_gpu(); # return nothing # end @@ -427,7 +427,7 @@ function apply_D_matrix_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_stokes applyD_kernel_IE! = apply_D_IE_VS!(device) event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀_all), n_stokes, ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺,(RS_type.i_λ₁λ₀_all))); - wait(device, event); + ##wait(device, event); synchronize(); return nothing end @@ -444,7 +444,7 @@ function apply_D_matrix_IE!(RS_type::RRS, n_stokes::Int, ier⁻⁺::AbstractArra applyD_kernel_IE! = apply_D_IE_RRS!(device) event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀), n_stokes, ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺)); - wait(device, event); + ##wait(device, event); synchronize(); return nothing end @@ -456,7 +456,7 @@ end # device = devi(architecture(J₀⁻)) #Suniti: how to do this so that ieJ₀⁻ can also be included? # applyD_kernel! = apply_D_SFI!(device) # event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻)); -# wait(device, event); +# #wait(device, event); # synchronize(); # # return nothing @@ -470,7 +470,7 @@ function apply_D_matrix_SFI_IE!(RS_type::RRS, n_stokes::Int, ieJ₀⁻::Abstract applyD_kernel_IE! = apply_D_SFI_IE_RRS!(device) event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀),n_stokes, ieJ₀⁻, ndrange=(size(ieJ₀⁻,1), size(ieJ₀⁻,3), size(ieJ₀⁻,4))); - wait(device, event); + ##wait(device, event); synchronize_if_gpu() return nothing end @@ -490,7 +490,7 @@ function apply_D_matrix_SFI_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_st ieJ₀⁻, aType(RS_type.i_λ₁λ₀_all))); #@show "here 3" - wait(device, event); + ##wait(device, event); #@show "here 4" synchronize_if_gpu() return nothing diff --git a/src/CoreRT/CoreKernel/elemental.jl b/src/CoreRT/CoreKernel/elemental.jl index 2a44b742..2c9d6137 100644 --- a/src/CoreRT/CoreKernel/elemental.jl +++ b/src/CoreRT/CoreKernel/elemental.jl @@ -81,13 +81,13 @@ function elemental!(pol_type, SFI::Bool, kernel! = get_elem_rt!(device) event = kernel!(r⁻⁺, t⁺⁺, ϖ_λ, dτ_λ, Z⁻⁺, Z⁺⁺, F₀, qp_μN, wct2, ndrange=size(r⁻⁺)); - wait(device, event) + #wait(device, event) synchronize_if_gpu() if SFI kernel! = get_elem_rt_SFI!(device) event = kernel!(J₀⁺, J₀⁻, ϖ_λ, dτ_λ, τ_sum, Z⁻⁺, Z⁺⁺, F₀, qp_μN, ndoubl, wct02, pol_type.n, arr_type(pol_type.I₀), iμ₀, D, ndrange=size(J₀⁺)) - wait(device, event) + #wait(device, event) synchronize_if_gpu() end end @@ -153,7 +153,7 @@ function elemental!(pol_type, SFI::Bool, #@show "Start event", typeof(wct2) event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺)); #@show "Stop event" - wait(device, event) + #wait(device, event) synchronize_if_gpu() if SFI @@ -161,7 +161,7 @@ function elemental!(pol_type, SFI::Bool, #@show size(F₀) event = kernel!(J₀⁺, J₀⁻, ϖ, dτ, arr_type(τ_sum), Z⁻⁺, Z⁺⁺, arr_type(F₀), qp_μN, ndoubl, wct02, pol_type.n, I₀, iμ₀, D, ndrange=size(J₀⁺)) - wait(device, event) + #wait(device, event) end #ii = pol_type.n*(iμ0-1)+1 #@show 'B',iμ0, r⁻⁺[1,ii,1]/(J₀⁻[1,1,1]*wt_μ[iμ0]), r⁻⁺[1,ii,1], J₀⁻[1,1,1]*wt_μ[iμ0], J₀⁺[1,1,1]*wt_μ[iμ0] @@ -317,7 +317,7 @@ function apply_D_matrix_elemental!(ndoubl::Int, n_stokes::Int, r⁻⁺::Abstract device = devi(architecture(r⁻⁺)) applyD_kernel! = apply_D_elemental!(device) event = applyD_kernel!(ndoubl,n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); return nothing end @@ -329,7 +329,7 @@ function apply_D_matrix_elemental_SFI!(ndoubl::Int, n_stokes::Int, J₀⁻::Abst device = devi(architecture(J₀⁻)) applyD_kernel! = apply_D_elemental_SFI!(device) event = applyD_kernel!(ndoubl,n_stokes, J₀⁻, ndrange=size(J₀⁻)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); return nothing end diff --git a/src/CoreRT/CoreKernel/elemental_inelastic.jl b/src/CoreRT/CoreKernel/elemental_inelastic.jl index ee10327e..d3e401d1 100644 --- a/src/CoreRT/CoreKernel/elemental_inelastic.jl +++ b/src/CoreRT/CoreKernel/elemental_inelastic.jl @@ -224,7 +224,7 @@ function get_elem_rt!(RS_type::RRS, aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀), qp_μN, wct2, ndrange=getKernelDim(RS_type,ier⁻⁺)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); #for j=1:1:length(qp_μN) # @show minimum(iet⁺⁺[1:3:end,j,200,50]), minimum(ier⁻⁺[1:3:end,j,200,50]) @@ -254,7 +254,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1, VS_1to0}, aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀), qp_μN, wct2, ndrange=getKernelDim(RS_type,ier⁻⁺)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); end @@ -342,7 +342,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1, VS_1to0}, qp_μN, ndoubl, wct02, nStokes, I₀, iμ0, D, ndrange=getKernelDimSFI(RS_type,ieJ₀⁻)); - wait(device, event) + #wait(device, event) synchronize_if_gpu(); end @@ -441,7 +441,7 @@ function get_elem_rt_SFI!(RS_type::RRS, I₀, iμ0, D, ndrange=getKernelDimSFI(RS_type,ieJ₀⁻)); - wait(device, event) + #wait(device, event) synchronize_if_gpu(); #@show minimum(ieJ₀⁺[1:3:end,1,200,50]), minimum(ieJ₀⁻[1:3:end,1,200,50]) #@show maximum(ieJ₀⁺[1:3:end,1,200,50]), maximum(ieJ₀⁻[1:3:end,1,200,50]) @@ -661,7 +661,7 @@ function apply_D_matrix_elemental!(RS_type::Union{RRS, RRS_plus}, ndoubl::Int, n n_stokes, ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=size(ier⁻⁺)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); return nothing end @@ -679,7 +679,7 @@ function apply_D_matrix_elemental!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_stokes, RS_type.i_λ₁λ₀_all, ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type,ier⁻⁺,RS_type.i_λ₁λ₀_all)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); return nothing end @@ -699,7 +699,7 @@ function apply_D_matrix_elemental_SFI!(RS_type::Union{RRS, RRS_plus}, ieJ₀⁻, ndrange=size(ieJ₀⁻)); #@show "here 1.4" - wait(device, event); + #wait(device, event); synchronize(); return nothing end @@ -722,7 +722,7 @@ function apply_D_matrix_elemental_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus ndrange = getKernelDimSFI(RS_type,ieJ₀⁻,RS_type.i_λ₁λ₀_all)); #ndrange=size(ieJ₀⁻)); #@show "here 1.4" - wait(device, event); + #wait(device, event); synchronize(); return nothing end diff --git a/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl b/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl index 701f4edb..22f4dc0b 100644 --- a/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl +++ b/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl @@ -192,7 +192,7 @@ function get_elem_rt!(RS_type::RRS_plus, aType(Z⁺⁺_λ₁λ₀[:,:,bandSpecLim[iB]]), qp_μN, wct2, ndrange=getKernelDim(RS_type,ier⁻⁺[:,:,RS_type.bandSpecLim[iB],:])); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); end end @@ -221,7 +221,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀), qp_μN, wct2, ndrange=getKernelDim(RS_type,ier⁻⁺,i_λ₁λ₀)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); #@show size(i_λ₁λ₀), size(i_λ₁λ₀_VS_n2), size(i_λ₁λ₀_VS_o2) #@show "RVS", t_ier⁻⁺[1,1,i_λ₁λ₀[findall(i_λ₁λ₀.>0)]] @@ -241,7 +241,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, aType(Z⁻⁺_λ₁λ₀_VS_n2), aType(Z⁺⁺_λ₁λ₀_VS_n2), qp_μN, wct2, ndrange=getKernelDim(RS_type,ier⁻⁺,i_λ₁λ₀_VS_n2)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); #@show "VS N2", t_ier⁻⁺[1,1,i_λ₁λ₀_VS_n2[findall(i_λ₁λ₀_VS_n2.>0)]] ier⁻⁺ .+= t_ier⁻⁺ @@ -261,7 +261,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, aType(Z⁻⁺_λ₁λ₀_VS_o2), aType(Z⁺⁺_λ₁λ₀_VS_o2), qp_μN, wct2, ndrange=getKernelDim(RS_type,ier⁻⁺, i_λ₁λ₀_VS_o2)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); #@show "VS O2", t_ier⁻⁺[1,1,i_λ₁λ₀_VS_o2[findall(i_λ₁λ₀_VS_o2.>0)]] ier⁻⁺ .+= t_ier⁻⁺ @@ -386,7 +386,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, qp_μN, ndoubl, wct02, nStokes, I₀, iμ0, D, ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀)); #change this - wait(device, event) + #wait(device, event) synchronize_if_gpu(); t_ieJ₀⁺ = similar(ieJ₀⁻) @@ -403,7 +403,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, qp_μN, ndoubl, wct02, nStokes, I₀, iμ0, D, ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀_VS_n2)); #change this - wait(device, event) + #wait(device, event) synchronize_if_gpu(); ieJ₀⁺ .+= t_ieJ₀⁺ @@ -419,7 +419,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, qp_μN, ndoubl, wct02, nStokes, I₀, iμ0, D, ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀_VS_o2)); #change this - wait(device, event) + #wait(device, event) synchronize_if_gpu(); ieJ₀⁺ .+= t_ieJ₀⁺ @@ -526,7 +526,7 @@ function get_elem_rt_SFI!(RS_type::RRS_plus, qp_μN, ndoubl, wct02, nStokes, I₀, iμ0, D, ndrange=getKernelDimSFI(RS_type,ieJ₀⁻)); - wait(device, event) + #wait(device, event) synchronize_if_gpu(); end @@ -673,7 +673,7 @@ function apply_D_matrix_elemental!(RS_type::RRS_plus, ndoubl::Int, n_stokes::Int device = devi(architecture(ier⁻⁺)) applyD_kernel! = apply_D_elemental_RRS!(device) event = applyD_kernel!(ndoubl,n_stokes, ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=size(ier⁻⁺)); - wait(device, event); + #wait(device, event); synchronize_if_gpu(); return nothing end @@ -686,7 +686,7 @@ function apply_D_matrix_elemental_SFI!(RS_type::Union{RRS_plus, VS_0to1_plus, VS device = devi(architecture(ieJ₀⁻)) applyD_kernel! = apply_D_elemental_SFI!(device) event = applyD_kernel!(RS_type,ndoubl,n_stokes, ieJ₀⁻, ndrange=size(ieJ₀⁻)); - wait(device, event); + #wait(device, event); synchronize(); return nothing end diff --git a/src/CoreRT/CoreKernel/interaction_ss.jl b/src/CoreRT/CoreKernel/interaction_ss.jl index ef1d828c..3fcc34a3 100644 --- a/src/CoreRT/CoreKernel/interaction_ss.jl +++ b/src/CoreRT/CoreKernel/interaction_ss.jl @@ -36,7 +36,7 @@ function interaction_ss!(SFI::Bool, arr_type(added_layer.J₀⁺), arr_type(added_layer.J₀⁻), J₀⁺, J₀⁻, ndrange=size(J₀⁻)) - wait(device, event) + #wait(device, event) synchronize_if_gpu() end @@ -68,7 +68,7 @@ function interaction_inelastic_ss!(RS_type::RRS, atype(added_layer.ieJ₀⁺), atype(added_layer.ieJ₀⁻), ieJ₀⁺, ieJ₀⁻, ndrange=getKernelDimSFI(RS_type, ieJ₀⁻)) - wait(device, event) + #wait(device, event) synchronize_if_gpu() end @@ -112,7 +112,7 @@ event = kernel!(τ_sum, τ_λ, qp_μN, atype(i_λ₁λ₀_all), atype(added_layer.ieJ₀⁺), atype(added_layer.ieJ₀⁻), ieJ₀⁺, ieJ₀⁻, ndrange = getKernelDimSFI(RS_type,ieJ₀⁻,RS_type.i_λ₁λ₀_all)) -wait(device, event) +#wait(device, event) synchronize_if_gpu() end diff --git a/src/CoreRT/CoreKernel/raman_kernel_test.jl b/src/CoreRT/CoreKernel/raman_kernel_test.jl index 4fe34b8a..11992f72 100644 --- a/src/CoreRT/CoreKernel/raman_kernel_test.jl +++ b/src/CoreRT/CoreKernel/raman_kernel_test.jl @@ -110,7 +110,7 @@ base_iet⁺⁺ = deepcopy(iet⁺⁺); device = CPU() kernel! = get_elem_rt!(device) event = kernel!(ier⁻⁺, iet⁺⁺, ϖ_λ, ϖ_λ₀λ₁,dτ₀, dτ₁, dτ_λ, Z⁻⁺_λ₀λ₁, Z⁺⁺_λ₀λ₁, qp_μN, wct2, ndrange=size(ier⁻⁺)); -wait(device, event) +#wait(device, event) base_ier⁻⁺ ≈ ier⁻⁺ base_iet⁺⁺ ≈ iet⁺⁺ @@ -134,7 +134,7 @@ if has_cuda() device = CUDAKernels.CUDADevice() kernel! = get_elem_rt!(device) event = kernel!(c_ier⁻⁺, c_iet⁺⁺, c_ϖ_λ, c_ϖ_λ₀λ₁,dτ₀, dτ₁, c_dτ_λ, c_Z⁻⁺_λ₀λ₁, c_Z⁺⁺_λ₀λ₁, c_qp_μN, c_wct2, ndrange=size(c_ier⁻⁺)); - wait(device, event) + #wait(device, event) cuda_iet⁺⁺ = Array(c_iet⁺⁺); cuda_ier⁻⁺ = Array(c_ier⁻⁺); diff --git a/src/CoreRT/CoreRT.jl b/src/CoreRT/CoreRT.jl index d8c22a15..a089d021 100644 --- a/src/CoreRT/CoreRT.jl +++ b/src/CoreRT/CoreRT.jl @@ -23,7 +23,7 @@ using ...Architectures # Use Architectures module using CUDA # GPU CuArrays and functions using KernelAbstractions # Abstracting code for CPU/GPU using KernelAbstractions.Extras -using CUDAKernels +using CUDA.CUDAKernels using Unitful # For parsing using UnitfulEquivalences # For converting between wavenumber / wavelength diff --git a/src/CoreRT/gpu_batched.jl b/src/CoreRT/gpu_batched.jl index d4ab9694..62dae6f8 100644 --- a/src/CoreRT/gpu_batched.jl +++ b/src/CoreRT/gpu_batched.jl @@ -3,7 +3,7 @@ This file contains implementations of batched linear algebra code =# - +@inline synchronize() = CUDA.synchronize() "Given 3D CuArrays A and B, fill in X[:,:,k] = A[:,:,k] \\ B[:,:,k]" function batch_solve!(X::CuArray{FT,3}, A::CuArray{FT,3}, B::CuArray{FT,3}) where {FT} diff --git a/src/CoreRT/model_from_parameters.jl b/src/CoreRT/model_from_parameters.jl index 89886836..1e6d9c8e 100644 --- a/src/CoreRT/model_from_parameters.jl +++ b/src/CoreRT/model_from_parameters.jl @@ -351,8 +351,8 @@ function model_from_parameters(params::vSmartMOM_Parameters) #@show aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃ kext_grid = [aerosol_optics_raw_0.k, aerosol_optics_raw_1.k] ksca_grid = [aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃] - interp_linear_kext = linear_interpolation(ν_grid, kext_grid) - interp_linear_ksca = linear_interpolation(ν_grid, ksca_grid) + interp_linear_kext = LinearInterpolation(ν_grid, kext_grid) + interp_linear_ksca = LinearInterpolation(ν_grid, ksca_grid) k = zeros(length(curr_band_λ)) ω̃ = zeros(length(curr_band_λ)) for i = 1:length(curr_band_λ) @@ -659,8 +659,8 @@ function model_from_parameters(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, #@show aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃ kext_grid = [aerosol_optics_raw_0.k, aerosol_optics_raw_1.k] ksca_grid = [aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃] - interp_linear_kext = linear_interpolation(ν_grid, kext_grid) - interp_linear_ksca = linear_interpolation(ν_grid, ksca_grid) + interp_linear_kext = LinearInterpolation(ν_grid, kext_grid) + interp_linear_ksca = LinearInterpolation(ν_grid, ksca_grid) k = zeros(length(curr_band_λ)) ω̃ = zeros(length(curr_band_λ)) for i = 1:length(curr_band_λ) @@ -941,8 +941,8 @@ function model_from_parameters(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, #@show aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃ kext_grid = [aerosol_optics_raw_0.k, aerosol_optics_raw_1.k] ksca_grid = [aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃] - interp_linear_kext = linear_interpolation(ν_grid, kext_grid) - interp_linear_ksca = linear_interpolation(ν_grid, ksca_grid) + interp_linear_kext = LinearInterpolation(ν_grid, kext_grid) + interp_linear_ksca = LinearInterpolation(ν_grid, ksca_grid) k = zeros(length(curr_band_λ)) ω̃ = zeros(length(curr_band_λ)) for i = 1:length(curr_band_λ) diff --git a/test/gpu_tests/elemental_test.jl b/test/gpu_tests/elemental_test.jl index c0f5b7fa..6f9f188c 100644 --- a/test/gpu_tests/elemental_test.jl +++ b/test/gpu_tests/elemental_test.jl @@ -43,7 +43,7 @@ kernel!(r⁻⁺_CPU, t⁺⁺_CPU, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w,0, ndrange=si function test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w) event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w, 0, ndrange=size(r⁻⁺)); - wait(device, event) + #wait(device, event) synchronize(); end @time test2(r⁻⁺_CPU, t⁺⁺_CPU, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w) @@ -59,7 +59,7 @@ kernel! = get_r!(device) kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w,0, ndrange=size(r⁻⁺)); function test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w) event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w, 0, ndrange=size(r⁻⁺)); - wait(device, event) + #wait(device, event) synchronize(); end @time test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w) diff --git a/test/test_parameters/O2_parameters2_SIF_grid.yaml b/test/test_parameters/O2_parameters2_SIF_grid.yaml index e2845238..7635a896 100644 --- a/test/test_parameters/O2_parameters2_SIF_grid.yaml +++ b/test/test_parameters/O2_parameters2_SIF_grid.yaml @@ -75,7 +75,7 @@ geometry: # Solar zenith angle (degrees) sza: 19.0 #19. #45. #60 #30. #60 #32.4436 # Viewing zenith angles (degrees) - vza: [60., 60., 60., 60., 60.] #[0, 30, 60, 30] #[15, 30, 45, 60] #[32.4436] #[0.072] + vza: [75., 75., 75., 75., 75.] #[0, 30, 60, 30] #[15, 30, 45, 60] #[32.4436] #[0.072] # Viewing azimuth angles (degrees) vaz: [0.0, 45., 90., 135., 180.] #, 0.0, 0.0, 180.] #[0.0, 0.0, 180., 180.] # Observation altitude (Pa)