Skip to content

Commit

Permalink
update vSmartMOM
Browse files Browse the repository at this point in the history
  • Loading branch information
sunitisanghavi committed Dec 1, 2024
1 parent bbb577a commit dc0bf7c
Show file tree
Hide file tree
Showing 16 changed files with 64 additions and 99 deletions.
42 changes: 4 additions & 38 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,67 +4,34 @@ authors = ["Rupesh Jeyaram <[email protected]> and contributors"]
version = "0.5.0"

[deps]
AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc"
AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95"
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
CanopyOptics = "a18e34a6-5dbe-4f38-a44b-e5141852e7a7"
ClimaParams = "5c42b081-d73a-476f-9059-fd94b934656c"
ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0"
Decimals = "abce61dc-4473-55a0-ba07-351d65e31d42"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838"
Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
GR = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71"
GeoMakie = "db073c08-6b98-4ee5-b6a4-5efafb3259c6"
GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
GridLayoutBase = "3955a311-db13-416c-9275-1d80ed98e5e9"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
ImageFiltering = "6a3955dd-da59-5b1f-98d4-e7296123deb5"
Insolation = "e98cc03f-d57e-4e3c-b70c-8d51efe9e0d8"
InstrumentOperator = "9e589c1b-9e01-4e00-831a-aa39ce86e3ef"
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
LegendrePolynomials = "3db4a2ba-fc88-11e8-3e01-49c72059a882"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
NaNMath = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
NetCDF = "30363a11-5582-574a-97bb-aa9a979735b9"
NetCDF_jll = "7243133f-43d8-5620-bbf4-c2c921802cf3"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
PlotUtils = "995b91a9-d308-5afd-9ec6-746e21dbc043"
PlotlyBase = "a03496cd-edff-5a9b-9e67-9cda94a718b5"
PlotlyJS = "f0f68f2c-4968-5e81-91da-67840de0976a"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Expand All @@ -76,26 +43,25 @@ UnitfulEquivalences = "da9c4bc3-91c8-4f02-8a40-6b990d2a7e0c"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"

[compat]
CUDA = "3, 4"
CUDAKernels = "0.2, 0.3, 0.4"
CUDA = "4, 5"
DataInterpolations = "3.6,4"
DiffResults = "1.0"
Distributions = "0.23, 0.24, 0.25"
DocStringExtensions = "0.8,0.9"
FastGaussQuadrature = "0.4,0.5"
ForwardDiff = "0.10"
Interpolations = "0.12, 0.13, 0.14"
Interpolations = "0.14.0"
JLD2 = "0.1, 0.2, 0.3, 0.4"
JSON = "0.21"
KernelAbstractions = "0.8,0.9"
NNlib = "0.8"
NNlib = "0.8,0.9"
NetCDF = "0.10, 0.11"
Parameters = "0.12"
Polynomials = "2,3"
ProgressMeter = "1.3"
SpecialFunctions = "2"
StaticArrays = "1.2"
StatsBase = "0.33"
StatsBase = "0.34.3"
TimerOutputs = "0.5"
YAML = "0.4"
julia = "1.7,1.8,1.9"
Expand Down
2 changes: 1 addition & 1 deletion src/Absorption/Absorption.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ using Interpolations # For interpolating in lookup tables and interpo
using JLD2 # For saving and loading the interpolator
using ProgressMeter # For showing progress, especially in creating interpolator
using KernelAbstractions # For heterogeneous (GPU+CPU) programming
using CUDAKernels # Access to CUDADevice
using CUDA.CUDAKernels # Access to CUDADevice
using CUDA # For GPU programming
using ForwardDiff, DiffResults # For auto-differentiation
using NetCDF # For loading NetCDF files with constants
Expand Down
2 changes: 1 addition & 1 deletion src/Absorption/compute_absorption_cross_section.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ function compute_absorption_cross_section(
# Run the event on the kernel
# That is, this function adds the contribution from this transition to each element in result
event = kernel!(result_view, array_type(architecture)(grid_view), ν, γ_d, γ_l, y, S, broadening, CEF, ndrange=length(grid_view))
wait(device, event)
##wait(device, event)
synchronize_if_gpu()
end
end
Expand Down
17 changes: 8 additions & 9 deletions src/Architectures.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ export
using CUDA

using KernelAbstractions
using CUDAKernels
using CUDA.CUDAKernels

"""
AbstractArchitecture
Expand Down Expand Up @@ -42,18 +42,17 @@ macro hascuda(expr)
end

devi(::CPU) = KernelAbstractions.CPU()
devi(::GPU) = CUDAKernels.CUDADevice()
devi(::GPU) = CUDA.CUDABackend(; always_inline=true)

architecture(::Array) = CPU()
#@hascuda
architecture(::CuArray) = GPU()
architecture(::Array) = CPU()
@hascuda architecture(::CuArray) = GPU()

array_type(::CPU) = Array
#@hascuda
array_type(::GPU) = CuArray
array_type(::CPU) = Array
@hascuda array_type(::GPU) = CuArray

default_architecture = has_cuda() ? GPU() : CPU()

synchronize_if_gpu() = has_cuda() ? synchronize() : nothing
synchronize_if_gpu() = has_cuda() ? CUDA.synchronize() : nothing


end
8 changes: 4 additions & 4 deletions src/CoreRT/CoreKernel/doubling.jl
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ function apply_D_matrix!(n_stokes::Int, r⁻⁺::AbstractArray{FT,3}, t⁺⁺::A
device = devi(architecture(r⁻⁺))
applyD_kernel! = apply_D!(device)
event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
wait(device, event);
##wait(device, event);
synchronize_if_gpu();
return nothing
end
Expand All @@ -144,7 +144,7 @@ end
device = devi(Architectures.CPU())
applyD_kernel! = apply_D!(device)
event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
wait(device, event);
#wait(device, event);
return nothing
end
end=#
Expand All @@ -154,7 +154,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::AbstractArray{FT,3}) where
device = devi(architecture(J₀⁻))
applyD_kernel! = apply_D_SFI!(device)
event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
wait(device, event);
##wait(device, event);
synchronize_if_gpu();
nothing
end
Expand All @@ -167,7 +167,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::Array{FT,3}) where {FT}
device = devi(architecture(J₀⁻))
applyD_kernel! = apply_D_SFI!(device)
event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
wait(device, event);
#wait(device, event);
return nothing
end=#
12 changes: 6 additions & 6 deletions src/CoreRT/CoreKernel/doubling_inelastic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ end
# device = devi(architecture(r⁻⁺))
# applyD_kernel! = apply_D!(device)
# event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺)); #Suniti: is it possible to use the same kernel for the 3D elastic and 4D inelastic terms or do we need to call two different kernels separately?
# wait(device, event);
# #wait(device, event);
# synchronize_if_gpu();
# return nothing
# end
Expand All @@ -427,7 +427,7 @@ function apply_D_matrix_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_stokes
applyD_kernel_IE! = apply_D_IE_VS!(device)
event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀_all), n_stokes,
ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺,(RS_type.i_λ₁λ₀_all)));
wait(device, event);
##wait(device, event);
synchronize();
return nothing
end
Expand All @@ -444,7 +444,7 @@ function apply_D_matrix_IE!(RS_type::RRS, n_stokes::Int, ier⁻⁺::AbstractArra
applyD_kernel_IE! = apply_D_IE_RRS!(device)
event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀), n_stokes,
ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺));
wait(device, event);
##wait(device, event);
synchronize();
return nothing
end
Expand All @@ -456,7 +456,7 @@ end
# device = devi(architecture(J₀⁻)) #Suniti: how to do this so that ieJ₀⁻ can also be included?
# applyD_kernel! = apply_D_SFI!(device)
# event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
# wait(device, event);
# #wait(device, event);
# synchronize();
#
# return nothing
Expand All @@ -470,7 +470,7 @@ function apply_D_matrix_SFI_IE!(RS_type::RRS, n_stokes::Int, ieJ₀⁻::Abstract
applyD_kernel_IE! = apply_D_SFI_IE_RRS!(device)
event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀),n_stokes,
ieJ₀⁻, ndrange=(size(ieJ₀⁻,1), size(ieJ₀⁻,3), size(ieJ₀⁻,4)));
wait(device, event);
##wait(device, event);
synchronize_if_gpu()
return nothing
end
Expand All @@ -490,7 +490,7 @@ function apply_D_matrix_SFI_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_st
ieJ₀⁻,
aType(RS_type.i_λ₁λ₀_all)));
#@show "here 3"
wait(device, event);
##wait(device, event);
#@show "here 4"
synchronize_if_gpu()
return nothing
Expand Down
12 changes: 6 additions & 6 deletions src/CoreRT/CoreKernel/elemental.jl
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,13 @@ function elemental!(pol_type, SFI::Bool,
kernel! = get_elem_rt!(device)
event = kernel!(r⁻⁺, t⁺⁺, ϖ_λ, dτ_λ, Z⁻⁺, Z⁺⁺, F₀,
qp_μN, wct2, ndrange=size(r⁻⁺));
wait(device, event)
#wait(device, event)
synchronize_if_gpu()

if SFI
kernel! = get_elem_rt_SFI!(device)
event = kernel!(J₀⁺, J₀⁻, ϖ_λ, dτ_λ, τ_sum, Z⁻⁺, Z⁺⁺, F₀, qp_μN, ndoubl, wct02, pol_type.n, arr_type(pol_type.I₀), iμ₀, D, ndrange=size(J₀⁺))
wait(device, event)
#wait(device, event)
synchronize_if_gpu()
end
end
Expand Down Expand Up @@ -153,15 +153,15 @@ function elemental!(pol_type, SFI::Bool,
#@show "Start event", typeof(wct2)
event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺));
#@show "Stop event"
wait(device, event)
#wait(device, event)
synchronize_if_gpu()

if SFI
kernel! = get_elem_rt_SFI!(device)
#@show size(F₀)
event = kernel!(J₀⁺, J₀⁻, ϖ, dτ, arr_type(τ_sum), Z⁻⁺, Z⁺⁺,
arr_type(F₀), qp_μN, ndoubl, wct02, pol_type.n, I₀, iμ₀, D, ndrange=size(J₀⁺))
wait(device, event)
#wait(device, event)
end
#ii = pol_type.n*(iμ0-1)+1
#@show 'B',iμ0, r⁻⁺[1,ii,1]/(J₀⁻[1,1,1]*wt_μ[iμ0]), r⁻⁺[1,ii,1], J₀⁻[1,1,1]*wt_μ[iμ0], J₀⁺[1,1,1]*wt_μ[iμ0]
Expand Down Expand Up @@ -317,7 +317,7 @@ function apply_D_matrix_elemental!(ndoubl::Int, n_stokes::Int, r⁻⁺::Abstract
device = devi(architecture(r⁻⁺))
applyD_kernel! = apply_D_elemental!(device)
event = applyD_kernel!(ndoubl,n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
return nothing
end
Expand All @@ -329,7 +329,7 @@ function apply_D_matrix_elemental_SFI!(ndoubl::Int, n_stokes::Int, J₀⁻::Abst
device = devi(architecture(J₀⁻))
applyD_kernel! = apply_D_elemental_SFI!(device)
event = applyD_kernel!(ndoubl,n_stokes, J₀⁻, ndrange=size(J₀⁻));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
return nothing
end
Expand Down
16 changes: 8 additions & 8 deletions src/CoreRT/CoreKernel/elemental_inelastic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ function get_elem_rt!(RS_type::RRS,
aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀),
qp_μN, wct2,
ndrange=getKernelDim(RS_type,ier⁻⁺));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
#for j=1:1:length(qp_μN)
# @show minimum(iet⁺⁺[1:3:end,j,200,50]), minimum(ier⁻⁺[1:3:end,j,200,50])
Expand Down Expand Up @@ -254,7 +254,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1, VS_1to0},
aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀),
qp_μN, wct2,
ndrange=getKernelDim(RS_type,ier⁻⁺));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
end

Expand Down Expand Up @@ -342,7 +342,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1, VS_1to0},
qp_μN, ndoubl, wct02, nStokes,
I₀, iμ0, D,
ndrange=getKernelDimSFI(RS_type,ieJ₀⁻));
wait(device, event)
#wait(device, event)
synchronize_if_gpu();
end

Expand Down Expand Up @@ -441,7 +441,7 @@ function get_elem_rt_SFI!(RS_type::RRS,
I₀, iμ0, D,
ndrange=getKernelDimSFI(RS_type,ieJ₀⁻));

wait(device, event)
#wait(device, event)
synchronize_if_gpu();
#@show minimum(ieJ₀⁺[1:3:end,1,200,50]), minimum(ieJ₀⁻[1:3:end,1,200,50])
#@show maximum(ieJ₀⁺[1:3:end,1,200,50]), maximum(ieJ₀⁻[1:3:end,1,200,50])
Expand Down Expand Up @@ -661,7 +661,7 @@ function apply_D_matrix_elemental!(RS_type::Union{RRS, RRS_plus}, ndoubl::Int, n
n_stokes,
ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻,
ndrange=size(ier⁻⁺));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
return nothing
end
Expand All @@ -679,7 +679,7 @@ function apply_D_matrix_elemental!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
n_stokes, RS_type.i_λ₁λ₀_all,
ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻,
ndrange=getKernelDim(RS_type,ier⁻⁺,RS_type.i_λ₁λ₀_all));
wait(device, event);
#wait(device, event);
synchronize_if_gpu();
return nothing
end
Expand All @@ -699,7 +699,7 @@ function apply_D_matrix_elemental_SFI!(RS_type::Union{RRS, RRS_plus},
ieJ₀⁻,
ndrange=size(ieJ₀⁻));
#@show "here 1.4"
wait(device, event);
#wait(device, event);
synchronize();
return nothing
end
Expand All @@ -722,7 +722,7 @@ function apply_D_matrix_elemental_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus
ndrange = getKernelDimSFI(RS_type,ieJ₀⁻,RS_type.i_λ₁λ₀_all));
#ndrange=size(ieJ₀⁻));
#@show "here 1.4"
wait(device, event);
#wait(device, event);
synchronize();
return nothing
end
Expand Down
Loading

0 comments on commit dc0bf7c

Please sign in to comment.