From dc0bf7c41047ddd3169b8e5e2d6d0b4b15f66982 Mon Sep 17 00:00:00 2001
From: sunitisanghavi <suniti.sanghavi@gmail.com>
Date: Sun, 1 Dec 2024 14:14:19 -0800
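Subject: [PATCH] Update vSmartMOM for CUDA.jl's built-in KernelAbstractions backend

Switch from the standalone CUDAKernels package to CUDA.CUDAKernels /
CUDA.CUDABackend, comment out the event-based wait(device, event) calls
after kernel launches, drop unused dependencies from Project.toml, update
[compat] bounds, use LinearInterpolation in model_from_parameters.jl, and
change the vza values in the O2 SIF grid test parameters.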

---
 Project.toml                                  | 42 ++-----------------
 src/Absorption/Absorption.jl                  |  2 +-
 .../compute_absorption_cross_section.jl       |  2 +-
 src/Architectures.jl                          | 17 ++++----
 src/CoreRT/CoreKernel/doubling.jl             |  8 ++--
 src/CoreRT/CoreKernel/doubling_inelastic.jl   | 12 +++---
 src/CoreRT/CoreKernel/elemental.jl            | 12 +++---
 src/CoreRT/CoreKernel/elemental_inelastic.jl  | 16 +++----
 .../CoreKernel/elemental_inelastic_plus.jl    | 20 ++++-----
 src/CoreRT/CoreKernel/interaction_ss.jl       |  6 +--
 src/CoreRT/CoreKernel/raman_kernel_test.jl    |  4 +-
 src/CoreRT/CoreRT.jl                          |  2 +-
 src/CoreRT/gpu_batched.jl                     |  2 +-
 src/CoreRT/model_from_parameters.jl           | 12 +++---
 test/gpu_tests/elemental_test.jl              |  4 +-
 .../O2_parameters2_SIF_grid.yaml              |  2 +-
 16 files changed, 64 insertions(+), 99 deletions(-)

diff --git a/Project.toml b/Project.toml
index 907e1c19..1c3e969b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,67 +4,34 @@ authors = ["Rupesh Jeyaram <rjeyaram@caltech.edu> and contributors"]
 version = "0.5.0"
 
 [deps]
-AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc"
-AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95"
-ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
 Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
-BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
-CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 CanopyOptics = "a18e34a6-5dbe-4f38-a44b-e5141852e7a7"
-ClimaParams = "5c42b081-d73a-476f-9059-fd94b934656c"
-ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
-Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
-DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0"
-Decimals = "abce61dc-4473-55a0-ba07-351d65e31d42"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
-Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838"
-Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
-GR = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71"
-GeoMakie = "db073c08-6b98-4ee5-b6a4-5efafb3259c6"
-GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
-Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
-GridLayoutBase = "3955a311-db13-416c-9275-1d80ed98e5e9"
-HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
-HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 ImageFiltering = "6a3955dd-da59-5b1f-98d4-e7296123deb5"
-Insolation = "e98cc03f-d57e-4e3c-b70c-8d51efe9e0d8"
 InstrumentOperator = "9e589c1b-9e01-4e00-831a-aa39ce86e3ef"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
-JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
-LegendrePolynomials = "3db4a2ba-fc88-11e8-3e01-49c72059a882"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
 NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 NaNMath = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
-NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
 NetCDF = "30363a11-5582-574a-97bb-aa9a979735b9"
 NetCDF_jll = "7243133f-43d8-5620-bbf4-c2c921802cf3"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-PlotUtils = "995b91a9-d308-5afd-9ec6-746e21dbc043"
-PlotlyBase = "a03496cd-edff-5a9b-9e67-9cda94a718b5"
-PlotlyJS = "f0f68f2c-4968-5e81-91da-67840de0976a"
-Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
-PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
-PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee"
 Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
@@ -76,26 +43,25 @@ UnitfulEquivalences = "da9c4bc3-91c8-4f02-8a40-6b990d2a7e0c"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
 
 [compat]
-CUDA = "3,  4"
-CUDAKernels = "0.2, 0.3, 0.4"
+CUDA = "4.1, 5"
 DataInterpolations = "3.6,4"
 DiffResults = "1.0"
 Distributions = "0.23, 0.24, 0.25"
 DocStringExtensions = "0.8,0.9"
 FastGaussQuadrature = "0.4,0.5"
 ForwardDiff = "0.10"
-Interpolations = "0.12, 0.13, 0.14"
+Interpolations = "0.14.0"
 JLD2 = "0.1, 0.2, 0.3, 0.4"
 JSON = "0.21"
 KernelAbstractions = "0.8,0.9"
-NNlib = "0.8"
+NNlib = "0.8,0.9"
 NetCDF = "0.10, 0.11"
 Parameters = "0.12"
 Polynomials = "2,3"
 ProgressMeter = "1.3"
 SpecialFunctions = "2"
 StaticArrays = "1.2"
-StatsBase = "0.33"
+StatsBase = "0.34.3"
 TimerOutputs = "0.5"
 YAML = "0.4"
 julia = "1.7,1.8,1.9"
diff --git a/src/Absorption/Absorption.jl b/src/Absorption/Absorption.jl
index e7642941..512cff29 100644
--- a/src/Absorption/Absorption.jl
+++ b/src/Absorption/Absorption.jl
@@ -15,7 +15,7 @@ using Interpolations            # For interpolating in lookup tables and interpo
 using JLD2                      # For saving and loading the interpolator
 using ProgressMeter             # For showing progress, especially in creating interpolator
 using KernelAbstractions        # For heterogeneous (GPU+CPU) programming
-using CUDAKernels               # Access to CUDADevice
+using CUDA.CUDAKernels          # Access to CUDABackend (CUDA backend for KernelAbstractions)
 using CUDA                      # For GPU programming
 using ForwardDiff, DiffResults  # For auto-differentiation
 using NetCDF                    # For loading NetCDF files with constants
diff --git a/src/Absorption/compute_absorption_cross_section.jl b/src/Absorption/compute_absorption_cross_section.jl
index c9c5b9a8..a5ea2518 100644
--- a/src/Absorption/compute_absorption_cross_section.jl
+++ b/src/Absorption/compute_absorption_cross_section.jl
@@ -120,7 +120,7 @@ function compute_absorption_cross_section(
             # Run the event on the kernel 
             # That this, this function adds to each element in result, the contribution from this transition
             event = kernel!(result_view, array_type(architecture)(grid_view), ν, γ_d, γ_l, y, S, broadening, CEF, ndrange=length(grid_view))
-            wait(device, event)
+            ##wait(device, event)
             synchronize_if_gpu()
         end
     end
diff --git a/src/Architectures.jl b/src/Architectures.jl
index 57da898c..6f1f6b3e 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -12,7 +12,7 @@ export
 using CUDA
 
 using KernelAbstractions
-using CUDAKernels
+using CUDA.CUDAKernels
 
 """
     AbstractArchitecture
@@ -42,18 +42,17 @@ macro hascuda(expr)
 end
 
 devi(::CPU) = KernelAbstractions.CPU()
-devi(::GPU) = CUDAKernels.CUDADevice()
+devi(::GPU) = CUDA.CUDABackend(; always_inline=true)
 
-         architecture(::Array)   = CPU()
-#@hascuda 
-        architecture(::CuArray) = GPU()
+architecture(::Array)   = CPU()
+@hascuda architecture(::CuArray) = GPU()
 
-         array_type(::CPU) = Array
-#@hascuda 
-        array_type(::GPU) = CuArray
+array_type(::CPU) = Array
+@hascuda array_type(::GPU) = CuArray
 
 default_architecture = has_cuda() ? GPU() : CPU()
 
-synchronize_if_gpu() = has_cuda() ? synchronize() : nothing
+synchronize_if_gpu() = has_cuda() ? CUDA.synchronize() : nothing
+
 
 end
\ No newline at end of file
diff --git a/src/CoreRT/CoreKernel/doubling.jl b/src/CoreRT/CoreKernel/doubling.jl
index 54944ef6..7974bf54 100644
--- a/src/CoreRT/CoreKernel/doubling.jl
+++ b/src/CoreRT/CoreKernel/doubling.jl
@@ -128,7 +128,7 @@ function apply_D_matrix!(n_stokes::Int, r⁻⁺::AbstractArray{FT,3}, t⁺⁺::A
         device = devi(architecture(r⁻⁺))
         applyD_kernel! = apply_D!(device)
         event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
-        wait(device, event);
+        ##wait(device, event);
         synchronize_if_gpu();
         return nothing
     end
@@ -144,7 +144,7 @@ end
         device = devi(Architectures.CPU())
         applyD_kernel! = apply_D!(device)
         event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
-        wait(device, event);
+        #wait(device, event);
         return nothing
     end
 end=#
@@ -154,7 +154,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::AbstractArray{FT,3}) where
     device = devi(architecture(J₀⁻))
     applyD_kernel! = apply_D_SFI!(device)
     event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
-    wait(device, event);
+    ##wait(device, event);
     synchronize_if_gpu();
     nothing
 end
@@ -167,7 +167,7 @@ function apply_D_matrix_SFI!(n_stokes::Int, J₀⁻::Array{FT,3}) where {FT}
     device = devi(architecture(J₀⁻))
     applyD_kernel! = apply_D_SFI!(device)
     event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
-    wait(device, event);
+    #wait(device, event);
     
     return nothing
 end=#
\ No newline at end of file
diff --git a/src/CoreRT/CoreKernel/doubling_inelastic.jl b/src/CoreRT/CoreKernel/doubling_inelastic.jl
index 436fd896..134b733c 100644
--- a/src/CoreRT/CoreKernel/doubling_inelastic.jl
+++ b/src/CoreRT/CoreKernel/doubling_inelastic.jl
@@ -410,7 +410,7 @@ end
 #        device = devi(architecture(r⁻⁺))
 #        applyD_kernel! = apply_D!(device)
 #        event = applyD_kernel!(n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺)); #Suniti: is it possible to  use the same kernel for the 3D elastic and 4D inelastic terms or do we need to call two different kernels separately? 
-#        wait(device, event);
+#        #wait(device, event);
 #        synchronize_if_gpu();
 #        return nothing
 #    end
@@ -427,7 +427,7 @@ function apply_D_matrix_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_stokes
         applyD_kernel_IE! = apply_D_IE_VS!(device)
         event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀_all), n_stokes, 
             ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺,(RS_type.i_λ₁λ₀_all)));
-        wait(device, event);
+        ##wait(device, event);
         synchronize();
         return nothing
     end
@@ -444,7 +444,7 @@ function apply_D_matrix_IE!(RS_type::RRS, n_stokes::Int, ier⁻⁺::AbstractArra
         applyD_kernel_IE! = apply_D_IE_RRS!(device)
         event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀), n_stokes, 
             ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=getKernelDim(RS_type, ier⁻⁺));
-        wait(device, event);
+        ##wait(device, event);
         synchronize();
         return nothing
     end
@@ -456,7 +456,7 @@ end
 #    device = devi(architecture(J₀⁻)) #Suniti: how to do this so that ieJ₀⁻ can also be included?
 #    applyD_kernel! = apply_D_SFI!(device)
 #    event = applyD_kernel!(n_stokes, J₀⁻, ndrange=size(J₀⁻));
-#    wait(device, event);
+#    #wait(device, event);
 #    synchronize();
 #    
 #    return nothing
@@ -470,7 +470,7 @@ function apply_D_matrix_SFI_IE!(RS_type::RRS, n_stokes::Int, ieJ₀⁻::Abstract
     applyD_kernel_IE! = apply_D_SFI_IE_RRS!(device)
     event = applyD_kernel_IE!(aType(RS_type.i_λ₁λ₀),n_stokes, 
                     ieJ₀⁻, ndrange=(size(ieJ₀⁻,1), size(ieJ₀⁻,3), size(ieJ₀⁻,4)));
-    wait(device, event);
+    ##wait(device, event);
     synchronize_if_gpu()
     return nothing
 end
@@ -490,7 +490,7 @@ function apply_D_matrix_SFI_IE!(RS_type::Union{VS_0to1_plus, VS_1to0_plus}, n_st
                             ieJ₀⁻, 
                             aType(RS_type.i_λ₁λ₀_all)));
     #@show "here 3"
-    wait(device, event);
+    ##wait(device, event);
     #@show "here 4"
     synchronize_if_gpu()
     return nothing
diff --git a/src/CoreRT/CoreKernel/elemental.jl b/src/CoreRT/CoreKernel/elemental.jl
index 2a44b742..2c9d6137 100644
--- a/src/CoreRT/CoreKernel/elemental.jl
+++ b/src/CoreRT/CoreKernel/elemental.jl
@@ -81,13 +81,13 @@ function elemental!(pol_type, SFI::Bool,
             kernel! = get_elem_rt!(device)
             event = kernel!(r⁻⁺, t⁺⁺, ϖ_λ, dτ_λ, Z⁻⁺, Z⁺⁺, F₀,
                 qp_μN, wct2, ndrange=size(r⁻⁺)); 
-            wait(device, event)
+            #wait(device, event)
             synchronize_if_gpu()
 
             if SFI
                 kernel! = get_elem_rt_SFI!(device)
                 event = kernel!(J₀⁺, J₀⁻, ϖ_λ, dτ_λ, τ_sum, Z⁻⁺, Z⁺⁺, F₀, qp_μN, ndoubl, wct02, pol_type.n, arr_type(pol_type.I₀), iμ₀, D, ndrange=size(J₀⁺))
-                wait(device, event)
+                #wait(device, event)
                 synchronize_if_gpu()
             end
         end
@@ -153,7 +153,7 @@ function elemental!(pol_type, SFI::Bool,
         #@show "Start event",   typeof(wct2)
         event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, qp_μN, wct2, ndrange=size(r⁻⁺)); 
         #@show "Stop event"
-        wait(device, event)
+        #wait(device, event)
         synchronize_if_gpu()
 
         if SFI
@@ -161,7 +161,7 @@ function elemental!(pol_type, SFI::Bool,
             #@show size(F₀)
             event = kernel!(J₀⁺, J₀⁻, ϖ, dτ, arr_type(τ_sum), Z⁻⁺, Z⁺⁺, 
             arr_type(F₀), qp_μN, ndoubl, wct02, pol_type.n, I₀, iμ₀, D, ndrange=size(J₀⁺))
-            wait(device, event)
+            #wait(device, event)
         end
         #ii = pol_type.n*(iμ0-1)+1
         #@show 'B',iμ0,  r⁻⁺[1,ii,1]/(J₀⁻[1,1,1]*wt_μ[iμ0]), r⁻⁺[1,ii,1], J₀⁻[1,1,1]*wt_μ[iμ0], J₀⁺[1,1,1]*wt_μ[iμ0]
@@ -317,7 +317,7 @@ function apply_D_matrix_elemental!(ndoubl::Int, n_stokes::Int, r⁻⁺::Abstract
     device = devi(architecture(r⁻⁺))
     applyD_kernel! = apply_D_elemental!(device)
     event = applyD_kernel!(ndoubl,n_stokes, r⁻⁺, t⁺⁺, r⁺⁻, t⁻⁻, ndrange=size(r⁻⁺));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     return nothing
 end
@@ -329,7 +329,7 @@ function apply_D_matrix_elemental_SFI!(ndoubl::Int, n_stokes::Int, J₀⁻::Abst
         device = devi(architecture(J₀⁻))
         applyD_kernel! = apply_D_elemental_SFI!(device)
         event = applyD_kernel!(ndoubl,n_stokes, J₀⁻, ndrange=size(J₀⁻));
-        wait(device, event);
+        #wait(device, event);
         synchronize_if_gpu();
         return nothing
     end
diff --git a/src/CoreRT/CoreKernel/elemental_inelastic.jl b/src/CoreRT/CoreKernel/elemental_inelastic.jl
index ee10327e..d3e401d1 100644
--- a/src/CoreRT/CoreKernel/elemental_inelastic.jl
+++ b/src/CoreRT/CoreKernel/elemental_inelastic.jl
@@ -224,7 +224,7 @@ function get_elem_rt!(RS_type::RRS,
                     aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀), 
                     qp_μN, wct2, 
                     ndrange=getKernelDim(RS_type,ier⁻⁺)); 
-        wait(device, event);
+        #wait(device, event);
         synchronize_if_gpu();
         #for j=1:1:length(qp_μN)
         #    @show minimum(iet⁺⁺[1:3:end,j,200,50]), minimum(ier⁻⁺[1:3:end,j,200,50]) 
@@ -254,7 +254,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1, VS_1to0},
         aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀), 
         qp_μN, wct2, 
         ndrange=getKernelDim(RS_type,ier⁻⁺)); 
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
 end
 
@@ -342,7 +342,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1, VS_1to0},
     qp_μN, ndoubl, wct02, nStokes, 
     I₀, iμ0, D, 
     ndrange=getKernelDimSFI(RS_type,ieJ₀⁻));
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
 end
 
@@ -441,7 +441,7 @@ function get_elem_rt_SFI!(RS_type::RRS,
                 I₀, iμ0, D, 
                 ndrange=getKernelDimSFI(RS_type,ieJ₀⁻));
     
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
     #@show minimum(ieJ₀⁺[1:3:end,1,200,50]), minimum(ieJ₀⁻[1:3:end,1,200,50]) 
     #@show maximum(ieJ₀⁺[1:3:end,1,200,50]), maximum(ieJ₀⁻[1:3:end,1,200,50]) 
@@ -661,7 +661,7 @@ function apply_D_matrix_elemental!(RS_type::Union{RRS, RRS_plus}, ndoubl::Int, n
         n_stokes, 
         ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, 
         ndrange=size(ier⁻⁺));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     return nothing
 end
@@ -679,7 +679,7 @@ function apply_D_matrix_elemental!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
                     n_stokes, RS_type.i_λ₁λ₀_all,
                     ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, 
                     ndrange=getKernelDim(RS_type,ier⁻⁺,RS_type.i_λ₁λ₀_all));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     return nothing
 end
@@ -699,7 +699,7 @@ function apply_D_matrix_elemental_SFI!(RS_type::Union{RRS, RRS_plus},
                                 ieJ₀⁻, 
                                 ndrange=size(ieJ₀⁻));
         #@show "here 1.4"
-        wait(device, event);
+        #wait(device, event);
         synchronize();
         return nothing
     end
@@ -722,7 +722,7 @@ function apply_D_matrix_elemental_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus
                             ndrange = getKernelDimSFI(RS_type,ieJ₀⁻,RS_type.i_λ₁λ₀_all));
                             #ndrange=size(ieJ₀⁻));
         #@show "here 1.4"
-        wait(device, event);
+        #wait(device, event);
         synchronize();
         return nothing
     end
diff --git a/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl b/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl
index 701f4edb..22f4dc0b 100644
--- a/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl
+++ b/src/CoreRT/CoreKernel/elemental_inelastic_plus.jl
@@ -192,7 +192,7 @@ function get_elem_rt!(RS_type::RRS_plus,
                     aType(Z⁺⁺_λ₁λ₀[:,:,bandSpecLim[iB]]), 
                     qp_μN, wct2, 
                     ndrange=getKernelDim(RS_type,ier⁻⁺[:,:,RS_type.bandSpecLim[iB],:])); 
-        wait(device, event);
+        #wait(device, event);
         synchronize_if_gpu();
     end
 end
@@ -221,7 +221,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         aType(Z⁻⁺_λ₁λ₀), aType(Z⁺⁺_λ₁λ₀), 
         qp_μN, wct2, 
         ndrange=getKernelDim(RS_type,ier⁻⁺,i_λ₁λ₀)); 
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     #@show size(i_λ₁λ₀), size(i_λ₁λ₀_VS_n2), size(i_λ₁λ₀_VS_o2)
     #@show "RVS", t_ier⁻⁺[1,1,i_λ₁λ₀[findall(i_λ₁λ₀.>0)]]
@@ -241,7 +241,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         aType(Z⁻⁺_λ₁λ₀_VS_n2), aType(Z⁺⁺_λ₁λ₀_VS_n2), 
         qp_μN, wct2, 
         ndrange=getKernelDim(RS_type,ier⁻⁺,i_λ₁λ₀_VS_n2)); 
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     #@show "VS N2", t_ier⁻⁺[1,1,i_λ₁λ₀_VS_n2[findall(i_λ₁λ₀_VS_n2.>0)]]
     ier⁻⁺ .+= t_ier⁻⁺
@@ -261,7 +261,7 @@ function get_elem_rt!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         aType(Z⁻⁺_λ₁λ₀_VS_o2), aType(Z⁺⁺_λ₁λ₀_VS_o2),
         qp_μN, wct2, 
         ndrange=getKernelDim(RS_type,ier⁻⁺, i_λ₁λ₀_VS_o2)); 
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     #@show "VS O2", t_ier⁻⁺[1,1,i_λ₁λ₀_VS_o2[findall(i_λ₁λ₀_VS_o2.>0)]]
     ier⁻⁺ .+= t_ier⁻⁺
@@ -386,7 +386,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         qp_μN, ndoubl, wct02, nStokes, 
         I₀, iμ0, D, 
         ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀)); #change this
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
 
     t_ieJ₀⁺ = similar(ieJ₀⁻)
@@ -403,7 +403,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         qp_μN, ndoubl, wct02, nStokes, 
         I₀, iμ0, D, 
         ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀_VS_n2)); #change this
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
     
     ieJ₀⁺ .+= t_ieJ₀⁺
@@ -419,7 +419,7 @@ function get_elem_rt_SFI!(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
         qp_μN, ndoubl, wct02, nStokes, 
         I₀, iμ0, D, 
         ndrange=getKernelDimSFI(RS_type, ieJ₀⁻, i_λ₁λ₀_VS_o2)); #change this
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
         
     ieJ₀⁺ .+= t_ieJ₀⁺
@@ -526,7 +526,7 @@ function get_elem_rt_SFI!(RS_type::RRS_plus,
                 qp_μN, ndoubl, wct02, nStokes, 
                 I₀, iμ0, D, 
                 ndrange=getKernelDimSFI(RS_type,ieJ₀⁻));
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu();
 end
 
@@ -673,7 +673,7 @@ function apply_D_matrix_elemental!(RS_type::RRS_plus, ndoubl::Int, n_stokes::Int
     device = devi(architecture(ier⁻⁺))
     applyD_kernel! = apply_D_elemental_RRS!(device)
     event = applyD_kernel!(ndoubl,n_stokes, ier⁻⁺, iet⁺⁺, ier⁺⁻, iet⁻⁻, ndrange=size(ier⁻⁺));
-    wait(device, event);
+    #wait(device, event);
     synchronize_if_gpu();
     return nothing
 end
@@ -686,7 +686,7 @@ function apply_D_matrix_elemental_SFI!(RS_type::Union{RRS_plus, VS_0to1_plus, VS
         device = devi(architecture(ieJ₀⁻))
         applyD_kernel! = apply_D_elemental_SFI!(device)
         event = applyD_kernel!(RS_type,ndoubl,n_stokes, ieJ₀⁻, ndrange=size(ieJ₀⁻));
-        wait(device, event);
+        #wait(device, event);
         synchronize();
         return nothing
     end
diff --git a/src/CoreRT/CoreKernel/interaction_ss.jl b/src/CoreRT/CoreKernel/interaction_ss.jl
index ef1d828c..3fcc34a3 100644
--- a/src/CoreRT/CoreKernel/interaction_ss.jl
+++ b/src/CoreRT/CoreKernel/interaction_ss.jl
@@ -36,7 +36,7 @@ function interaction_ss!(SFI::Bool,
                     arr_type(added_layer.J₀⁺), 
                     arr_type(added_layer.J₀⁻),
                     J₀⁺, J₀⁻, ndrange=size(J₀⁻))
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu()
 end
 
@@ -68,7 +68,7 @@ function interaction_inelastic_ss!(RS_type::RRS,
                 atype(added_layer.ieJ₀⁺), atype(added_layer.ieJ₀⁻),
                 ieJ₀⁺, ieJ₀⁻,
                 ndrange=getKernelDimSFI(RS_type, ieJ₀⁻))
-    wait(device, event)
+    #wait(device, event)
     synchronize_if_gpu()
 end
 
@@ -112,7 +112,7 @@ event = kernel!(τ_sum, τ_λ, qp_μN, atype(i_λ₁λ₀_all),
             atype(added_layer.ieJ₀⁺), atype(added_layer.ieJ₀⁻),
             ieJ₀⁺, ieJ₀⁻,
             ndrange = getKernelDimSFI(RS_type,ieJ₀⁻,RS_type.i_λ₁λ₀_all))
-wait(device, event)
+#wait(device, event)
 synchronize_if_gpu()
 end
 
diff --git a/src/CoreRT/CoreKernel/raman_kernel_test.jl b/src/CoreRT/CoreKernel/raman_kernel_test.jl
index 4fe34b8a..11992f72 100644
--- a/src/CoreRT/CoreKernel/raman_kernel_test.jl
+++ b/src/CoreRT/CoreKernel/raman_kernel_test.jl
@@ -110,7 +110,7 @@ base_iet⁺⁺ = deepcopy(iet⁺⁺);
 device = CPU()
 kernel! = get_elem_rt!(device)
 event = kernel!(ier⁻⁺, iet⁺⁺, ϖ_λ, ϖ_λ₀λ₁,dτ₀, dτ₁, dτ_λ, Z⁻⁺_λ₀λ₁, Z⁺⁺_λ₀λ₁, qp_μN, wct2, ndrange=size(ier⁻⁺)); 
-wait(device, event)
+#wait(device, event)
 
 base_ier⁻⁺ ≈ ier⁻⁺
 base_iet⁺⁺ ≈ iet⁺⁺
@@ -134,7 +134,7 @@ if has_cuda()
     device = CUDAKernels.CUDADevice()
     kernel! = get_elem_rt!(device)
     event = kernel!(c_ier⁻⁺, c_iet⁺⁺, c_ϖ_λ, c_ϖ_λ₀λ₁,dτ₀, dτ₁, c_dτ_λ, c_Z⁻⁺_λ₀λ₁, c_Z⁺⁺_λ₀λ₁, c_qp_μN, c_wct2, ndrange=size(c_ier⁻⁺)); 
-    wait(device, event)
+    #wait(device, event)
 
     cuda_iet⁺⁺ = Array(c_iet⁺⁺);
     cuda_ier⁻⁺ = Array(c_ier⁻⁺);
diff --git a/src/CoreRT/CoreRT.jl b/src/CoreRT/CoreRT.jl
index d8c22a15..a089d021 100644
--- a/src/CoreRT/CoreRT.jl
+++ b/src/CoreRT/CoreRT.jl
@@ -23,7 +23,7 @@ using ...Architectures             # Use Architectures module
 using CUDA                         # GPU CuArrays and functions
 using KernelAbstractions           # Abstracting code for CPU/GPU
 using KernelAbstractions.Extras
-using CUDAKernels
+using CUDA.CUDAKernels
 
 using Unitful                      # For parsing 
 using UnitfulEquivalences          # For converting between wavenumber / wavelength
diff --git a/src/CoreRT/gpu_batched.jl b/src/CoreRT/gpu_batched.jl
index d4ab9694..62dae6f8 100644
--- a/src/CoreRT/gpu_batched.jl
+++ b/src/CoreRT/gpu_batched.jl
@@ -3,7 +3,7 @@
 This file contains implementations of batched linear algebra code
 
 =#
-
+@inline synchronize() = CUDA.has_cuda() ? CUDA.synchronize() : nothing  # CoreRT-local shim for bare `synchronize()` calls; no-op when no CUDA GPU is available
 "Given 3D CuArrays A and B, fill in X[:,:,k] = A[:,:,k] \\ B[:,:,k]" 
 function batch_solve!(X::CuArray{FT,3}, A::CuArray{FT,3}, B::CuArray{FT,3}) where {FT}
 
diff --git a/src/CoreRT/model_from_parameters.jl b/src/CoreRT/model_from_parameters.jl
index 89886836..1e6d9c8e 100644
--- a/src/CoreRT/model_from_parameters.jl
+++ b/src/CoreRT/model_from_parameters.jl
@@ -351,8 +351,8 @@ function model_from_parameters(params::vSmartMOM_Parameters)
                 #@show aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃
                 kext_grid = [aerosol_optics_raw_0.k, aerosol_optics_raw_1.k]
                 ksca_grid = [aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃] 
-                interp_linear_kext = linear_interpolation(ν_grid, kext_grid)
-                interp_linear_ksca = linear_interpolation(ν_grid, ksca_grid)
+                interp_linear_kext = LinearInterpolation(ν_grid, kext_grid)
+                interp_linear_ksca = LinearInterpolation(ν_grid, ksca_grid)
                 k = zeros(length(curr_band_λ))
                 ω̃ = zeros(length(curr_band_λ))
                 for i = 1:length(curr_band_λ)
@@ -659,8 +659,8 @@ function model_from_parameters(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
                 #@show aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃
                 kext_grid = [aerosol_optics_raw_0.k, aerosol_optics_raw_1.k]
                 ksca_grid = [aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃] 
-                interp_linear_kext = linear_interpolation(ν_grid, kext_grid)
-                interp_linear_ksca = linear_interpolation(ν_grid, ksca_grid)
+                interp_linear_kext = LinearInterpolation(ν_grid, kext_grid)
+                interp_linear_ksca = LinearInterpolation(ν_grid, ksca_grid)
                 k = zeros(length(curr_band_λ))
                 ω̃ = zeros(length(curr_band_λ))
                 for i = 1:length(curr_band_λ)
@@ -941,8 +941,8 @@ function model_from_parameters(RS_type::Union{VS_0to1_plus, VS_1to0_plus},
                 #@show aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃
                 kext_grid = [aerosol_optics_raw_0.k, aerosol_optics_raw_1.k]
                 ksca_grid = [aerosol_optics_raw_0.k*aerosol_optics_raw_0.ω̃, aerosol_optics_raw_1.k*aerosol_optics_raw_1.ω̃] 
-                interp_linear_kext = linear_interpolation(ν_grid, kext_grid)
-                interp_linear_ksca = linear_interpolation(ν_grid, ksca_grid)
+                interp_linear_kext = LinearInterpolation(ν_grid, kext_grid)
+                interp_linear_ksca = LinearInterpolation(ν_grid, ksca_grid)
                 k = zeros(length(curr_band_λ))
                 ω̃ = zeros(length(curr_band_λ))
                 for i = 1:length(curr_band_λ)
diff --git a/test/gpu_tests/elemental_test.jl b/test/gpu_tests/elemental_test.jl
index c0f5b7fa..6f9f188c 100644
--- a/test/gpu_tests/elemental_test.jl
+++ b/test/gpu_tests/elemental_test.jl
@@ -43,7 +43,7 @@ kernel!(r⁻⁺_CPU, t⁺⁺_CPU, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w,0, ndrange=si
 
 function test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
     event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w, 0, ndrange=size(r⁻⁺));
-    wait(device, event)
+    #wait(device, event)
     synchronize();
 end
 @time test2(r⁻⁺_CPU, t⁺⁺_CPU, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
@@ -59,7 +59,7 @@ kernel! = get_r!(device)
 kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w,0, ndrange=size(r⁻⁺));
 function test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
     event = kernel!(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w, 0, ndrange=size(r⁻⁺));
-    wait(device, event)
+    #wait(device, event)
     synchronize();
 end
 @time test2(r⁻⁺, t⁺⁺, ϖ, dτ, Z⁻⁺, Z⁺⁺, μ, w)
diff --git a/test/test_parameters/O2_parameters2_SIF_grid.yaml b/test/test_parameters/O2_parameters2_SIF_grid.yaml
index e2845238..7635a896 100644
--- a/test/test_parameters/O2_parameters2_SIF_grid.yaml
+++ b/test/test_parameters/O2_parameters2_SIF_grid.yaml
@@ -75,7 +75,7 @@ geometry:
   # Solar zenith angle (degrees)
   sza:                19.0 #19. #45. #60 #30. #60 #32.4436
   # Viewing zenith angles (degrees)
-  vza:                [60., 60., 60., 60., 60.] #[0, 30, 60, 30] #[15, 30, 45, 60] #[32.4436] #[0.072]
+  vza:                [75., 75., 75., 75., 75.] #[0, 30, 60, 30] #[15, 30, 45, 60] #[32.4436] #[0.072]
   # Viewing azimuth angles (degrees)
   vaz:                [0.0, 45., 90., 135., 180.] #, 0.0, 0.0, 180.] #[0.0, 0.0, 180., 180.]
   # Observation altitude (Pa)