From 60127f79644250ccfb2db5cfaca65afcc8e32ad5 Mon Sep 17 00:00:00 2001
From: Avik Pal <avikpal@mit.edu>
Date: Tue, 27 Aug 2024 12:53:53 -0400
Subject: [PATCH] fix: remove deprecated deps

---
 .buildkite/pipeline.yml                    |   2 +-
 .github/workflows/CI.yml                   |   2 -
 docs/Project.toml                          |   3 +-
 docs/src/examples/mnist_conv_neural_ode.md |  39 +++----
 docs/src/examples/mnist_neural_ode.md      | 112 +++++++++------------
 test/neural_de_tests.jl                    |   1 -
 6 files changed, 66 insertions(+), 93 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index dd8ce199a..b7fd951b7 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -1,5 +1,5 @@
 steps:
-  - label: "Julia 1"
+  - label: "Julia 1 (CUDA)"
     plugins:
       - JuliaCI/julia#v1:
           version: "1.10"
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index eb9553e34..5c60daf78 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -46,8 +46,6 @@ jobs:
           coverage: false
         env:
           GROUP: ${{ matrix.group }}
-          RETESTITEMS_NWORKERS: 0
-          RETESTITEMS_TESTITEM_TIMEOUT: 3600
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v4
         with:
diff --git a/docs/Project.toml b/docs/Project.toml
index b4a9d80a3..25450fcef 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -14,10 +14,10 @@ IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
-MLDataUtils = "cc2ba9b6-d476-5e6d-8eaf-a92d5412d41d"
 MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
+OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
@@ -52,7 +52,6 @@ IterTools = "1"
 LinearAlgebra = "1"
 Lux = "0.5.5"
 LuxCUDA = "0.3"
-MLDataUtils = "0.5"
 MLDatasets = "0.7"
 MLUtils = "0.4"
 NNlib = "0.9"
diff --git a/docs/src/examples/mnist_conv_neural_ode.md b/docs/src/examples/mnist_conv_neural_ode.md
index 023658593..740c12a37 100644
--- a/docs/src/examples/mnist_conv_neural_ode.md
+++ b/docs/src/examples/mnist_conv_neural_ode.md
@@ -8,10 +8,9 @@ using Fully Connected Layers.
 
 ```@example mnist_cnn
 using DiffEqFlux, Statistics, ComponentArrays, CUDA, Zygote, MLDatasets, OrdinaryDiffEq,
-      Printf, Test, LuxCUDA, Random
+      Printf, Test, LuxCUDA, Random, MLUtils, OneHotArrays
 using Optimization, OptimizationOptimisers
 using MLDatasets: MNIST
-using MLDataUtils: LabelEnc, convertlabel, stratifiedobs, batchview
 
 const cdev = cpu_device()
 const gdev = gpu_device()
@@ -22,26 +21,21 @@ ENV["DATADEPS_ALWAYS_ACCEPT"] = true
 logitcrossentropy(ŷ, y) = mean(-sum(y .* logsoftmax(ŷ; dims = 1); dims = 1))
 
 function loadmnist(batchsize = bs)
-    # Use MLDataUtils LabelEnc for natural onehot conversion
-    function onehot(labels_raw)
-        convertlabel(LabelEnc.OneOfK, labels_raw, LabelEnc.NativeLabels(collect(0:9)))
-    end
     # Load MNIST
-    mnist = MNIST(; split = :train)
-    imgs, labels_raw = mnist.features, mnist.targets
+    dataset = MNIST(; split = :train)
+    imgs = dataset.features
+    labels_raw = dataset.targets
+
     # Process images into (H,W,C,BS) batches
-    x_train = Float32.(reshape(imgs, size(imgs, 1), size(imgs, 2), 1, size(imgs, 3))) |>
-              gdev
-    x_train = batchview(x_train, batchsize)
-    # Onehot and batch the labels
-    y_train = onehot(labels_raw) |> gdev
-    y_train = batchview(y_train, batchsize)
-    return x_train, y_train
+    x_data = Float32.(reshape(imgs, size(imgs, 1), size(imgs, 2), 1, size(imgs, 3)))
+    y_data = onehotbatch(labels_raw, 0:9)
+
+    return DataLoader((x_data, y_data); batchsize, shuffle = true)
 end
 
 # Main
 const bs = 32
-x_train, y_train = loadmnist(bs)
+dataloader = loadmnist(bs)
 
 down = Chain(Conv((3, 3), 1 => 64, relu; stride = 1), GroupNorm(64, 64),
     Conv((4, 4), 64 => 64, relu; stride = 2, pad = 1),
@@ -56,9 +50,7 @@ fc = Chain(GroupNorm(64, 64), x -> relu.(x), MeanPool((6, 6)),
 nn_ode = NeuralODE(dudt, (0.0f0, 1.0f0), Tsit5(); save_everystep = false,
     reltol = 1e-3, abstol = 1e-3, save_start = false)
 
-function DiffEqArray_to_Array(x)
-    xarr = gdev(x.u[1])
-end
+DiffEqArray_to_Array(x) = x.u[end]
 
 # Build our over-all model topology
 m = Chain(down,                 # (28, 28, 1, BS) -> (6, 6, 64, BS)
@@ -70,8 +62,9 @@ ps = ComponentArray(ps) |> gdev
 st = st |> gdev
 
 # To understand the intermediate NN-ODE layer, we can examine it's dimensionality
-img = x_train[1][:, :, :, 1:1] |> gdev
-lab = y_train[1][:, 1:1] |> gdev
+x_train1, y_train1 = first(dataloader) |> gdev # ps/st live on gdev; inputs must too
+img = x_train1[:, :, :, 1:1] |> gdev
+lab = y_train1[:, 1:1] |> gdev
 
 x_m, _ = m(img, ps, st)
 
@@ -91,7 +84,7 @@ function accuracy(model, data, ps, st; n_batches = 10)
 end
 
 # burn in accuracy
-accuracy(m, zip(x_train, y_train), ps, st)
+accuracy(m, ((x_train1, y_train1),), ps, st)
 
 function loss_function(ps, x, y)
     pred, st_ = m(x, ps, st)
@@ -99,7 +92,7 @@ function loss_function(ps, x, y)
 end
 
 #burn in loss
-loss_function(ps, x_train[1], y_train[1])
+loss_function(ps, x_train1, y_train1)
 
 opt = OptimizationOptimisers.Adam(0.05)
 iter = 0
diff --git a/docs/src/examples/mnist_neural_ode.md b/docs/src/examples/mnist_neural_ode.md
index b120db59f..77e879579 100644
--- a/docs/src/examples/mnist_neural_ode.md
+++ b/docs/src/examples/mnist_neural_ode.md
@@ -1,15 +1,15 @@
 # [GPU-based MNIST Neural ODE Classifier](@id mnist)
 
-Training a classifier for **MNIST** using a neural ordinary differential equation **NeuralODE**
-on **GPUs** with **minibatching**.
+Training a classifier for **MNIST** using a neural ordinary differential equation
+**NeuralODE** on **GPUs** with **minibatching**.
 
 (Step-by-step description below)
 
 ```@example mnist
-using DiffEqFlux, CUDA, Zygote, MLDataUtils, NNlib, OrdinaryDiffEq, Test, Lux, Statistics,
-      ComponentArrays, Random, Optimization, OptimizationOptimisers, LuxCUDA
+using DiffEqFlux, CUDA, Zygote, NNlib, OrdinaryDiffEq, Test, Lux, Statistics,
+      ComponentArrays, Random, Optimization, OptimizationOptimisers, LuxCUDA,
+      MLUtils, OneHotArrays
 using MLDatasets: MNIST
-using MLDataUtils: LabelEnc, convertlabel, stratifiedobs
 
 CUDA.allowscalar(false)
 ENV["DATADEPS_ALWAYS_ACCEPT"] = true
@@ -20,26 +20,21 @@ const gdev = gpu_device()
 logitcrossentropy(ŷ, y) = mean(-sum(y .* logsoftmax(ŷ; dims = 1); dims = 1))
 
 function loadmnist(batchsize = bs)
-    # Use MLDataUtils LabelEnc for natural onehot conversion
-    function onehot(labels_raw)
-        convertlabel(LabelEnc.OneOfK, labels_raw, LabelEnc.NativeLabels(collect(0:9)))
-    end
     # Load MNIST
-    mnist = MNIST(; split = :train)
-    imgs, labels_raw = mnist.features, mnist.targets
+    dataset = MNIST(; split = :train)
+    imgs = dataset.features
+    labels_raw = dataset.targets
+
     # Process images into (H,W,C,BS) batches
-    x_train = Float32.(reshape(imgs, size(imgs, 1), size(imgs, 2), 1, size(imgs, 3))) |>
-              gdev
-    x_train = batchview(x_train, batchsize)
-    # Onehot and batch the labels
-    y_train = onehot(labels_raw) |> gdev
-    y_train = batchview(y_train, batchsize)
-    return x_train, y_train
+    x_data = Float32.(reshape(imgs, size(imgs, 1), size(imgs, 2), 1, size(imgs, 3)))
+    y_data = onehotbatch(labels_raw, 0:9)
+
+    return DataLoader((x_data, y_data); batchsize, shuffle = true)
 end
 
 # Main
-const bs = 128
-x_train, y_train = loadmnist(bs)
+const bs = 32
+dataloader = loadmnist(bs)
 
 down = Lux.Chain(Lux.FlattenLayer(), Lux.Dense(784, 20, tanh))
 nn = Lux.Chain(Lux.Dense(20, 10, tanh), Lux.Dense(10, 10, tanh), Lux.Dense(10, 20, tanh))
@@ -48,30 +43,29 @@ fc = Lux.Dense(20, 10)
 nn_ode = NeuralODE(nn, (0.0f0, 1.0f0), Tsit5(); save_everystep = false,
     reltol = 1e-3, abstol = 1e-3, save_start = false)
 
-function DiffEqArray_to_Array(x)
-    xarr = gdev(x.u[1])
-    return xarr
-end
+DiffEqArray_to_Array(x) = x.u[end]
 
-#Build our over-all model topology
+# Build our overall model topology
 m = Lux.Chain(; down, nn_ode, convert = Lux.WrappedFunction(DiffEqArray_to_Array), fc)
 ps, st = Lux.setup(Xoshiro(0), m)
 ps = ComponentArray(ps) |> gdev
 st = st |> gdev
 
-#We can also build the model topology without a NN-ODE
+# We can also build the model topology without a NN-ODE
 m_no_ode = Lux.Chain(; down, nn, fc)
 ps_no_ode, st_no_ode = Lux.setup(Xoshiro(0), m_no_ode)
 ps_no_ode = ComponentArray(ps_no_ode) |> gdev
 st_no_ode = st_no_ode |> gdev
 
-#To understand the intermediate NN-ODE layer, we can examine it's dimensionality
-x_d = first(down(x_train[1], ps.down, st.down))
+x_train1, y_train1 = first(dataloader) |> gdev # ps/st live on gdev; inputs must too
+
+# To understand the intermediate NN-ODE layer, we can examine its dimensionality
+x_d = first(down(x_train1, ps.down, st.down))
 
 # We can see that we can compute the forward pass through the NN topology featuring an NNODE layer.
-x_m = first(m(x_train[1], ps, st))
-#Or without the NN-ODE layer.
-x_m = first(m_no_ode(x_train[1], ps_no_ode, st_no_ode))
+x_m = first(m(x_train1, ps, st))
+# Or without the NN-ODE layer.
+x_m = first(m_no_ode(x_train1, ps_no_ode, st_no_ode))
 
 classify(x) = argmax.(eachcol(x))
 
@@ -87,16 +81,15 @@ function accuracy(model, data, ps, st; n_batches = 100)
     end
     return total_correct / total
 end
-#burn in accuracy
-accuracy(m, zip(x_train, y_train), ps, st)
+
+accuracy(m, ((x_train1, y_train1),), ps, st) # burn in accuracy
 
 function loss_function(ps, x, y)
     pred, st_ = m(x, ps, st)
     return logitcrossentropy(pred, y), pred
 end
 
-#burn in loss
-loss_function(ps, x_train[1], y_train[1])
+loss_function(ps, x_train1, y_train1) # burn in loss
 
 opt = OptimizationOptimisers.Adam(0.05)
 iter = 0
@@ -107,8 +100,8 @@ opt_prob = OptimizationProblem(opt_func, ps)
 
 function callback(ps, l, pred)
     global iter += 1
-    #Monitor that the weights do infact update
-    #Every 10 training iterations show accuracy
+    # Monitor that the weights do in fact update
+    # Every 10 training iterations show accuracy
     if (iter % 10 == 0)
         @info "[MNIST GPU] Accuracy: $(accuracy(m, zip(x_train, y_train), ps, st))"
     end
@@ -125,10 +118,10 @@ res = Optimization.solve(opt_prob, opt, zip(x_train, y_train); callback)
 ### Load Packages
 
 ```@example mnist
-using DiffEqFlux, CUDA, Zygote, MLDataUtils, NNlib, OrdinaryDiffEq, Test, Lux, Statistics,
-      ComponentArrays, Random, Optimization, OptimizationOptimisers, LuxCUDA
+using DiffEqFlux, CUDA, Zygote, NNlib, OrdinaryDiffEq, Test, Lux, Statistics,
+      ComponentArrays, Random, Optimization, OptimizationOptimisers, LuxCUDA,
+      MLUtils, OneHotArrays
 using MLDatasets: MNIST
-using MLDataUtils: LabelEnc, convertlabel, stratifiedobs
 ```
 
 ### GPU
@@ -163,21 +156,16 @@ meaning that every minibatch will contain 128 images with a single color channel
 logitcrossentropy(ŷ, y) = mean(-sum(y .* logsoftmax(ŷ; dims = 1); dims = 1))
 
 function loadmnist(batchsize = bs)
-    # Use MLDataUtils LabelEnc for natural onehot conversion
-    function onehot(labels_raw)
-        convertlabel(LabelEnc.OneOfK, labels_raw, LabelEnc.NativeLabels(collect(0:9)))
-    end
     # Load MNIST
-    mnist = MNIST(; split = :train)
-    imgs, labels_raw = mnist.features, mnist.targets
+    dataset = MNIST(; split = :train)
+    imgs = dataset.features
+    labels_raw = dataset.targets
+
     # Process images into (H,W,C,BS) batches
-    x_train = Float32.(reshape(imgs, size(imgs, 1), size(imgs, 2), 1, size(imgs, 3))) |>
-              gdev
-    x_train = batchview(x_train, batchsize)
-    # Onehot and batch the labels
-    y_train = onehot(labels_raw) |> gdev
-    y_train = batchview(y_train, batchsize)
-    return x_train, y_train
+    x_data = Float32.(reshape(imgs, size(imgs, 1), size(imgs, 2), 1, size(imgs, 3)))
+    y_data = onehotbatch(labels_raw, 0:9)
+
+    return DataLoader((x_data, y_data); batchsize, shuffle = true)
 end
 ```
 
@@ -185,8 +173,8 @@ and then loaded from main:
 
 ```@example mnist
 # Main
-const bs = 128
-x_train, y_train = loadmnist(bs)
+const bs = 32
+dataloader = loadmnist(bs)
 ```
 
 ### Layers
@@ -222,10 +210,7 @@ a Matrix (CuArray), and reduces the matrix from 3 to 2 dimensions for use in the
 nn_ode = NeuralODE(nn, (0.0f0, 1.0f0), Tsit5(); save_everystep = false,
     reltol = 1e-3, abstol = 1e-3, save_start = false)
 
-function DiffEqArray_to_Array(x)
-    xarr = gdev(x.u[1])
-    return xarr
-end
+DiffEqArray_to_Array(x) = x.u[end]
 ```
 
 For CPU: If this function does not automatically fall back to CPU when no GPU is present, we can
@@ -269,7 +254,7 @@ function accuracy(model, data, ps, st; n_batches = 100)
     end
     return total_correct / total
 end
-#burn in accuracy
+
 accuracy(m, zip(x_train, y_train), ps, st)
 ```
 
@@ -290,8 +275,7 @@ function loss_function(ps, x, y)
     return logitcrossentropy(pred, y), pred
 end
 
-#burn in loss
-loss_function(ps, x_train[1], y_train[1])
+loss_function(ps, x_train1, y_train1)
 ```
 
 #### Optimizer
@@ -316,8 +300,8 @@ opt_prob = OptimizationProblem(opt_func, ps)
 
 function callback(ps, l, pred)
     global iter += 1
-    #Monitor that the weights do infact update
-    #Every 10 training iterations show accuracy
+    # Monitor that the weights do in fact update
+    # Every 10 training iterations show accuracy
     if (iter % 10 == 0)
         @info "[MNIST GPU] Accuracy: $(accuracy(m, zip(x_train, y_train), ps, st))"
     end
diff --git a/test/neural_de_tests.jl b/test/neural_de_tests.jl
index 8bbd35a23..b32b0d498 100644
--- a/test/neural_de_tests.jl
+++ b/test/neural_de_tests.jl
@@ -277,7 +277,6 @@ end
                     pd = ComponentArray(pd) |> gdev
                     st = st |> gdev
                     broken = hasfield(typeof(kwargs), :sensealg) &&
-                             ndims(u0) == 2 &&
                              kwargs.sensealg isa TrackerAdjoint
                     @test begin
                         grads = Zygote.gradient(sum ∘ last ∘ first ∘ node, u0, pd, st)