diff --git a/core/include/detray/definitions/pdg_particle.hpp b/core/include/detray/definitions/pdg_particle.hpp
index 36aa99759..95951b372 100644
--- a/core/include/detray/definitions/pdg_particle.hpp
+++ b/core/include/detray/definitions/pdg_particle.hpp
@@ -28,13 +28,13 @@ struct pdg_particle {
           m_charge(static_cast<scalar_t>(charge)) {}
 
     DETRAY_HOST_DEVICE
-    std::int32_t pdg_num() const { return m_pdg_num; }
+    constexpr std::int32_t pdg_num() const { return m_pdg_num; }
 
     DETRAY_HOST_DEVICE
-    scalar_type mass() const { return m_mass; }
+    constexpr scalar_type mass() const { return m_mass; }
 
     DETRAY_HOST_DEVICE
-    scalar_type charge() const { return m_charge; }
+    constexpr scalar_type charge() const { return m_charge; }
 
     private:
     std::int32_t m_pdg_num;
@@ -42,6 +42,23 @@ struct pdg_particle {
     scalar_type m_charge;
 };
 
+/// @returns the charge conjugate of @param ptc (PDG number, charge negated)
+template <typename scalar_t>
+DETRAY_HOST_DEVICE constexpr pdg_particle<scalar_t> charge_conjugation(
+    const pdg_particle<scalar_t>& ptc) {
+    return (ptc.charge() != 0)
+               ? detray::pdg_particle<scalar_t>{-ptc.pdg_num(), ptc.mass(),
+                                                -ptc.charge()}
+               : ptc;
+}
+
+/// @returns @param ptc if charge * qop > 0, else its charge conjugate
+template <typename scalar_t, typename track_t>
+DETRAY_HOST_DEVICE constexpr pdg_particle<scalar_t> update_particle_hypothesis(
+    const pdg_particle<scalar_t>& ptc, const track_t& params) {
+    return (ptc.charge() * params.qop() > 0.f) ? ptc : charge_conjugation(ptc);
+}
+
 // Macro for declaring the particle
 #define DETRAY_DECLARE_PARTICLE(PARTICLE_NAME, PDG_NUM, MASS, CHARGE)   \
     template <typename scalar_t>                                        \
diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt
index 846ee04d4..12940e393 100644
--- a/tests/benchmarks/CMakeLists.txt
+++ b/tests/benchmarks/CMakeLists.txt
@@ -32,7 +32,7 @@ target_include_directories(
 
 target_link_libraries(
     detray_benchmarks
-    INTERFACE benchmark::benchmark vecmem::core detray::core detray::test_common
+    INTERFACE benchmark::benchmark vecmem::core detray::core detray::test_utils
 )
 
 unset(_detray_benchmarks_headers)
@@ -46,5 +46,5 @@ endif()
 # Set up all of the "device" benchmarks.
 if(DETRAY_BUILD_CUDA)
     add_subdirectory(cuda)
-    #add_subdirectory( include/detray/benchmarks/device )
+    add_subdirectory(include/detray/benchmarks/device)
 endif()
diff --git a/tests/benchmarks/cpu/CMakeLists.txt b/tests/benchmarks/cpu/CMakeLists.txt
index cfb38d945..03140c4af 100644
--- a/tests/benchmarks/cpu/CMakeLists.txt
+++ b/tests/benchmarks/cpu/CMakeLists.txt
@@ -46,6 +46,25 @@ macro(detray_add_cpu_benchmark algebra)
             PRIVATE DETRAY_BENCHMARK_PRINTOUTS
         )
     endif()
+
+    # Build the propagation benchmark (propagation.cpp provides its own main)
+    detray_add_executable( benchmark_cpu_propagation_${algebra}
+       "propagation.cpp"
+       LINK_LIBRARIES detray::benchmark_cpu benchmark::benchmark
+                     vecmem::core detray::core_${algebra} detray::test_utils
+    )
+
+    target_compile_options(
+        detray_benchmark_cpu_propagation_${algebra}
+        PRIVATE "-march=native" "-ftree-vectorize"
+    )
+
+    if(OpenMP_CXX_FOUND)
+        target_link_libraries(
+            detray_benchmark_cpu_propagation_${algebra}
+            PRIVATE OpenMP::OpenMP_CXX
+        )
+    endif()
 endmacro()
 
 # Build the array benchmark.
diff --git a/tests/benchmarks/cpu/propagation.cpp b/tests/benchmarks/cpu/propagation.cpp
index d00dc6f1d..ec98ba096 100644
--- a/tests/benchmarks/cpu/propagation.cpp
+++ b/tests/benchmarks/cpu/propagation.cpp
@@ -6,7 +6,6 @@
  */
 
 // Project include(s)
-#include "detray/benchmarks/cpu/propagation_benchmark.hpp"
 #include "detray/detectors/bfield.hpp"
 #include "detray/navigation/navigator.hpp"
 #include "detray/propagator/actor_chain.hpp"
@@ -17,6 +16,9 @@
 #include "detray/propagator/rk_stepper.hpp"
 #include "detray/tracks/tracks.hpp"
 
+// Detray benchmark include(s)
+#include "detray/benchmarks/cpu/propagation_benchmark.hpp"
+
 // Detray test include(s).
 #include "detray/test/utils/detectors/build_toy_detector.hpp"
 #include "detray/test/utils/detectors/build_wire_chamber.hpp"
@@ -63,7 +65,7 @@ int main(int argc, char** argv) {
 
     // Configure toy detector
     toy_det_config toy_cfg{};
-    toy_cfg.use_material_maps(true).n_brl_layers(4u).n_edc_layers(7u);
+    toy_cfg.use_material_maps(false).n_brl_layers(4u).n_edc_layers(7u);
 
     std::cout << toy_cfg << std::endl;
 
@@ -80,36 +82,32 @@ int main(int argc, char** argv) {
     std::cout << prop_cfg << std::endl;
 
     // Benchmark config
-    detray::benchmark_base::configuration bench_cfg{};
+    detray::benchmarks::benchmark_base::configuration bench_cfg{};
 
-    std::vector<int> n_tracks{8 * 8,   16 * 16,   32 * 32,
-                              64 * 64, 128 * 128, 256 * 256};
+    std::vector<int> n_tracks{8 * 8,     16 * 16,   32 * 32,  64 * 64,
+                              128 * 128, 256 * 256, 512 * 512};
 
-    int n_trks{*std::max_element(std::begin(n_tracks), std::end(n_tracks))};
-    std::cout << n_trks << std::endl;
+    auto trk_cfg =
+        detray::benchmarks::get_default_trk_gen_config<track_generator_t>(
+            n_tracks);
+
+    // Specific configuration for the random track generation
+    trk_cfg.seed(42u);
+
+    // Warm up with 10% (rounded up) of the configured number of tracks
+    std::size_t n_trks{trk_cfg.n_tracks()};
     bench_cfg.n_warmup(
         static_cast<int>(std::ceil(0.1f * static_cast<float>(n_trks))));
     // Add tracks for warmup
-    n_trks += bench_cfg.do_warmup() ? bench_cfg.n_warmup() : 0;
-
-    // Generate tracks
-    track_generator_t::configuration trk_cfg{};
-    trk_cfg.seed(42u);
+    n_trks += static_cast<std::size_t>(
+        bench_cfg.do_warmup() ? bench_cfg.n_warmup() : 0);
     trk_cfg.n_tracks(n_trks);
-    trk_cfg.randomize_charge(true);
-    trk_cfg.phi_range(-constant<scalar_t>::pi, constant<scalar_t>::pi);
-    trk_cfg.eta_range(-3.f, 3.f);
-    trk_cfg.mom_range(1.f * unit<scalar_t>::GeV, 100.f * unit<scalar_t>::GeV);
-    trk_cfg.origin({0.f, 0.f, 0.f});
-    trk_cfg.origin_stddev({0.f * unit<scalar_t>::mm, 0.f * unit<scalar_t>::mm,
-                           0.f * unit<scalar_t>::mm});
-
-    std::cout << trk_cfg << std::endl;
 
     //
     // Prepare data
     //
-    auto tracks = generate_tracks<track_generator_t>(&host_mr, trk_cfg);
+    auto tracks = detray::benchmarks::generate_tracks<track_generator_t>(
+        &host_mr, trk_cfg);
 
     const auto [toy_det, names] = build_toy_detector(host_mr, toy_cfg);
     const auto [wire_chamber, _] =
@@ -133,24 +131,28 @@ int main(int argc, char** argv) {
               << "----------------------\n\n";
 
     prop_cfg.stepping.do_covariance_transport = true;
-    register_benchmark<propagation_bm, stepper_t, default_chain>(
+    detray::benchmarks::register_benchmark<
+        detray::benchmarks::host_propagation_bm, stepper_t, default_chain>(
         "TOY_DETECTOR_W_COV_TRANSPORT", bench_cfg, prop_cfg, toy_det, bfield,
-        actor_states, tracks, n_tracks);
+        tracks, n_tracks, &actor_states);
 
     prop_cfg.stepping.do_covariance_transport = false;
-    register_benchmark<propagation_bm, stepper_t, empty_chain_t>(
-        "TOY_DETECTOR", bench_cfg, prop_cfg, toy_det, bfield, empty_state,
-        tracks, n_tracks);
+    detray::benchmarks::register_benchmark<
+        detray::benchmarks::host_propagation_bm, stepper_t, empty_chain_t>(
+        "TOY_DETECTOR", bench_cfg, prop_cfg, toy_det, bfield, tracks, n_tracks,
+        &empty_state);
 
     prop_cfg.stepping.do_covariance_transport = true;
-    register_benchmark<propagation_bm, stepper_t, default_chain>(
+    detray::benchmarks::register_benchmark<
+        detray::benchmarks::host_propagation_bm, stepper_t, default_chain>(
         "WIRE_CHAMBER_W_COV_TRANSPORT", bench_cfg, prop_cfg, wire_chamber,
-        bfield, actor_states, tracks, n_tracks);
+        bfield, tracks, n_tracks, &actor_states);
 
     prop_cfg.stepping.do_covariance_transport = false;
-    register_benchmark<propagation_bm, stepper_t, empty_chain_t>(
-        "WIRE_CHAMBER", bench_cfg, prop_cfg, wire_chamber, bfield, empty_state,
-        tracks, n_tracks);
+    detray::benchmarks::register_benchmark<
+        detray::benchmarks::host_propagation_bm, stepper_t, empty_chain_t>(
+        "WIRE_CHAMBER", bench_cfg, prop_cfg, wire_chamber, bfield, tracks,
+        n_tracks, &empty_state);
 
     // Run benchmarks
     ::benchmark::Initialize(&argc, argv);
diff --git a/tests/benchmarks/cuda/CMakeLists.txt b/tests/benchmarks/cuda/CMakeLists.txt
index d34039a18..a101f0854 100644
--- a/tests/benchmarks/cuda/CMakeLists.txt
+++ b/tests/benchmarks/cuda/CMakeLists.txt
@@ -26,26 +26,24 @@ if(DETRAY_EIGEN_PLUGIN)
 endif()
 
 foreach(algebra ${algebras})
-    detray_add_executable(benchmark_cuda_${algebra}
-   "benchmark_propagator_cuda_kernel.hpp"
-       "benchmark_propagator_cuda.cpp"
-       "benchmark_propagator_cuda_kernel.cu"
-       LINK_LIBRARIES detray::benchmarks detray::core detray::algebra_${algebra} vecmem::cuda detray::test_utils
+    detray_add_executable(benchmark_cuda_propagation_${algebra}
+      "propagation.cpp"
+       LINK_LIBRARIES detray::benchmark_cuda detray::core detray::algebra_${algebra} vecmem::cuda detray::test_utils
     )
 
     target_compile_definitions(
-        detray_benchmark_cuda_${algebra}
+        detray_benchmark_cuda_propagation_${algebra}
         PRIVATE ${algebra}=${algebra}
     )
 
     target_compile_options(
-        detray_benchmark_cuda_${algebra}
+        detray_benchmark_cuda_propagation_${algebra}
         PRIVATE "-march=native" "-ftree-vectorize"
     )
 
     if(OpenMP_CXX_FOUND)
         target_link_libraries(
-            detray_benchmark_cuda_${algebra}
+            detray_benchmark_cuda_propagation_${algebra}
             PRIVATE OpenMP::OpenMP_CXX
         )
     endif()
diff --git a/tests/benchmarks/cuda/benchmark_propagator_cuda.cpp b/tests/benchmarks/cuda/benchmark_propagator_cuda.cpp
deleted file mode 100644
index a19739c4e..000000000
--- a/tests/benchmarks/cuda/benchmark_propagator_cuda.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/** Detray library, part of the ACTS project (R&D line)
- *
- * (c) 2022-2024 CERN for the benefit of the ACTS project
- *
- * Mozilla Public License Version 2.0
- */
-
-// Project include(s)
-#include "benchmark_propagator_cuda_kernel.hpp"
-
-// Detray test include(s).
-#include "detray/test/utils/detectors/build_toy_detector.hpp"
-#include "detray/test/utils/simulation/event_generator/track_generators.hpp"
-#include "detray/test/utils/types.hpp"
-
-// Vecmem include(s)
-#include <vecmem/memory/binary_page_memory_resource.hpp>
-#include <vecmem/memory/cuda/device_memory_resource.hpp>
-#include <vecmem/memory/cuda/managed_memory_resource.hpp>
-#include <vecmem/utils/cuda/copy.hpp>
-
-// Google include(s).
-#include <benchmark/benchmark.h>
-
-using namespace detray;
-
-// VecMem memory resource(s)
-vecmem::host_memory_resource host_mr;
-vecmem::cuda::managed_memory_resource mng_mr;
-vecmem::cuda::device_memory_resource dev_mr;
-vecmem::binary_page_memory_resource bp_mng_mr(mng_mr);
-
-// detector configuration
-auto toy_cfg = toy_det_config{}
-                   .n_brl_layers(4u)
-                   .n_edc_layers(7u)
-                   .do_check(false)
-                   .use_material_maps(true);
-
-void fill_tracks(vecmem::vector<free_track_parameters<algebra_t>> &tracks,
-                 const std::size_t n_tracks, bool do_sort = true) {
-    using scalar_t = dscalar<algebra_t>;
-    using uniform_gen_t =
-        detail::random_numbers<scalar_t,
-                               std::uniform_real_distribution<scalar_t>>;
-    using trk_generator_t =
-        random_track_generator<free_track_parameters<algebra_t>, uniform_gen_t>;
-
-    trk_generator_t::configuration trk_gen_cfg{};
-    trk_gen_cfg.seed(42u);
-    trk_gen_cfg.n_tracks(n_tracks);
-    trk_gen_cfg.randomize_charge(true);
-    trk_gen_cfg.phi_range(-constant<scalar_t>::pi, constant<scalar_t>::pi);
-    trk_gen_cfg.eta_range(-3.f, 3.f);
-    trk_gen_cfg.mom_range(1.f * unit<scalar_t>::GeV,
-                          100.f * unit<scalar_t>::GeV);
-    trk_gen_cfg.origin({0.f, 0.f, 0.f});
-    trk_gen_cfg.origin_stddev({0.f * unit<scalar_t>::mm,
-                               0.f * unit<scalar_t>::mm,
-                               0.f * unit<scalar_t>::mm});
-
-    // Iterate through uniformly distributed momentum directions
-    for (auto traj : trk_generator_t{trk_gen_cfg}) {
-        tracks.push_back(traj);
-    }
-
-    if (do_sort) {
-        // Sort by theta angle
-        const auto traj_comp = [](const auto &lhs, const auto &rhs) {
-            constexpr auto pi_2{constant<scalar_t>::pi_2};
-            return math::fabs(pi_2 - vector::theta(lhs.dir())) <
-                   math::fabs(pi_2 - vector::theta(rhs.dir()));
-        };
-
-        std::ranges::sort(tracks, traj_comp);
-    }
-}
-
-template <propagate_option opt>
-static void BM_PROPAGATOR_CPU(benchmark::State &state) {
-
-    // Create the toy geometry and bfield
-    auto [det, names] = build_toy_detector(host_mr, toy_cfg);
-    test::vector3 B{0.f, 0.f, 2.f * unit<scalar>::T};
-    auto bfield = bfield::create_const_field(B);
-
-    // Create propagator
-    propagation::config cfg{};
-    cfg.navigation.search_window = {3u, 3u};
-    propagator_host_type p{cfg};
-
-    std::size_t total_tracks = 0;
-
-    // Get tracks
-    vecmem::vector<free_track_parameters<algebra_t>> tracks(&host_mr);
-    fill_tracks(tracks, static_cast<std::size_t>(state.range(0)),
-                static_cast<std::size_t>(state.range(0)));
-
-    total_tracks += tracks.size();
-
-    for (auto _ : state) {
-
-#pragma omp parallel for
-        for (auto &track : tracks) {
-
-            parameter_transporter<algebra_t>::state transporter_state{};
-            pointwise_material_interactor<algebra_t>::state interactor_state{};
-            parameter_resetter<algebra_t>::state resetter_state{};
-
-            auto actor_states =
-                tie(transporter_state, interactor_state, resetter_state);
-
-            // Create the propagator state
-            propagator_host_type::state p_state(track, bfield, det);
-
-            // Run propagation
-            if constexpr (opt == propagate_option::e_unsync) {
-                ::benchmark::DoNotOptimize(p.propagate(p_state, actor_states));
-            } else if constexpr (opt == propagate_option::e_sync) {
-                ::benchmark::DoNotOptimize(
-                    p.propagate_sync(p_state, actor_states));
-            }
-        }
-    }
-
-    state.counters["TracksPropagated"] = benchmark::Counter(
-        static_cast<double>(total_tracks), benchmark::Counter::kIsRate);
-}
-
-template <propagate_option opt>
-static void BM_PROPAGATOR_CUDA(benchmark::State &state) {
-
-    std::size_t n_tracks{static_cast<std::size_t>(state.range(0)) *
-                         static_cast<std::size_t>(state.range(0))};
-
-    // Create the toy geometry
-    auto [det, names] = build_toy_detector(host_mr, toy_cfg);
-    test::vector3 B{0.f, 0.f, 2.f * unit<scalar>::T};
-    auto bfield = bfield::create_const_field(B);
-
-    // vecmem copy helper object
-    vecmem::cuda::copy cuda_cpy;
-
-    // Copy detector to device
-    auto det_buff = detray::get_buffer(det, dev_mr, cuda_cpy);
-    auto det_view = detray::get_data(det_buff);
-
-    std::size_t total_tracks = 0;
-
-    // Get tracks
-    vecmem::vector<free_track_parameters<algebra_t>> tracks(&bp_mng_mr);
-    fill_tracks(tracks, static_cast<std::size_t>(state.range(0)),
-                static_cast<std::size_t>(state.range(0)));
-
-    total_tracks += tracks.size();
-
-    for (auto _ : state) {
-
-        // Get tracks data
-        auto tracks_data = vecmem::get_data(tracks);
-
-        // Run the propagator test for GPU device
-        propagator_benchmark(det_view, bfield, tracks_data, opt);
-    }
-
-    state.counters["TracksPropagated"] = benchmark::Counter(
-        static_cast<double>(total_tracks), benchmark::Counter::kIsRate);
-}
-
-BENCHMARK_TEMPLATE(BM_PROPAGATOR_CUDA, propagate_option::e_unsync)
-    ->Name("CUDA unsync propagation")
-    ->RangeMultiplier(2)
-    ->Range(8, 256);
-BENCHMARK_TEMPLATE(BM_PROPAGATOR_CUDA, propagate_option::e_sync)
-    ->Name("CUDA sync propagation")
-    ->RangeMultiplier(2)
-    ->Range(8, 256);
-
-BENCHMARK_MAIN();
diff --git a/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.cu b/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.cu
deleted file mode 100644
index 6bba06fd1..000000000
--- a/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.cu
+++ /dev/null
@@ -1,70 +0,0 @@
-/** Detray library, part of the ACTS project (R&D line)
- *
- * (c) 2022 CERN for the benefit of the ACTS project
- *
- * Mozilla Public License Version 2.0
- */
-
-#include "benchmark_propagator_cuda_kernel.hpp"
-#include "detray/definitions/detail/cuda_definitions.hpp"
-
-namespace detray {
-
-__global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
-    typename detector_host_type::view_type det_data,
-    covfie::field_view<bfield::const_bknd_t> field_data,
-    vecmem::data::vector_view<free_track_parameters<algebra_t>> tracks_data,
-    const propagate_option opt) {
-
-    int gid = threadIdx.x + blockIdx.x * blockDim.x;
-
-    detector_device_type det(det_data);
-    vecmem::device_vector<free_track_parameters<algebra_t>> tracks(tracks_data);
-
-    if (gid >= tracks.size()) {
-        return;
-    }
-
-    // Create propagator
-    propagation::config cfg{};
-    cfg.navigation.search_window = {3u, 3u};
-    propagator_device_type p{cfg};
-
-    parameter_transporter<algebra_t>::state transporter_state{};
-    pointwise_material_interactor<algebra_t>::state interactor_state{};
-    parameter_resetter<algebra_t>::state resetter_state{};
-
-    // Create the actor states
-    auto actor_states =
-        detray::tie(transporter_state, interactor_state, resetter_state);
-    // Create the propagator state
-    propagator_device_type::state p_state(tracks.at(gid), field_data, det);
-
-    // Run propagation
-    if (opt == propagate_option::e_unsync) {
-        p.propagate(p_state, actor_states);
-    } else if (opt == propagate_option::e_sync) {
-        p.propagate_sync(p_state, actor_states);
-    }
-}
-
-void propagator_benchmark(
-    typename detector_host_type::view_type det_data,
-    covfie::field_view<bfield::const_bknd_t> field_data,
-    vecmem::data::vector_view<free_track_parameters<algebra_t>>& tracks_data,
-    const propagate_option opt) {
-
-    constexpr int thread_dim = 256;
-    int block_dim =
-        static_cast<int>(tracks_data.size() + thread_dim - 1) / thread_dim;
-
-    // run the test kernel
-    propagator_benchmark_kernel<<<block_dim, thread_dim>>>(det_data, field_data,
-                                                           tracks_data, opt);
-
-    // cuda error check
-    DETRAY_CUDA_ERROR_CHECK(cudaGetLastError());
-    DETRAY_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
-}
-
-}  // namespace detray
diff --git a/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.hpp b/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.hpp
deleted file mode 100644
index ee2f505cb..000000000
--- a/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/** Detray library, part of the ACTS project (R&D line)
- *
- * (c) 2022-2024 CERN for the benefit of the ACTS project
- *
- * Mozilla Public License Version 2.0
- */
-
-#pragma once
-
-// Project include(s)
-#include "detray/definitions/detail/algebra.hpp"
-#include "detray/definitions/units.hpp"
-#include "detray/detectors/bfield.hpp"
-#include "detray/detectors/toy_metadata.hpp"
-#include "detray/navigation/navigator.hpp"
-#include "detray/propagator/actor_chain.hpp"
-#include "detray/propagator/actors/aborters.hpp"
-#include "detray/propagator/actors/parameter_resetter.hpp"
-#include "detray/propagator/actors/parameter_transporter.hpp"
-#include "detray/propagator/actors/pointwise_material_interactor.hpp"
-#include "detray/propagator/base_actor.hpp"
-#include "detray/propagator/propagator.hpp"
-#include "detray/propagator/rk_stepper.hpp"
-#include "detray/tracks/tracks.hpp"
-
-using algebra_t = ALGEBRA_PLUGIN<detray::scalar>;
-
-using detector_host_type =
-    detray::detector<detray::toy_metadata, detray::host_container_types>;
-using detector_device_type =
-    detray::detector<detray::toy_metadata, detray::device_container_types>;
-
-using navigator_host_type = detray::navigator<detector_host_type>;
-using navigator_device_type = detray::navigator<detector_device_type>;
-using field_type = detray::bfield::const_field_t;
-using rk_stepper_type = detray::rk_stepper<field_type::view_t, algebra_t>;
-using actor_chain_t =
-    detray::actor_chain<detray::tuple, detray::parameter_transporter<algebra_t>,
-                        detray::pointwise_material_interactor<algebra_t>,
-                        detray::parameter_resetter<algebra_t>>;
-using propagator_host_type =
-    detray::propagator<rk_stepper_type, navigator_host_type, actor_chain_t>;
-using propagator_device_type =
-    detray::propagator<rk_stepper_type, navigator_device_type, actor_chain_t>;
-
-enum class propagate_option {
-    e_unsync = 0,
-    e_sync = 1,
-};
-
-namespace detray {
-
-/// test function for propagator with single state
-void propagator_benchmark(
-    typename detector_host_type::view_type det_data,
-    typename field_type::view_t field_data,
-    vecmem::data::vector_view<free_track_parameters<algebra_t>>& tracks_data,
-    const propagate_option opt);
-
-}  // namespace detray
diff --git a/tests/benchmarks/cuda/propagation.cpp b/tests/benchmarks/cuda/propagation.cpp
new file mode 100644
index 000000000..6ceebd7b7
--- /dev/null
+++ b/tests/benchmarks/cuda/propagation.cpp
@@ -0,0 +1,139 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+// Project include(s)
+#include "detray/detectors/bfield.hpp"
+#include "detray/navigation/navigator.hpp"
+#include "detray/propagator/actor_chain.hpp"
+#include "detray/propagator/actors/aborters.hpp"
+#include "detray/propagator/actors/parameter_resetter.hpp"
+#include "detray/propagator/actors/parameter_transporter.hpp"
+#include "detray/propagator/actors/pointwise_material_interactor.hpp"
+#include "detray/propagator/rk_stepper.hpp"
+#include "detray/tracks/tracks.hpp"
+
+// Detray benchmark include(s)
+#include "detray/benchmarks/device/cuda/propagation_benchmark.hpp"
+
+// Detray test include(s).
+#include "detray/test/utils/detectors/build_toy_detector.hpp"
+#include "detray/test/utils/detectors/build_wire_chamber.hpp"
+#include "detray/test/utils/simulation/event_generator/track_generators.hpp"
+#include "detray/test/utils/types.hpp"
+
+// Vecmem include(s)
+#include <vecmem/memory/cuda/device_memory_resource.hpp>
+#include <vecmem/memory/host_memory_resource.hpp>
+
+// System include(s)
+#include <iostream>
+#include <string>
+
+using namespace detray;
+
+int main(int argc, char** argv) {
+
+    using toy_detector_t = detector<toy_metadata>;
+    using algebra_t = typename toy_detector_t::algebra_type;
+    using scalar_t = dscalar<algebra_t>;
+    using vector3_t = dvector3D<algebra_t>;
+    using free_track_parameters_t = free_track_parameters<algebra_t>;
+    using uniform_gen_t =
+        detail::random_numbers<scalar_t,
+                               std::uniform_real_distribution<scalar_t>>;
+    using track_generator_t =
+        random_track_generator<free_track_parameters_t, uniform_gen_t>;
+    using field_bknd_t = bfield::const_bknd_t;
+
+    vecmem::host_memory_resource host_mr;
+    vecmem::cuda::device_memory_resource dev_mr;
+
+    //
+    // Configuration
+    //
+
+    // Constant magnetic field
+    vector3_t B{0.f, 0.f, 2.f * unit<scalar_t>::T};
+
+    // Configure toy detector
+    toy_det_config toy_cfg{};
+    toy_cfg.use_material_maps(false).n_brl_layers(4u).n_edc_layers(7u);
+
+    std::cout << toy_cfg << std::endl;
+
+    // Configure wire chamber
+    wire_chamber_config wire_chamber_cfg{};
+    wire_chamber_cfg.half_z(500.f * unit<scalar>::mm);
+
+    std::cout << wire_chamber_cfg << std::endl;
+
+    // Configure propagation
+    propagation::config prop_cfg{};
+    prop_cfg.navigation.search_window = {3u, 3u};
+
+    std::cout << prop_cfg << std::endl;
+
+    // Benchmark config
+    detray::benchmarks::benchmark_base::configuration bench_cfg{};
+
+    std::vector<int> n_tracks{8 * 8,     16 * 16,   32 * 32,  64 * 64,
+                              128 * 128, 256 * 256, 512 * 512};
+
+    auto trk_cfg =
+        detray::benchmarks::get_default_trk_gen_config<track_generator_t>(
+            n_tracks);
+
+    // Specific configuration for the random track generation
+    trk_cfg.seed(42u);
+
+    // Warm up with 10% (rounded up) of the configured number of tracks
+    std::size_t n_trks{trk_cfg.n_tracks()};
+    bench_cfg.n_warmup(
+        static_cast<int>(std::ceil(0.1f * static_cast<float>(n_trks))));
+    // Add tracks for warmup
+    n_trks += static_cast<std::size_t>(
+        bench_cfg.do_warmup() ? bench_cfg.n_warmup() : 0);
+    trk_cfg.n_tracks(n_trks);
+
+    //
+    // Prepare data
+    //
+    auto tracks = detray::benchmarks::generate_tracks<track_generator_t>(
+        &host_mr, trk_cfg, true);
+
+    const auto [toy_det, names] = build_toy_detector(host_mr, toy_cfg);
+    const auto [wire_chamber, _] =
+        build_wire_chamber(host_mr, wire_chamber_cfg);
+
+    auto bfield = bfield::create_const_field(B);
+
+    //
+    // Register benchmarks
+    //
+    std::cout << "Propagation Benchmarks\n"
+              << "----------------------\n\n";
+
+    prop_cfg.stepping.do_covariance_transport = true;
+    detray::benchmarks::register_benchmark<
+        detray::benchmarks::cuda_propagation_bm,
+        detray::benchmarks::cuda_propagator_type<toy_metadata, field_bknd_t>>(
+        "TOY_DETECTOR_W_COV_TRANSPORT", bench_cfg, prop_cfg, toy_det, bfield,
+        tracks, n_tracks, &dev_mr);
+
+    prop_cfg.stepping.do_covariance_transport = true;
+    detray::benchmarks::register_benchmark<
+        detray::benchmarks::cuda_propagation_bm,
+        detray::benchmarks::cuda_propagator_type<default_metadata,
+                                                 field_bknd_t>>(
+        "WIRE_CHAMBER_W_COV_TRANSPORT", bench_cfg, prop_cfg, wire_chamber,
+        bfield, tracks, n_tracks, &dev_mr);
+
+    // Run benchmarks
+    ::benchmark::Initialize(&argc, argv);
+    ::benchmark::RunSpecifiedBenchmarks();
+    ::benchmark::Shutdown();
+}
diff --git a/tests/benchmarks/include/detray/benchmarks/benchmark_base.hpp b/tests/benchmarks/include/detray/benchmarks/benchmark_base.hpp
index 3d1b79d58..c867b6f19 100644
--- a/tests/benchmarks/include/detray/benchmarks/benchmark_base.hpp
+++ b/tests/benchmarks/include/detray/benchmarks/benchmark_base.hpp
@@ -1,6 +1,6 @@
 /** Detray library, part of the ACTS project (R&D line)
  *
- * (c) 2023 CERN for the benefit of the ACTS project
+ * (c) 2023-2024 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -14,14 +14,14 @@
 #include <ostream>
 #include <string>
 
-namespace detray {
+namespace detray::benchmarks {
 
-/// Base type for linear algebra benchmarks with google benchmark
+/// Base type for detray benchmarks with google benchmark
 struct benchmark_base {
     /// Local configuration type
     struct configuration {
         /// Size of data sample to be used in benchmark
-        int m_samples{100u};
+        int m_samples{100};
         /// Run a number of operations before the benchmark
         bool m_warmup = true;
         // Size of data in warm-up round
@@ -46,14 +46,24 @@ struct benchmark_base {
 
         /// Getters
         /// @{
-        int n_samples() const { return m_samples; }
+        constexpr int n_samples() const { return m_samples; }
         constexpr bool do_warmup() const { return m_warmup; }
         constexpr int n_warmup() const { return m_n_warmup; }
         /// @}
 
-        /// Print configuration
+        private:
+        /// Print the benchmark setup
         friend std::ostream& operator<<(std::ostream& os,
-                                        const configuration& c);
+                                        const configuration& cfg) {
+            os << " -> running:\t " << cfg.n_samples() << " samples"
+               << std::endl;
+            if (cfg.do_warmup()) {
+                os << " -> warmup: \t " << cfg.n_warmup() << " samples"
+                   << std::endl;
+            }
+            os << std::endl;
+            return os;
+        }
     };
 
     /// Default construction
@@ -63,14 +73,4 @@ struct benchmark_base {
     virtual ~benchmark_base() = default;
 };
 
-std::ostream& operator<<(std::ostream& os,
-                         const benchmark_base::configuration& cfg) {
-    os << " -> running:\t " << cfg.n_samples() << " samples" << std::endl;
-    if (cfg.do_warmup()) {
-        os << " -> warmup: \t " << cfg.n_warmup() << " samples" << std::endl;
-    }
-    os << std::endl;
-    return os;
-}
-
-}  // namespace detray
+}  // namespace detray::benchmarks
diff --git a/tests/benchmarks/include/detray/benchmarks/cpu/CMakeLists.txt b/tests/benchmarks/include/detray/benchmarks/cpu/CMakeLists.txt
index e245ce843..c2bc1a0d8 100644
--- a/tests/benchmarks/include/detray/benchmarks/cpu/CMakeLists.txt
+++ b/tests/benchmarks/include/detray/benchmarks/cpu/CMakeLists.txt
@@ -4,7 +4,7 @@
 #
 # Mozilla Public License Version 2.0
 
-# Set the CUDA build flags.
+# Set the CPU build flags.
 include(detray-compiler-options-cpp)
 
 # Set up a test library, which the "new style" benchmarks and tests could use.
@@ -12,7 +12,4 @@ add_library(detray_benchmark_cpu INTERFACE "propagation_benchmark.hpp")
 
 add_library(detray::benchmark_cpu ALIAS detray_benchmark_cpu)
 
-target_link_libraries(
-    detray_benchmark_cpu
-    INTERFACE detray::benchmarks detray::test_common
-)
+target_link_libraries(detray_benchmark_cpu INTERFACE detray::benchmarks)
diff --git a/tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp b/tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp
index d9ead60c9..4e886c90d 100644
--- a/tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp
+++ b/tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp
@@ -5,16 +5,16 @@
  * Mozilla Public License Version 2.0
  */
 
+#pragma once
+
 // Project include(s)
+#include "detray/definitions/detail/algebra.hpp"
+#include "detray/tracks/tracks.hpp"
+
+// Detray benchmark include(s)
 #include "detray/benchmarks/benchmark_base.hpp"
 #include "detray/benchmarks/propagation_benchmark_config.hpp"
 #include "detray/benchmarks/propagation_benchmark_utils.hpp"
-#include "detray/core/detail/container_views.hpp"
-#include "detray/definitions/detail/algebra.hpp"
-#include "detray/definitions/detail/containers.hpp"
-#include "detray/definitions/units.hpp"
-#include "detray/detectors/bfield.hpp"
-#include "detray/tracks/tracks.hpp"
 
 // Benchmark include
 #include <benchmark/benchmark.h>
@@ -25,11 +25,12 @@
 #include <random>
 #include <string>
 
-namespace detray {
+namespace detray::benchmarks {
 
 template <typename propagator_t, typename bfield_t,
-          propagate_option opt = propagate_option::e_unsync>
-struct propagation_bm : public benchmark_base {
+          detray::benchmarks::propagate_option opt =
+              detray::benchmarks::propagate_option::e_unsync>
+struct host_propagation_bm : public benchmark_base {
     /// Detector dependent types
     using algebra_t = typename propagator_t::detector_type::algebra_type;
     using scalar_t = dscalar<algebra_t>;
@@ -42,41 +43,30 @@ struct propagation_bm : public benchmark_base {
     configuration m_cfg{};
 
     /// Default construction
-    propagation_bm() = default;
+    host_propagation_bm() = default;
 
     /// Construct from an externally provided configuration @param cfg
-    propagation_bm(configuration cfg) : m_cfg{cfg} {}
+    explicit host_propagation_bm(const configuration &cfg) : m_cfg{cfg} {}
 
     /// @return the benchmark configuration
     configuration &config() { return m_cfg; }
 
     /// Prepare data and run benchmark loop
-    inline void operator()(
-        ::benchmark::State &state,
-        dvector<free_track_parameters<algebra_t>> *tracks_ptr,
-        const typename propagator_t::detector_type *det_ptr,
-        const bfield_t *bfield_ptr,
-        typename propagator_t::actor_chain_type::state_tuple *actor_states_ptr)
-        const {
+    inline void operator()(::benchmark::State &state,
+                           dvector<free_track_parameters<algebra_t>> *tracks,
+                           const typename propagator_t::detector_type *det,
+                           const bfield_t *bfield,
+                           typename propagator_t::actor_chain_type::state_tuple
+                               *input_actor_states) const {
 
         using actor_states_t =
             typename propagator_t::actor_chain_type::state_tuple;
 
-        auto &tracks{*tracks_ptr};
-        const auto &det{*det_ptr};
-        const auto &bfield{*bfield_ptr};
-        auto &input_actor_states{*actor_states_ptr};
-
         const int n_samples{m_cfg.benchmark().n_samples()};
         const int n_warmup{m_cfg.benchmark().n_warmup()};
 
-        assert(static_cast<std::size_t>(n_samples + n_warmup) <= tracks.size());
-
-        // Shuffle the sample
-        std::random_device rd;
-        std::mt19937 gen(rd());
-
-        std::shuffle(std::begin(tracks), std::end(tracks), gen);
+        assert(static_cast<std::size_t>(n_samples + n_warmup) <=
+               tracks->size());
 
         // Create propagator
         propagator_t p{m_cfg.propagation()};
@@ -85,8 +75,9 @@ struct propagation_bm : public benchmark_base {
         if (m_cfg.benchmark().do_warmup()) {
 #pragma omp parallel for
             for (int i = 0; i < n_warmup; ++i) {
+                const auto i_u{static_cast<std::size_t>(i)};
                 // Fresh copy of actor states
-                actor_states_t actor_state_tuple(input_actor_states);
+                actor_states_t actor_state_tuple(*input_actor_states);
                 // Tuple of references to pass to the propagator
                 typename propagator_t::actor_chain_type::state actor_states =
                     setup_actor_states<propagator_t>(
@@ -95,13 +86,20 @@ struct propagation_bm : public benchmark_base {
                             std::size_t,
                             detail::tuple_size_v<actor_states_t>>{});
 
-                typename propagator_t::state p_state(tracks[i], bfield, det);
+                typename propagator_t::state p_state((*tracks)[i_u], *bfield,
+                                                     *det);
+                // Particle hypothesis
+                auto &ptc = p_state._stepping.particle_hypothesis();
+                p_state.set_particle(
+                    update_particle_hypothesis(ptc, (*tracks)[i_u]));
 
                 // Run propagation
-                if constexpr (opt == propagate_option::e_unsync) {
+                if constexpr (opt ==
+                              detray::benchmarks::propagate_option::e_unsync) {
                     ::benchmark::DoNotOptimize(
                         p.propagate(p_state, actor_states));
-                } else if constexpr (opt == propagate_option::e_sync) {
+                } else if constexpr (opt == detray::benchmarks::
+                                                propagate_option::e_sync) {
                     ::benchmark::DoNotOptimize(
                         p.propagate_sync(p_state, actor_states));
                 }
@@ -109,35 +107,46 @@ struct propagation_bm : public benchmark_base {
         }
 
         // Run the benchmark
+        std::size_t total_tracks = 0u;
         for (auto _ : state) {
 #pragma omp parallel for
             for (int i = n_warmup; i < n_samples + n_warmup; ++i) {
+                const auto i_u{static_cast<std::size_t>(i)};
+
                 // Fresh copy of actor states
-                actor_states_t actor_state_tuple(input_actor_states);
+                actor_states_t actor_state_tuple(*input_actor_states);
                 // Tuple of references to pass to the propagator
                 typename propagator_t::actor_chain_type::state actor_states =
-                    setup_actor_states<propagator_t>(
+                    detray::benchmarks::setup_actor_states<propagator_t>(
                         actor_state_tuple,
                         std::make_integer_sequence<
                             std::size_t,
                             detail::tuple_size_v<actor_states_t>>{});
 
-                typename propagator_t::state p_state(tracks[i], bfield, det);
+                typename propagator_t::state p_state((*tracks)[i_u], *bfield,
+                                                     *det);
+                // Particle hypothesis
+                auto &ptc = p_state._stepping.particle_hypothesis();
+                p_state.set_particle(
+                    update_particle_hypothesis(ptc, (*tracks)[i_u]));
 
                 // Run propagation
-                if constexpr (opt == propagate_option::e_unsync) {
+                if constexpr (opt ==
+                              detray::benchmarks::propagate_option::e_unsync) {
                     ::benchmark::DoNotOptimize(
                         p.propagate(p_state, actor_states));
-                } else if constexpr (opt == propagate_option::e_sync) {
+                } else if constexpr (opt == detray::benchmarks::
+                                                propagate_option::e_sync) {
                     ::benchmark::DoNotOptimize(
                         p.propagate_sync(p_state, actor_states));
                 }
             }
+            total_tracks += static_cast<std::size_t>(n_samples);
         }
         // Report throughput
         state.counters["TracksPropagated"] = benchmark::Counter(
-            static_cast<double>(n_samples), benchmark::Counter::kIsRate);
+            static_cast<double>(total_tracks), benchmark::Counter::kIsRate);
     }
 };
 
-}  // namespace detray
+}  // namespace detray::benchmarks
diff --git a/tests/benchmarks/include/detray/benchmarks/device/CMakeLists.txt b/tests/benchmarks/include/detray/benchmarks/device/CMakeLists.txt
new file mode 100644
index 000000000..71bce8dfa
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/device/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Detray library, part of the ACTS project (R&D line)
+#
+# (c) 2024 CERN for the benefit of the ACTS project
+#
+# Mozilla Public License Version 2.0
+
+if(DETRAY_BUILD_CUDA)
+    add_subdirectory(cuda)
+endif()
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt b/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt
new file mode 100644
index 000000000..9548b0dd7
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Detray library, part of the ACTS project (R&D line)
+#
+# (c) 2024 CERN for the benefit of the ACTS project
+#
+# Mozilla Public License Version 2.0
+
+# C++17 support for CUDA requires CMake 3.18.
+cmake_minimum_required(VERSION 3.18)
+
+# Enable CUDA as a language.
+enable_language(CUDA)
+
+# Set the CUDA build flags.
+include(detray-compiler-options-cuda)
+
+# Set up a benchmark library for CUDA
+add_library(
+    detray_benchmark_cuda
+    STATIC
+    "propagation_benchmark.hpp"
+    "propagation_benchmark.cu"
+)
+
+add_library(detray::benchmark_cuda ALIAS detray_benchmark_cuda)
+
+target_link_libraries(
+    detray_benchmark_cuda
+    PUBLIC detray::benchmarks detray::core_array vecmem::cuda
+)
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.cu b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.cu
new file mode 100644
index 000000000..024698cf6
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.cu
@@ -0,0 +1,104 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022-2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#include "detray/benchmarks/device/cuda/propagation_benchmark.hpp"
+#include "detray/core/detector_metadata.hpp"
+#include "detray/definitions/detail/cuda_definitions.hpp"
+#include "detray/detectors/toy_metadata.hpp"
+
+namespace detray::benchmarks {
+
+template <typename propagator_t>
+__global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
+    propagation::config cfg,
+    typename propagator_t::detector_type::view_type det_view,
+    typename propagator_t::stepper_type::magnetic_field_type field_view,
+    vecmem::data::vector_view<
+        free_track_parameters<typename propagator_t::algebra_type>>
+        tracks_view,
+    const detray::benchmarks::propagate_option opt) {
+
+    using detector_device_t =
+        detector<typename propagator_t::detector_type::metadata,
+                 device_container_types>;
+    using algebra_t = typename detector_device_t::algebra_type;
+    using propagator_device_t =
+        propagator<typename propagator_t::stepper_type,
+                   navigator<detector_device_t>,
+                   typename propagator_t::actor_chain_type>;
+
+    detector_device_t det(det_view);
+    vecmem::device_vector<free_track_parameters<algebra_t>> tracks(tracks_view);
+
+    int gid = threadIdx.x + blockIdx.x * blockDim.x;
+    if (gid >= tracks.size()) {
+        return;
+    }
+
+    // Create propagator
+    propagator_device_t p{cfg};
+
+    typename parameter_transporter<algebra_t>::state transporter_state{};
+    typename pointwise_material_interactor<algebra_t>::state interactor_state{};
+    typename parameter_resetter<algebra_t>::state resetter_state{};
+
+    // Create the actor states
+    auto actor_states =
+        tie(transporter_state, interactor_state, resetter_state);
+
+    // Create the propagator state
+    typename propagator_device_t::state p_state(tracks.at(gid), field_view,
+                                                det);
+    // Particle hypothesis
+    auto& ptc = p_state._stepping.particle_hypothesis();
+    p_state.set_particle(update_particle_hypothesis(ptc, tracks.at(gid)));
+
+    // Run propagation
+    if (opt == detray::benchmarks::propagate_option::e_unsync) {
+        p.propagate(p_state, actor_states);
+    } else if (opt == detray::benchmarks::propagate_option::e_sync) {
+        p.propagate_sync(p_state, actor_states);
+    }
+}
+
+template <typename propagator_t>
+void run_propagation_kernel(
+    const propagation::config& cfg,
+    typename propagator_t::detector_type::view_type det_view,
+    typename propagator_t::stepper_type::magnetic_field_type field_view,
+    vecmem::data::vector_view<
+        free_track_parameters<typename propagator_t::algebra_type>>
+        tracks_view,
+    const int n_samples, const detray::benchmarks::propagate_option opt) {
+
+    constexpr int thread_dim = 256;
+    int block_dim = (n_samples + thread_dim - 1) / thread_dim;
+
+    // run the test kernel
+    propagator_benchmark_kernel<propagator_t><<<block_dim, thread_dim>>>(
+        cfg, det_view, field_view, tracks_view, opt);
+
+    // cuda error check
+    DETRAY_CUDA_ERROR_CHECK(cudaGetLastError());
+    DETRAY_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+}
+
+/// Macro declaring the template instantiations for the different detector types
+#define DECLARE_PROPAGATION_BENCHMARK(METADATA, FIELD)                \
+                                                                      \
+    template void                                                     \
+    run_propagation_kernel<cuda_propagator_type<METADATA, FIELD>>(    \
+        const propagation::config&, detector<METADATA>::view_type,    \
+        covfie::field_view<FIELD>,                                    \
+        vecmem::data::vector_view<                                    \
+            free_track_parameters<detector<METADATA>::algebra_type>>, \
+        const int, const detray::benchmarks::propagate_option);
+
+DECLARE_PROPAGATION_BENCHMARK(default_metadata, bfield::const_bknd_t)
+DECLARE_PROPAGATION_BENCHMARK(toy_metadata, bfield::const_bknd_t)
+
+}  // namespace detray::benchmarks
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp
new file mode 100644
index 000000000..577860123
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp
@@ -0,0 +1,136 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// Project include(s)
+#include "detray/definitions/detail/algebra.hpp"
+#include "detray/detectors/bfield.hpp"
+#include "detray/navigation/navigator.hpp"
+#include "detray/propagator/actor_chain.hpp"
+#include "detray/propagator/actors/aborters.hpp"
+#include "detray/propagator/actors/parameter_resetter.hpp"
+#include "detray/propagator/actors/parameter_transporter.hpp"
+#include "detray/propagator/actors/pointwise_material_interactor.hpp"
+#include "detray/propagator/propagator.hpp"
+#include "detray/propagator/rk_stepper.hpp"
+#include "detray/tracks/tracks.hpp"
+
+// Detray benchmark include(s)
+#include "detray/benchmarks/benchmark_base.hpp"
+#include "detray/benchmarks/propagation_benchmark_config.hpp"
+#include "detray/benchmarks/propagation_benchmark_utils.hpp"
+
+// Vecmem include(s)
+#include <vecmem/memory/cuda/device_memory_resource.hpp>
+#include <vecmem/memory/host_memory_resource.hpp>
+#include <vecmem/memory/memory_resource.hpp>
+#include <vecmem/utils/cuda/copy.hpp>
+
+// Benchmark include
+#include <benchmark/benchmark.h>
+
+// System include(s)
+#include <algorithm>
+#include <cassert>
+#include <random>
+#include <string>
+
+namespace detray::benchmarks {
+
+// Define propagator type
+template <typename algebra_t>
+using default_chain = actor_chain<dtuple, parameter_transporter<algebra_t>,
+                                  pointwise_material_interactor<algebra_t>,
+                                  parameter_resetter<algebra_t>>;
+
+template <typename metadata_t, typename bfield_t>
+using cuda_propagator_type =
+    propagator<rk_stepper<covfie::field_view<bfield_t>,
+                          typename detector<metadata_t>::algebra_type>,
+               navigator<detector<metadata_t>>,
+               default_chain<typename detector<metadata_t>::algebra_type>>;
+
+/// Launch the propagation kernel for benchmarking
+///
+/// @param cfg the propagation configuration
+/// @param det_view the detector vecmem view
+/// @param field_data the magnetic field view (may be an empty field)
+/// @param tracks_data the track collection view
+/// @param n_samples the number of tracks the kernel launch is sized for
+/// @param opt which propagation to run (sync vs. unsync)
+template <typename propagator_t>
+void run_propagation_kernel(
+    const propagation::config &cfg,
+    typename propagator_t::detector_type::view_type det_view,
+    typename propagator_t::stepper_type::magnetic_field_type field_data,
+    vecmem::data::vector_view<
+        free_track_parameters<typename propagator_t::algebra_type>>
+        tracks_data,
+    const int n_samples, const detray::benchmarks::propagate_option opt);
+
+/// Device propagation benchmark
+template <typename propagator_t, typename bfield_bknd_t,
+          detray::benchmarks::propagate_option opt =
+              detray::benchmarks::propagate_option::e_unsync>
+struct cuda_propagation_bm : public benchmark_base {
+    /// Detector dependent types
+    using algebra_t = typename propagator_t::detector_type::algebra_type;
+    using scalar_t = dscalar<algebra_t>;
+    using vector3_t = dvector3D<algebra_t>;
+
+    /// Local configuration type
+    using configuration = propagation_benchmark_config;
+
+    /// The benchmark configuration
+    configuration m_cfg{};
+
+    /// Default construction
+    cuda_propagation_bm() = default;
+
+    /// Construct from an externally provided configuration @param cfg
+    explicit cuda_propagation_bm(const configuration &cfg) : m_cfg{cfg} {}
+
+    /// @return the benchmark configuration
+    configuration &config() { return m_cfg; }
+
+    /// Prepare data and run benchmark loop
+    inline void operator()(::benchmark::State &state,
+                           vecmem::memory_resource *dev_mr,
+                           dvector<free_track_parameters<algebra_t>> *tracks,
+                           const typename propagator_t::detector_type *det,
+                           const bfield_bknd_t *bfield) const {
+
+        // Helper object for performing memory copies (to CUDA devices)
+        vecmem::cuda::copy cuda_cpy;
+
+        const int n_samples{m_cfg.benchmark().n_samples()};
+
+        // Copy the track collection to device
+        auto track_buffer =
+            detray::get_buffer(vecmem::get_data(*tracks), *dev_mr, cuda_cpy);
+
+        // Copy the detector to device and get its view
+        auto det_buffer = detray::get_buffer(*det, *dev_mr, cuda_cpy);
+        auto det_view = detray::get_data(det_buffer);
+
+        std::size_t total_tracks = 0u;
+        for (auto _ : state) {
+            // Launch the propagator test for GPU device
+            run_propagation_kernel<propagator_t>(m_cfg.propagation(), det_view,
+                                                 *bfield, track_buffer,
+                                                 n_samples, opt);
+
+            total_tracks += static_cast<std::size_t>(n_samples);
+        }
+        // Report throughput
+        state.counters["TracksPropagated"] = benchmark::Counter(
+            static_cast<double>(total_tracks), benchmark::Counter::kIsRate);
+    }
+};
+
+}  // namespace detray::benchmarks
diff --git a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_config.hpp b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_config.hpp
index 4c2f52c66..32dbcc617 100644
--- a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_config.hpp
+++ b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_config.hpp
@@ -5,14 +5,17 @@
  * Mozilla Public License Version 2.0
  */
 
+#pragma once
+
 // Project include(s)
 #include "detray/benchmarks/benchmark_base.hpp"
 #include "detray/propagator/propagation_config.hpp"
 
 // System include(s)
 #include <string>
+#include <string_view>
 
-namespace detray {
+namespace detray::benchmarks {
 
 /// Configuration for propagation benchmarks
 struct propagation_benchmark_config {
@@ -27,8 +30,8 @@ struct propagation_benchmark_config {
     propagation_benchmark_config() = default;
 
     /// Construct from a base configuration
-    propagation_benchmark_config(
-        const detray::benchmark_base::configuration& bench_cfg)
+    explicit propagation_benchmark_config(
+        const benchmark_base::configuration& bench_cfg)
         : m_benchmark(bench_cfg) {}
 
     /// Getters
@@ -44,11 +47,11 @@ struct propagation_benchmark_config {
 
     /// Setters
     /// @{
-    propagation_benchmark_config& name(std::string& n) {
+    propagation_benchmark_config& name(const std::string_view n) {
         m_name = n;
         return *this;
     }
     /// @}
 };
 
-}  // namespace detray
+}  // namespace detray::benchmarks
diff --git a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
index 80f065be1..dff6f844c 100644
--- a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
+++ b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
@@ -5,10 +5,10 @@
  * Mozilla Public License Version 2.0
  */
 
+#pragma once
+
 // Project include(s)
-#include "detray/core/detail/container_views.hpp"
 #include "detray/definitions/detail/algebra.hpp"
-#include "detray/definitions/detail/containers.hpp"
 #include "detray/navigation/navigator.hpp"
 #include "detray/propagator/actor_chain.hpp"
 #include "detray/propagator/propagator.hpp"
@@ -22,10 +22,11 @@
 #include <benchmark/benchmark.h>
 
 // System include(s)
+#include <type_traits>
 #include <utility>
 #include <vector>
 
-namespace detray {
+namespace detray::benchmarks {
 
 /// Which propagate function to run
 enum class propagate_option {
@@ -33,20 +34,42 @@ enum class propagate_option {
     e_sync = 1,
 };
 
-/// Define propagator type
-template <typename stepper_t, typename detector_t,
-          typename actor_chain_t = actor_chain<>>
-using propagator_t =
-    propagator<stepper_t, navigator<detector_t>, actor_chain_t>;
+/// @returns the default track generation configuration for detray benchmarks
+template <typename track_generator_t>
+inline typename track_generator_t::configuration get_default_trk_gen_config(
+    const std::vector<int> &n_tracks) {
+
+    using track_t = typename track_generator_t::track_type;
+    using scalar_t = dscalar<typename track_t::algebra_type>;
+
+    int n_trks{*std::ranges::max_element(n_tracks)};
+
+    // Generate tracks
+    typename track_generator_t::configuration trk_cfg{};
+    trk_cfg.n_tracks(static_cast<std::size_t>(n_trks));
+    trk_cfg.randomize_charge(true);
+    trk_cfg.phi_range(-constant<scalar_t>::pi, constant<scalar_t>::pi);
+    trk_cfg.eta_range(-3.f, 3.f);
+    trk_cfg.mom_range(1.f * unit<scalar_t>::GeV, 100.f * unit<scalar_t>::GeV);
+    trk_cfg.origin({0.f, 0.f, 0.f});
+    trk_cfg.origin_stddev({0.f * unit<scalar_t>::mm, 0.f * unit<scalar_t>::mm,
+                           0.f * unit<scalar_t>::mm});
+
+    return trk_cfg;
+}
 
 /// Precompute the tracks
 template <typename track_generator_t>
 inline auto generate_tracks(
     vecmem::memory_resource *mr,
-    const typename track_generator_t::configuration &cfg = {}) {
+    const typename track_generator_t::configuration &cfg = {},
+    bool do_sort = true) {
+
+    using track_t = typename track_generator_t::track_type;
+    using scalar_t = dscalar<typename track_t::algebra_type>;
 
     // Track collection
-    dvector<typename track_generator_t::track_type> tracks(mr);
+    dvector<track_t> tracks(mr);
 
     // Iterate through uniformly distributed momentum directions
     for (auto track : track_generator_t{cfg}) {
@@ -54,12 +77,23 @@ inline auto generate_tracks(
         tracks.push_back(track);
     }
 
+    if (do_sort) {
+        // Sort by ascending |pi/2 - theta| of the track direction
+        const auto traj_comp = [](const auto &lhs, const auto &rhs) {
+            constexpr auto pi_2{constant<scalar_t>::pi_2};
+            return math::fabs(pi_2 - vector::theta(lhs.dir())) <
+                   math::fabs(pi_2 - vector::theta(rhs.dir()));
+        };
+
+        std::ranges::sort(tracks, traj_comp);
+    }
+
     return tracks;
 }
 
 /// Tie the actor states for the propagation
 template <typename propagator_t, std::size_t... I>
-inline constexpr auto setup_actor_states(
+constexpr auto setup_actor_states(
     typename propagator_t::actor_chain_type::state_tuple &input_actor_states,
     std::index_sequence<I...>) {
 
@@ -74,9 +108,8 @@ inline constexpr auto setup_actor_states(
 /// Register a propagation benchmark of type @tparam benchmark_t
 ///
 /// @tparam benchmark_t the propagation benchmark functor
-/// @tparam stepper_t the stepper to use fro track parameter transport
-/// @tparam actor_chain_t types of actors
-/// @tparam detector_t detector type
+/// @tparam propagator_t full propagator type
+/// @tparam detector_t host detector type
 /// @tparam bfield_t covfie magnetic field type
 ///
 /// @param name name for the benchmark
@@ -88,34 +121,82 @@ inline constexpr auto setup_actor_states(
 ///                     actor_chain_t)
 /// @param tracks the pre-computed test tracks
 /// @param n_samples the number of track to run
-template <template <typename, typename> class benchmark_t, typename stepper_t,
-          typename actor_chain_t, typename detector_t, typename bfield_t>
+template <template <typename, typename, detray::benchmarks::propagate_option>
+          class benchmark_t,
+          typename propagator_t, typename detector_t, typename bfield_bknd_t,
+          detray::benchmarks::propagate_option kOPT =
+              detray::benchmarks::propagate_option::e_sync>
 inline void register_benchmark(
     const std::string &name, benchmark_base::configuration &bench_cfg,
-    propagation::config &prop_cfg, const detector_t &det,
-    const bfield_t &bfield, typename actor_chain_t::state_tuple &actor_states,
+    propagation::config &prop_cfg, const detector_t &det, bfield_bknd_t &bfield,
     dvector<free_track_parameters<typename detector_t::algebra_type>> &tracks,
-    const std::vector<int> &n_samples = {10000}) {
+    const std::vector<int> &n_samples = {10000},
+    vecmem::memory_resource *dev_mr = nullptr,
+    typename propagator_t::actor_chain_type::state_tuple *actor_states =
+        nullptr) {
+
+    using algebra_t = typename detector_t::algebra_type;
+    using propagation_benchmark_t =
+        benchmark_t<propagator_t, bfield_bknd_t, kOPT>;
 
     for (int n : n_samples) {
 
         bench_cfg.n_samples(n);
 
-        // Without covariance transport
-        benchmark_t<propagator_t<stepper_t, detector_t, actor_chain_t>,
-                    bfield_t>
-            prop_benchmark{bench_cfg};
-        prop_benchmark.config().propagation() = prop_cfg;
+        typename propagation_benchmark_t::configuration prop_bm_cfg{bench_cfg};
+        prop_bm_cfg.propagation() = prop_cfg;
+
+        // Configure the benchmark
+        propagation_benchmark_t prop_benchmark{prop_bm_cfg};
 
         std::string bench_name = prop_benchmark.config().name() + "_" + name +
                                  "_" + std::to_string(n) + "_TRACKS";
 
         std::cout << bench_name << "\n" << bench_cfg;
 
-        ::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
-                                       &tracks, &det, &bfield, &actor_states);
-        //->MeasureProcessCPUTime();
+        // Cpu benchmark
+        if constexpr (std::is_invocable_v<
+                          decltype(prop_benchmark), ::benchmark::State &,
+                          dvector<free_track_parameters<algebra_t>> *,
+                          const detector_t *, const bfield_bknd_t *,
+                          typename propagator_t::actor_chain_type::state_tuple
+                              *>) {
+            ::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
+                                           &tracks, &det, &bfield,
+                                           actor_states);
+            //->MeasureProcessCPUTime();
+        } else {
+
+            ::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
+                                           dev_mr, &tracks, &det, &bfield);
+            //->MeasureProcessCPUTime();
+        }
     }
 }
 
-}  // namespace detray
+/// Register a propagation benchmark of type @tparam benchmark_t
+///
+/// @tparam benchmark_t the propagation benchmark functor
+/// @tparam stepper_t the stepper to use for track parameter transport
+/// @tparam actor_chain_t types of actors
+template <template <typename, typename, detray::benchmarks::propagate_option>
+          class benchmark_t,
+          typename stepper_t, typename actor_chain_t, typename detector_t,
+          typename bfield_bknd_t,
+          detray::benchmarks::propagate_option kOPT =
+              detray::benchmarks::propagate_option::e_sync>
+inline void register_benchmark(
+    const std::string &name, benchmark_base::configuration &bench_cfg,
+    propagation::config &prop_cfg, const detector_t &det, bfield_bknd_t &bfield,
+    dvector<free_track_parameters<typename detector_t::algebra_type>> &tracks,
+    const std::vector<int> &n_samples = {10000},
+    typename actor_chain_t::state_tuple *actor_states = nullptr) {
+
+    using propagator_t =
+        propagator<stepper_t, navigator<detector_t>, actor_chain_t>;
+    register_benchmark<benchmark_t, propagator_t, detector_t, bfield_bknd_t,
+                       kOPT>(name, bench_cfg, prop_cfg, det, bfield, tracks,
+                             n_samples, nullptr, actor_states);
+}
+
+}  // namespace detray::benchmarks