Merge pull request acts-project#909 from niermann999/feat-benchmark-scripts

feat: benchmark scripts
stephenswat authored and niermann999 committed Jan 27, 2025
2 parents e8d7baa + 0641ed1 commit f011686
Showing 14 changed files with 1,076 additions and 41 deletions.
25 changes: 21 additions & 4 deletions README.md
@@ -102,13 +102,15 @@ detray-build/bin/detray_detector_validation \
```
In case of failures, this command will produce detailed debug output in the form of a log file, as well as an SVG representation of the failed tracks. The grid file is optional, but will trigger the use of spatial grids as acceleration structures during the navigation run.

Note: The `search_window` option defines the size of the lookup area of the grid acceleration structure and is therefore detector dependent! Use `--search_window 3 3` (or larger) for the *toy detector* and *wire chamber* example detectors and `--search_window 0 0` otherwise.

### Material Validation

This tool checks whether the navigator picks up the material correctly by comparing the material found during a ray scan with the material collected during navigation by a specialized actor:
```shell
detray-build/bin/detray_material_validation \
--geometry_file toy_detector/toy_detector_geometry.json \
--material_file toy_detector/toy_detector_homogeneous_material.json \
--geometry_file ./toy_detector/toy_detector_geometry.json \
--material_file ./toy_detector/toy_detector_homogeneous_material.json \
--phi_steps 100 --eta_steps 100 --eta_range -4 4
```
Note: The correct material file must be loaded in addition to the geometry file!
@@ -122,9 +124,24 @@ detray-build/bin/detray_propagation_benchmark_<backend>_<algebra> \
--geometry_file ./toy_detector/toy_detector_geometry.json \
--grid_file ./toy_detector/toy_detector_surface_grids.json \
--material_file ./toy_detector/toy_detector_homogeneous_material.json \
--sort_tracks --randomize_charge --eta_range -3 3 -pT_range 1 100
--sort_tracks --randomize_charge --eta_range -3 3 --pT_range 0.5 100 \
--search_window 3 3 --covariance_transport
```
For every algebra-plugin that was built, a corresponding benchmark executable will be present. The CPU-backend benchmark is built by default and the CUDA-backend benchmark will be available if detray was built with CUDA enabled (`-DDETRAY_BUILD_CUDA=ON`). This executable can additionally be configured with any arguments targeted at [google benchmark](https://github.com/google/benchmark/blob/main/docs/user_guide.md).

If the data is dumped into json files using the options `--benchmark_out_format=json` and `--benchmark_out=<detector_name>_benchmark_data_<backend>_<algebra>.json`, it can afterwards be plotted with e.g.:
```shell
python3 detray/tests/tools/python/propagation_benchmarks.py \
--geometry_file ./toy_detector/toy_detector_geometry.json \
--algebra_plugins array eigen \
--cuda \
--data_files ./toy_detector_benchmark_data_cpu_array.json \
./toy_detector_benchmark_data_cpu_eigen.json \
./toy_detector_benchmark_data_cuda_array.json \
./toy_detector_benchmark_data_cuda_eigen.json
```

using the *std::array* and [*Eigen3*](https://eigen.tuxfamily.org) based linear algebra plugins with the CUDA backend as an example.

### Continuous benchmark

2 changes: 1 addition & 1 deletion core/include/detray/propagator/propagation_config.hpp
@@ -33,7 +33,7 @@ struct config {
<< "----------------------------\n"
<< cfg.stepping << "\nGeometry Context\n"
<< "----------------------------\n"
<< cfg.context.get() << "\n";
<< " No. : " << cfg.context.get() << "\n";

return out;
}
51 changes: 45 additions & 6 deletions core/include/detray/utils/type_list.hpp
@@ -1,6 +1,6 @@
/** Detray library, part of the ACTS project (R&D line)
*
* (c) 2023 CERN for the benefit of the ACTS project
* (c) 2023-2025 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/
@@ -12,6 +12,7 @@
#include "detray/utils/type_traits.hpp"

// System include(s)
#include <ranges>
#include <string>
#include <string_view>
#include <type_traits>
@@ -142,27 +143,65 @@ std::string demangle_type_name() {
return std::string{function.substr(start, size)};
}

/// @returns the name of a type as string
/// @tparam T the type
template <typename T>
std::string get_name(bool full = false) {
std::string tp_str{""};
try {
tp_str = detray::types::demangle_type_name<T>();
} catch (...) {
return "unknown";
}

if (tp_str.empty()) {
return "unknown";
}

if (full) {
return tp_str;
}

// Remove the template argument list
dvector<std::string> tokens{};
for (const auto t : std::views::split(tp_str, '<')) {
tokens.emplace_back(t.begin(), t.end());
}

    // Split at the first occurrence of '<'
tp_str = tokens.front();
tokens.clear();

// Strip the namespaces and qualifiers
for (const auto t : std::views::split(tp_str, ':')) {
tokens.emplace_back(t.begin(), t.end());
}

// Split at the last occurrence of ':'
return tokens.back();
}
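
For context, the new `get_name` helper cuts the demangled string at the first `<` (dropping the template argument list) and then keeps only the token after the last `:` (dropping namespaces and qualifiers). A minimal standalone sketch of that stripping idea, using plain `std::string_view` rather than the detray helpers, might look like this:

```cpp
// Illustrative sketch only: mirrors the stripping logic of get_name above,
// without the demangling step or the detray container types.
#include <cstdio>
#include <string>
#include <string_view>

std::string strip_name(std::string_view full_name) {
    // Drop the template argument list: "std::vector<int>" -> "std::vector"
    if (const auto lt = full_name.find('<'); lt != std::string_view::npos) {
        full_name = full_name.substr(0u, lt);
    }
    // Drop namespaces and qualifiers: "std::vector" -> "vector"
    if (const auto col = full_name.rfind(':'); col != std::string_view::npos) {
        full_name = full_name.substr(col + 1u);
    }
    return std::string{full_name};
}

int main() {
    // Prints "vector" and "map"
    std::printf("%s\n", strip_name("std::vector<int, std::allocator<int>>").c_str());
    std::printf("%s\n", strip_name("detray::detail::map<int, float>").c_str());
}
```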

template <typename = void>
struct print {};

template <typename... Ts>
struct print<list<Ts...>> {

template <typename P = void, typename... Ps>
void print_typeid() {
void print_typeid(bool full) {

std::printf("%s", demangle_type_name<P>().c_str());
std::printf("%s", get_name<P>(full).c_str());

// Keep unrolling the pack
if constexpr (sizeof...(Ps) > 0) {
std::printf(", ");
return print_typeid<Ps...>();
return print_typeid<Ps...>(full);
}
}

print() {
print(bool full = true) {
std::printf("type_list<");
print_typeid<Ts...>();
print_typeid<Ts...>(full);
std::printf(">\n");
}
};
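
The variadic `print_typeid` above unrolls the type pack recursively. A small self-contained sketch of that pack-printing pattern (using `typeid` names instead of detray's `get_name`; names here are illustrative only) could be:

```cpp
// Illustrative sketch of recursive parameter-pack printing; typeid(...).name()
// returns compiler-mangled names (e.g. "i" for int with GCC/Clang).
#include <cstdio>
#include <typeinfo>

template <typename P, typename... Ps>
void print_typeid_sketch() {
    std::printf("%s", typeid(P).name());
    // Keep unrolling the pack until it is exhausted
    if constexpr (sizeof...(Ps) > 0) {
        std::printf(", ");
        print_typeid_sketch<Ps...>();
    }
}

template <typename... Ts>
void print_list_sketch() {
    std::printf("type_list<");
    print_typeid_sketch<Ts...>();
    std::printf(">\n");
}

int main() {
    // With GCC/Clang this prints "type_list<i, f, d>"
    print_list_sketch<int, float, double>();
}
```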
@@ -19,6 +19,11 @@
// Benchmark include
#include <benchmark/benchmark.h>

#ifdef _OPENMP
// OpenMP
#include <omp.h>
#endif

// System include(s)
#include <algorithm>
#include <cassert>
@@ -57,7 +62,8 @@ struct host_propagation_bm : public benchmark_base {
const dvector<free_track_parameters<algebra_t>> *tracks,
const typename propagator_t::detector_type *det, const bfield_t *bfield,
const typename propagator_t::actor_chain_type::state_tuple
*input_actor_states) const {
*input_actor_states,
const int n_threads, const int thread_schedule) const {
using actor_chain_t = typename propagator_t::actor_chain_type;
using actor_states_t = typename actor_chain_t::state_tuple;

@@ -66,6 +72,14 @@
assert(bfield != nullptr);
assert(input_actor_states != nullptr);

#ifdef _OPENMP
// Set the number of threads for the OpenMP parallel regions
omp_set_num_threads(n_threads);
const int chunk_size{static_cast<int>(tracks->size() / n_threads)};
omp_set_schedule(static_cast<omp_sched_t>(thread_schedule), chunk_size);
// std::cout << "CHUNK SIZE " << chunk_size << std::endl;
#endif

const int n_samples{m_cfg.benchmark().n_samples()};
const int n_warmup{m_cfg.benchmark().n_warmup()};

@@ -108,8 +122,11 @@
stride = (stride == 0) ? 10 : stride;
assert(stride > 0);

#pragma omp parallel for schedule(dynamic)
#pragma omp parallel for if (n_threads > 1)
for (int i = 0; i < n_samples; i += stride) {
/*std::cout << "n threads " << omp_get_num_threads() <<
std::endl; omp_sched_t kind; int c; omp_get_schedule(&kind, &c);
std::cout << "chunk size " << c << std::endl;*/
// The track gets copied into the stepper state, so that the
// original track sample vector remains unchanged
run_propagation((*tracks)[static_cast<std::size_t>(i)]);
@@ -126,12 +143,12 @@
// https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
std::size_t total_tracks = 0u;
for (auto _ : state) {
#pragma omp parallel for schedule(dynamic)
#pragma omp parallel for if (n_threads > 1)
for (int i = 0; i < n_samples; ++i) {
run_propagation((*tracks)[static_cast<std::size_t>(i)]);
}
total_tracks += static_cast<std::size_t>(n_samples);
}
total_tracks += static_cast<std::size_t>(n_samples);
// Report throughput
state.counters["TracksPropagated"] = benchmark::Counter(
static_cast<double>(total_tracks), benchmark::Counter::kIsRate);
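
The new host-side benchmark plumbing above configures OpenMP at run time: a requested thread count, a schedule kind with a chunk size derived from the sample size, and a parallel loop that is only activated for more than one thread. A stripped-down, self-contained sketch of that pattern follows; the names and the doubling workload are illustrative, and an explicit `schedule(runtime)` clause ties the loop to the schedule chosen via `omp_set_schedule`:

```cpp
// Illustrative OpenMP sketch: runtime thread count and schedule selection.
// omp_sched_static = 1, omp_sched_dynamic = 2, omp_sched_guided = 3
#include <omp.h>

#include <cstdio>
#include <vector>

void process(std::vector<double> &data, int n_threads, int thread_schedule) {
    // Set the number of threads for subsequent parallel regions
    omp_set_num_threads(n_threads);

    // Pick the schedule kind and chunk size at run time
    const int chunk_size{static_cast<int>(data.size()) / n_threads};
    omp_set_schedule(static_cast<omp_sched_t>(thread_schedule), chunk_size);

    // schedule(runtime) makes the loop honour the schedule set above;
    // the if-clause skips threading entirely for a single-thread run
#pragma omp parallel for schedule(runtime) if (n_threads > 1)
    for (int i = 0; i < static_cast<int>(data.size()); ++i) {
        data[static_cast<std::size_t>(i)] *= 2.0;
    }
}

int main() {
    std::vector<double> samples(10000u, 1.0);
    process(samples, 4, /*omp_sched_dynamic*/ 2);
    std::printf("%.1f\n", samples.front());  // prints "2.0"
}
```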
@@ -22,6 +22,7 @@
#include <benchmark/benchmark.h>

// System include(s)
#include <thread>
#include <type_traits>
#include <utility>
#include <vector>
@@ -145,17 +146,26 @@ inline void register_benchmark(
dvector<free_track_parameters<typename detector_t::algebra_type>>>
&track_samples,
const std::vector<int> &n_samples = {10000},
vecmem::memory_resource *dev_mr = nullptr) {
vecmem::memory_resource *dev_mr = nullptr,
const std::vector<int> &n_host_threads = {static_cast<int>(
std::thread::hardware_concurrency())},
int openmp_sched = 2) {

using algebra_t = typename detector_t::algebra_type;
using propagation_benchmark_t =
benchmark_t<propagator_t, bfield_bknd_t, kOPT>;

assert(track_samples.size() == n_samples.size());

for (const auto [i, n] : detray::views::enumerate(n_samples)) {
const std::size_t bench_range{
math::max(n_samples.size(), n_host_threads.size())};
for (std::size_t i = 0u; i < bench_range; ++i) {

auto &tracks = track_samples[i];
auto &tracks =
track_samples.size() == 1u ? track_samples[0] : track_samples[i];
int host_threads{n_host_threads.size() == 1u ? n_host_threads[0]
: n_host_threads[i]};
const int n{n_samples.size() == 1u ? n_samples[0] : n_samples[i]};
assert(static_cast<std::size_t>(n) <= tracks.size());

bench_cfg.n_samples(n);
@@ -176,11 +186,12 @@
dvector<free_track_parameters<algebra_t>> *,
const detector_t *, const bfield_bknd_t *,
typename propagator_t::actor_chain_type::state_tuple
*>) {
*,
int, int>) {
// Cpu benchmark
::benchmark::RegisterBenchmark(bench_name.c_str(), prop_benchmark,
&tracks, &det, &bfield,
actor_states);
&tracks, &det, &bfield, actor_states,
host_threads, openmp_sched);
//->MeasureProcessCPUTime();
} else {
// Device benchmark
@@ -210,13 +221,17 @@
std::vector<
dvector<free_track_parameters<typename detector_t::algebra_type>>>
&tracks,
const std::vector<int> &n_samples = {10000}) {
const std::vector<int> &n_samples = {10000},
const std::vector<int> &n_host_threads = {static_cast<int>(
std::thread::hardware_concurrency())},
int openmp_sched = 2) {

using propagator_t =
propagator<stepper_t, navigator<detector_t>, actor_chain_t>;
register_benchmark<benchmark_t, propagator_t, detector_t, bfield_bknd_t,
kOPT>(name, bench_cfg, prop_cfg, det, bfield,
actor_states, tracks, n_samples, nullptr);
actor_states, tracks, n_samples, nullptr,
n_host_threads, openmp_sched);
}

} // namespace detray::benchmarks
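
The updated `register_benchmark` above lets single-element option vectors (track samples, sample counts, host thread counts) apply to every registered case, while longer vectors are indexed per case. A standalone sketch of that broadcast-or-index pattern, with illustrative names rather than the detray API:

```cpp
// Illustrative sketch: broadcast size-1 option vectors over all benchmark cases
#include <algorithm>
#include <cstdio>
#include <thread>
#include <vector>

void register_cases(const std::vector<int> &n_samples,
                    const std::vector<int> &n_host_threads) {
    const std::size_t bench_range{
        std::max(n_samples.size(), n_host_threads.size())};

    for (std::size_t i = 0u; i < bench_range; ++i) {
        // A single entry is reused for every case, otherwise take the i-th one
        const int n{n_samples.size() == 1u ? n_samples[0] : n_samples[i]};
        const int threads{n_host_threads.size() == 1u ? n_host_threads[0]
                                                      : n_host_threads[i]};
        std::printf("case %zu: %d tracks on %d host threads\n", i, n, threads);
    }
}

int main() {
    // Several sample sizes, one thread count: the thread count is broadcast
    register_cases({1000, 10000, 100000},
                   {static_cast<int>(std::thread::hardware_concurrency())});
}
```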
6 changes: 6 additions & 0 deletions tests/tools/python/impl/__init__.py
@@ -1,3 +1,9 @@
from .plot_benchmark_results import (
read_benchmark_data,
prepare_benchmark_data,
plot_benchmark_case,
plot_benchmark_data,
)
from .plot_navigation_validation import (
read_scan_data,
read_navigation_data,