Update squeeze-excitation + ladder/legality/liberty to gcp/next #97

Open
wants to merge 102 commits into base: patch-31

Changes from 1 commit (102 commits in this pull request)
488de43
Use L2-norm in self check.
Ttl Aug 6, 2018
d2ad525
OpenCL tuner fixes.
Ttl Aug 9, 2018
87c95c4
Change policy vector to array.
TFiFiE Aug 10, 2018
e72496d
Fall back to single precision net on breakage.
ihavnoid Aug 14, 2018
681229a
AutoGTP: use compressed weights networks.
marcocalignano Aug 14, 2018
07c908e
Fix OpenCL buffer sizes.
Ttl Aug 14, 2018
f85a685
Script for quantizing weights.
Ttl Aug 16, 2018
ebfe51a
Network initialization restructuring.
ihavnoid Aug 20, 2018
7e889c7
Fix comments, code style.
gcp Aug 20, 2018
8bb0da6
Validation: support GTP commands for each binary.
Hersmunch Aug 20, 2018
bd36100
Don't refer to stone locations as "squares".
TFiFiE Aug 20, 2018
6eecb1e
Don't use "void" as function parameter.
TFiFiE Sep 3, 2018
0549816
Isolate and clean up text-to-vertex conversion.
TFiFiE Sep 5, 2018
73f1f93
Packaging improvements.
infinity0 Sep 5, 2018
f3fbcaa
Improve MCTS a bit.
Sep 5, 2018
1042cb6
Convert string before variadic function call.
TFiFiE Sep 7, 2018
51cba90
Always expect 2 arguments after "play" command.
TFiFiE Sep 7, 2018
b290f47
Update README with new boost dependencies.
jest Sep 7, 2018
5d4bd2f
Fix boost package reference for VS2017 build.
kuba97531 Sep 13, 2018
5bd2ef4
Added missing files to MSVC 2015 projects.
ihavnoid Sep 13, 2018
dd95cab
Make Winograd matrices global.
Ttl Sep 13, 2018
5412e66
OpenCL : Don't copy on weight construction.
ihavnoid Sep 13, 2018
7e13bf0
Winograd filter transform and CPU in transform optimization.
Ttl Sep 13, 2018
15e1bd1
"Lockless" UCTNode.
ihavnoid Sep 13, 2018
cd48427
Pass network weight as a std::shared_ptr class.
ihavnoid Sep 13, 2018
0a0d134
Fix vectorized Winograd transform.
Ttl Sep 14, 2018
c21c8a4
Remove unused lambda capture.
TFiFiE Sep 17, 2018
cff3917
Reduce network memory usage when autodetecting.
ihavnoid Sep 17, 2018
8b628ea
Make maximum memory consumption configurable.
kuba97531 Sep 17, 2018
c6999fc
Assorted style nits and minor bugfixes.
gcp Sep 17, 2018
aaf1038
Fix "NN eval" so it is never the search result.
gcp Sep 17, 2018
71c6a36
Update .gitignore to include ".vs/".
AncalagonX Sep 19, 2018
bf2e767
Add some more const correctness.
nerai Sep 24, 2018
dac5a1f
Fixes assert failure on wait_expanded().
ihavnoid Sep 24, 2018
8abf0d2
Only run assertion logic in debug mode.
gcp Sep 24, 2018
a0f60cb
Make lz-analyze output policy prior.
alreadydone Sep 24, 2018
c64dd2a
Fix up lz-analyse formats.
gcp Sep 24, 2018
142199c
Fix memory estimation for auto-detected gpu.
kuba97531 Sep 25, 2018
72431e2
Include Eigen as BLAS replacement.
gcp Sep 26, 2018
0b3ee48
Add side to move in lz-analyze command.
gcp Sep 26, 2018
04aeb54
Autoselect half mode for fp16 compute.
gcp Sep 26, 2018
720d5af
Don't let printsgf output consecutive newlines.
gcp Sep 26, 2018
8f6f830
Add Eigen include path to MSVC 2017 solution.
gcp Sep 26, 2018
e2d16fa
Remove set SDK in MSVC 2017 solution.
gcp Sep 26, 2018
408efbb
Appveyor jobs for msbuild in MSVC 2015/2017.
ChinChangYang Oct 1, 2018
a261168
Fix fp16/fp32 autodetection.
ihavnoid Oct 4, 2018
cd1de6e
Update README.md.
gcp Oct 12, 2018
6881787
Add GNUInstallDirs include.
gcp Oct 15, 2018
1c384e1
Implement more options in lz-setoption.
gcp Oct 15, 2018
4830a95
Fix assert-fail when memory is completely full.
ihavnoid Oct 15, 2018
7f5073e
Report tuner errors to stderr.
Ttl Oct 15, 2018
60f0cff
Fixes for various net initialization issues.
ihavnoid Oct 15, 2018
2e079fc
Update OpenCL headers link.
gcp Oct 22, 2018
8a57a85
Add missing GTP terminator for lz-setoption cases.
gcp Oct 22, 2018
4bd7cd4
Switch AutoGTP to HTTPS.
gcp Oct 22, 2018
a1a4af8
Remove COLAB Readme.
gcp Oct 23, 2018
ac88220
Update links and Todo in README.
gcp Oct 23, 2018
fc54323
Remove reference to Colab README.
gcp Oct 23, 2018
82d5f25
Tiny style fix.
gcp Oct 23, 2018
b2a40e4
Separate FPU-reduction variable for root.
TFiFiE Oct 29, 2018
40260b0
Link to instructions for running on the cloud.
wonderingabout Oct 29, 2018
a0baa60
Update FAQ.md.
LL145 Oct 29, 2018
2e4f3e6
Fix Windows flag check for input buffering.
gcp Oct 29, 2018
d1225db
Update AUTHORS.
gcp Oct 31, 2018
4fd6e69
Bump version numbers.
gcp Oct 31, 2018
6d16497
AutoGTP: update build dir of leelaz in README.md.
gcp Nov 2, 2018
1fe59c6
Correctly initialize board when reading SGF.
zliu1022 Nov 5, 2018
5cd4d8f
Increase memory limit for 32-bit builds.
gcp Nov 5, 2018
631b88f
Never select a CPU during OpenCL autodetection.
gcp Nov 5, 2018
6f58159
Fix tuner for heterogeneous GPUs and auto precision.
ihavnoid Nov 15, 2018
32c75e3
Optimized out and out_in kernels.
Ttl Nov 15, 2018
c72cb3a
Update OpenCL C++ headers.
gcp Nov 16, 2018
b833952
CPU-only eval performance optimization.
ihavnoid Nov 17, 2018
304f9c7
Convolve in/out performance optimization.
ihavnoid Nov 17, 2018
fc8d080
Validation: fix -k option.
Hersmunch Nov 17, 2018
6e88b95
Add link to Azure free trial instructions.
gcp Nov 19, 2018
666c0c6
Cleanup atomics and dead if.
sethtroisi Nov 20, 2018
8670a40
Const in SGFTree.
sethtroisi Nov 20, 2018
77582b9
Make the README more clear.
gcp Nov 26, 2018
8daa0cd
Refactor to allow AutoGTP to use Engine.
Hersmunch Nov 29, 2018
64097f0
Fix printf call style.
Dec 4, 2018
c157d0b
Update Khronos OpenCL C++ headers.
gcp Dec 7, 2018
bc3e750
Cleanup loop code.
sethtroisi Nov 20, 2018
d166740
AutoGTP: allow specifying an SGF as initial position.
Hersmunch Dec 19, 2018
08efb53
Support separate options for white in match games.
Hersmunch Dec 24, 2018
39be654
Add O(sqrt(log(n))) scaling to tree search.
Ttl Dec 7, 2018
21e3580
Option to get network output without writing to cache.
TFiFiE Dec 24, 2018
808bb43
Add permission to link with NVIDIA libs. Update year.
gcp Jan 4, 2019
ce41cc1
Add link to GoReviewPartner.
roy7 Jan 15, 2019
4ca0734
Reminder to install OpenCL driver if separate.
gcp Jan 15, 2019
d4c0380
Fixed leelaz_file on Android.
inclement Jan 15, 2019
f944b97
Fix 'catching polymorphic type by value' warning.
akdtg Jan 15, 2019
4f12925
Fixed converter script for minigo removing bias.
sethtroisi Jan 15, 2019
44d0e6a
Add zlib to the mac OS X build instructions.
gcp Jan 15, 2019
d192fc6
UCTNodePtr rare race condition fix.
ihavnoid Jan 15, 2019
bd0d667
Make sure analysis is printed at least once.
dbosst Jan 15, 2019
1960e93
Don't post if not requested.
gcp Jan 15, 2019
fc83ec7
AutoGTP: Allow specifying initial GTP commands.
Hersmunch Jan 15, 2019
c7feb53
Update Eigen to 3.3.7.
gcp Jan 15, 2019
085d71b
Fix lz-setoption name playouts.
gcp Jan 22, 2019
9831c96
AutoGTP: More info in SGF comments.
Hersmunch Jan 22, 2019
885b9eb
copy branch
alreadydone Jan 31, 2019
Fix tuner for heterogeneous GPUs and auto precision.
Fix full tuner for heterogeneous GPUs and auto precision detection.

--full-tuner implies --tune-only
--full-tuner requires an explicit precision

Fixes leela-zero#1973.

Pull request leela-zero#2004.
ihavnoid authored and gcp committed Nov 15, 2018
commit 6f58159a6b8166bead0968fa9d715209293197b0
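Before the per-file diffs, a condensed sketch of the two rules stated in the commit message. The free-standing function and names such as apply_tuner_options are illustrative only; the actual change lives in parse_commandline() in src/Leela.cpp below.

```cpp
#include <cstdio>
#include <cstdlib>

enum class precision_t { AUTO, SINGLE, HALF };

// Illustrative-only helper: --full-tuner implies --tune-only and
// requires an explicit precision.
void apply_tuner_options(bool full_tuner, bool& tune_only, precision_t precision) {
    if (full_tuner) {
        // The exhaustive tuner is far too slow to be followed by an
        // actual game in the same run, so force tune-only mode.
        tune_only = true;

        // Auto precision is not supported with exhaustive tuning.
        if (precision == precision_t::AUTO) {
            std::printf("Automatic precision not supported when doing exhaustive tuning\n");
            std::printf("Please add '--precision single' or '--precision half'\n");
            std::exit(EXIT_FAILURE);
        }
    }
}
```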
18 changes: 16 additions & 2 deletions src/Leela.cpp
@@ -92,8 +92,9 @@ static void parse_commandline(int argc, char *argv[]) {
("full-tuner", "Try harder to find an optimal OpenCL tuning.")
("tune-only", "Tune OpenCL only and then exit.")
#ifdef USE_HALF
("precision", po::value<std::string>(), "Floating-point precision (single/half/auto).\n"
"Default is to auto which automatically determines which one to use.")
("precision", po::value<std::string>(),
"Floating-point precision (single/half/auto).\n"
"Default is to auto which automatically determines which one to use.")
#endif
;
#endif
@@ -218,6 +219,11 @@ static void parse_commandline(int argc, char *argv[]) {

if (vm.count("full-tuner")) {
cfg_sgemm_exhaustive = true;

// --full-tuner auto-implies --tune-only. The full tuner is so slow
// that nobody will wait for it to finish before running a game.
// This simply prevents some edge cases from confusing other people.
cfg_tune_only = true;
}

if (vm.count("tune-only")) {
@@ -238,6 +244,14 @@ static void parse_commandline(int argc, char *argv[]) {
exit(EXIT_FAILURE);
}
}
if (cfg_precision == precision_t::AUTO) {
// Auto precision is not supported for full tuner cases.
if (cfg_sgemm_exhaustive) {
printf("Automatic precision not supported when doing exhaustive tuning\n");
printf("Please add '--precision single' or '--precision half'\n");
exit(EXIT_FAILURE);
}
}
#endif
#endif

10 changes: 6 additions & 4 deletions src/OpenCL.cpp
@@ -790,11 +790,13 @@ void OpenCL<net_t>::initialize(const int channels) {
auto sgemm_tuners =
t.load_sgemm_tuners(channels, WINOGRAD_P, channels, WINOGRAD_TILE);

// Exit immediately after tuning. Some NVIDIA drivers are buggy
// and will fail to compile the rest of the kernels after a tuning
// run. See #729.
// Some NVIDIA drivers are buggy and will fail to compile the rest of the
// kernels after a tuning run.
if (cfg_tune_only) {
exit(EXIT_SUCCESS);
// Originally this was an exit(), but that made the tuner tune only
// the first GPU. Return instead; exit() will be called after all
// GPUs are created.
return;
}

// Build program for these specific devices
6 changes: 6 additions & 0 deletions src/OpenCLScheduler.cpp
@@ -113,6 +113,12 @@ void OpenCLScheduler<net_t>::initialize(const int channels) {
}
gnum++;
}

// Exit immediately after tuning. We should exit here because we skipped
// initializing the rest of the kernels due to some NVIDIA drivers crashing.
if (cfg_tune_only) {
exit(EXIT_SUCCESS);
}
}

template<typename net_t>
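The OpenCL.cpp and OpenCLScheduler.cpp hunks above work as a pair: per-device initialization now returns early in tune-only mode so every GPU gets tuned, and the scheduler exits once after all devices have been processed. A minimal sketch of that control flow, using hypothetical names (initialize_device, initialize_scheduler) rather than the actual leela-zero classes:

```cpp
#include <cstdlib>
#include <vector>

struct Device { /* ... */ };

void initialize_device(Device& /*dev*/, bool tune_only) {
    // ... run the tuner for this device ...
    if (tune_only) {
        // Returning (instead of exiting) lets the caller tune the next GPU.
        return;
    }
    // ... build the remaining kernels (skipped in tune-only mode) ...
}

void initialize_scheduler(std::vector<Device>& devices, bool tune_only) {
    for (auto& dev : devices) {
        initialize_device(dev, tune_only);
    }
    if (tune_only) {
        // All GPUs have been tuned; exit before touching the partially
        // initialized kernels.
        std::exit(EXIT_SUCCESS);
    }
}
```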
22 changes: 21 additions & 1 deletion src/Tuner.cpp
@@ -40,6 +40,9 @@

const auto TUNER_FILE_LOCAL = std::string("leelaz_opencl_tuning");

template <typename net_t>
std::vector<std::string> Tuner<net_t>::tuned_devices;

#ifndef USE_BLAS
// Eigen helpers
template <typename T>
@@ -579,7 +582,24 @@ std::string Tuner<net_t>::load_sgemm_tuners(const int m, const int n, const int
const int batch_size) {
auto tuner_file = leelaz_file(TUNER_FILE_LOCAL);
auto file = std::ifstream{tuner_file};
if (!cfg_sgemm_exhaustive && file.good()) {

auto try_prior_tuning = file.good();

// If full tuning was requested, don't reuse previously tuned results
// unless the tuning was created earlier in this run by a different
// GPU instance with the same name. This prevents the tuner from
// running multiple times when the system has several identical GPUs.
if (try_prior_tuning && cfg_sgemm_exhaustive) {
auto dev = m_opencl.get_device_name();
try_prior_tuning = std::any_of(
begin(tuned_devices),
end(tuned_devices),
[&dev](const std::string & x) { return dev == x; }
);
}
tuned_devices.emplace_back(m_opencl.get_device_name());

if (try_prior_tuning) {
auto line = std::string{};
while (std::getline(file, line)) {
auto tuners = sgemm_tuners_from_line(line, m, n, k, batch_size);
4 changes: 4 additions & 0 deletions src/Tuner.h
@@ -40,6 +40,10 @@ class Tuner {
std::string load_sgemm_tuners(const int m, const int n, const int k,
const int batch_size);

// List of devices that were tuned in this run.
// This is to prevent the same device from being tuned multiple times.
static std::vector<std::string> tuned_devices;

static constexpr auto TUNER_VERSION = 0;
Tuner(OpenCL<net_t> & opencl, cl::Context context, cl::Device device) :
m_opencl(opencl), m_context(context), m_device(device) {}
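Summarizing the Tuner change as one self-contained unit: during exhaustive tuning, stored results are reused only when a GPU with the same name was already tuned earlier in the same run. A rough standalone rendering under that assumption (should_reuse_tuning is a hypothetical helper, not a real leela-zero function):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Device names tuned so far in this run.
static std::vector<std::string> tuned_devices;

bool should_reuse_tuning(const std::string& device_name,
                         bool exhaustive, bool tuning_file_ok) {
    auto reuse = tuning_file_ok;
    if (reuse && exhaustive) {
        // Only skip re-tuning if an identical device was tuned this run.
        reuse = std::any_of(tuned_devices.begin(), tuned_devices.end(),
                            [&](const std::string& x) { return x == device_name; });
    }
    tuned_devices.emplace_back(device_name);
    return reuse;
}
```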