Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cuDNN #6

Open
wants to merge 6 commits into
base: br2gpu
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
498 changes: 498 additions & 0 deletions src/CuDNN.cpp

Large diffs are not rendered by default.

149 changes: 149 additions & 0 deletions src/CuDNN.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
/*
This file is part of Leela Zero.
Copyright (C) 2017 Henrik Forsten

Leela Zero is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Leela Zero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Leela Zero. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef CUDNN_H_INCLUDED
#define CUDNN_H_INCLUDED

#include "config.h"
#include <cstddef>
#include <memory>
#include <string>
#include <vector>
#include <mutex>
#include <cudnn.h>

// Bundle of cuDNN descriptor handles describing one convolution:
// tensor shapes, filter, bias, activation, and the forward algorithm
// selected for it. Created/destroyed elsewhere (presumably in
// CuDNN::convolve_init — confirm in CuDNN.cpp); this struct only
// carries the opaque handles.
struct conv_descriptor {
    cudnnTensorDescriptor_t input_descriptor;        // NCHW input tensor shape
    cudnnTensorDescriptor_t output_descriptor;       // NCHW output tensor shape
    cudnnTensorDescriptor_t bias_descriptor;         // per-channel bias tensor
    cudnnFilterDescriptor_t kernel_descriptor;       // convolution filter layout
    cudnnActivationDescriptor_t activation_descriptor; // activation fused after conv
    cudnnConvolutionDescriptor_t convolution_descriptor; // pad/stride/dilation settings
    cudnnConvolutionFwdAlgo_t convolution_algorithm; // chosen forward algorithm
};

class CuDNN;

// One layer of the network as staged for cuDNN execution. Pure data
// holder: filled in by OpenCL_Network's push_* methods (a friend),
// consumed during forward().
class Layer {
    friend class OpenCL_Network;
private:
    unsigned int channels{0};        // input channel count
    unsigned int outputs{0};         // output channel count
    unsigned int filter_size{0};     // spatial filter size (e.g. 3 for 3x3)
    // Exactly one of these layer-kind flags is expected to be set by
    // the corresponding push_* call — TODO confirm against CuDNN.cpp.
    bool is_input_convolution{false};
    bool is_residual_block{false};
    bool is_convolve1{false};
    conv_descriptor conv_desc;       // cuDNN descriptors for this layer
    std::vector<net_t*> weights;     // device pointers to weight/bias blobs
    // Was previously left uninitialized (unlike every sibling member);
    // zero-init so max()-style accumulation over layers is well-defined.
    size_t workspace_size{0};        // cuDNN workspace bytes needed
};

// Per-thread scratch state (see the thread_local instance declared at
// the bottom of this header). Buffers are device allocations managed
// by the friend classes; this struct only tracks the pointers/flags.
class ThreadData {
    friend class CuDNN;
    friend class OpenCL_Network;
private:
    // Raw pointers were previously uninitialized — any "already
    // allocated?" inspection before the first allocation would read
    // indeterminate values. Zero-init them to nullptr.
    float *m_workspace{nullptr};       // cuDNN convolution workspace
    float *m_InBuffer{nullptr};        // network input buffer
    float *m_OutBuffer{nullptr};       // network output buffer
    float *m_ResidualBuffer{nullptr};  // residual tower intermediate buffer
    bool m_is_initialized{false};      // per-thread init completed
    bool m_buffers_allocated{false};   // the four buffers above are live
};

// Network graph builder/runner on top of a CuDNN context. Despite the
// legacy name (kept for compatibility with the OpenCL code path this
// replaces), this class drives cuDNN. Layers are appended via the
// push_* methods and executed in order by forward().
class OpenCL_Network {
public:
    // explicit: a single-argument constructor would otherwise allow an
    // accidental implicit CuDNN& -> OpenCL_Network conversion.
    explicit OpenCL_Network(CuDNN & opencl) : m_cudnn(opencl) {}

    // Access the underlying cuDNN context.
    CuDNN & getCuDNN() {
        return m_cudnn;
    }

    // Append the input convolution layer (weights + biases).
    void push_input_convolution(unsigned int filter_size,
                                unsigned int channels,
                                unsigned int outputs,
                                const std::vector<float>& weights,
                                const std::vector<float>& biases);

    // Append one residual block (two convolutions with biases).
    void push_residual(unsigned int filter_size,
                       unsigned int channels,
                       unsigned int outputs,
                       const std::vector<float>& weights_1,
                       const std::vector<float>& biases_1,
                       const std::vector<float>& weights_2,
                       const std::vector<float>& biases_2);

    // Append a 1x1 convolution head (no bias parameter).
    void push_convolve1(unsigned int channels,
                        unsigned int outputs,
                        const std::vector<float>& weights);

    // Number of layers pushed so far.
    size_t get_layer_count() const {
        return m_layers.size();
    }

    // Run the network on `input`, producing the policy and value head
    // outputs in the caller-supplied vectors.
    void forward(const std::vector<net_t>& input,
                 std::vector<net_t>& output_pol,
                 std::vector<net_t>& output_val);

private:
    // Copy `weights` into layer `layer` (defined in CuDNN.cpp).
    void push_weights(size_t layer, const std::vector<float>& weights) {
        add_weights(layer, weights.size(), weights.data());
    }
    void add_weights(size_t layer, size_t size, const float* weights);

    CuDNN & m_cudnn;             // non-owning reference to the context
    std::vector<Layer> m_layers; // layers in execution order
};

// Owns the cuDNN handle and implements the low-level convolution
// primitives used by OpenCL_Network. Method bodies live in CuDNN.cpp.
class CuDNN {
    friend class OpenCL_Network;
public:
    // Set up cuDNN for the given channel count and GPU list.
    // `silent` suppresses diagnostic output.
    void initialize(const int channels, const std::vector<int> & gpus,
                    bool silent = false);
    // Ensure the calling thread's state (see opencl_thread_data) is
    // initialized — presumably per-thread buffers/handle; confirm in
    // CuDNN.cpp.
    void ensure_thread_initialized(void);
    std::string get_device_name();

    // Kept for interface parity with the OpenCL backend's tuner;
    // semantics under cuDNN — TODO confirm against CuDNN.cpp.
    std::vector<size_t> get_sgemm_tuners(void);

private:

    // Plain convolution: bufferIn -> bufferOut using `weights` and the
    // prebuilt descriptors; `workspace` provides scratch space of
    // `workspace_bytes`.
    void convolve(net_t *bufferIn,
                  net_t *bufferOut,
                  net_t *weights,
                  net_t *workspace,
                  size_t workspace_bytes,
                  const conv_descriptor& conv_desc);

    // Convolution with bias and activation; `residualBuffer` (may be
    // null? — verify in CuDNN.cpp) supplies a skip-connection input.
    void convolveActivation(net_t *bufferIn,
                            net_t *bufferOut,
                            net_t *weights,
                            net_t *residualBuffer,
                            net_t *biases,
                            net_t *workspace,
                            size_t workspace_bytes,
                            const conv_descriptor& conv_desc);

    // Build the cuDNN descriptors for one convolution shape; returns
    // the workspace size in bytes required by the chosen algorithm.
    size_t convolve_init(int channels, int outputs, int kernel_size,
                         conv_descriptor& conv_desc);

    cudnnHandle_t m_handle;   // library handle, created in initialize()
    bool m_init_ok{false};    // initialize() completed successfully
};

extern thread_local ThreadData opencl_thread_data;
#endif
4 changes: 2 additions & 2 deletions src/GTP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ int cfg_noise;
int cfg_random_cnt;
std::uint64_t cfg_rng_seed;
bool cfg_dumbpass;
#ifdef USE_OPENCL
#ifdef USE_GPU
std::vector<int> cfg_gpus;
bool cfg_sgemm_exhaustive;
bool cfg_tune_only;
Expand All @@ -79,7 +79,7 @@ void GTP::setup_default_parameters() {
cfg_max_visits = std::numeric_limits<decltype(cfg_max_visits)>::max();
cfg_timemanage = TimeManagement::AUTO;
cfg_lagbuffer_cs = 100;
#ifdef USE_OPENCL
#ifdef USE_GPU
cfg_gpus = { };
cfg_sgemm_exhaustive = false;
cfg_tune_only = false;
Expand Down
2 changes: 1 addition & 1 deletion src/GTP.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ extern int cfg_noise;
extern int cfg_random_cnt;
extern std::uint64_t cfg_rng_seed;
extern bool cfg_dumbpass;
#ifdef USE_OPENCL
#ifdef USE_GPU
extern std::vector<int> cfg_gpus;
extern bool cfg_sgemm_exhaustive;
extern bool cfg_tune_only;
Expand Down
4 changes: 2 additions & 2 deletions src/Leela.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ static void parse_commandline(int argc, char *argv[]) {
("logfile,l", po::value<std::string>(), "File to log input/output to.")
("quiet,q", "Disable all diagnostic output.")
("noponder", "Disable thinking on opponent's time.")
#ifdef USE_OPENCL
#ifdef USE_GPU
("gpu", po::value<std::vector<int> >(),
"ID of the OpenCL device(s) to use (disables autodetection).")
("full-tuner", "Try harder to find an optimal OpenCL tuning.")
Expand Down Expand Up @@ -244,7 +244,7 @@ static void parse_commandline(int argc, char *argv[]) {
}
}

#ifdef USE_OPENCL
#ifdef USE_GPU
if (vm.count("gpu")) {
cfg_gpus = vm["gpu"].as<std::vector<int> >();
}
Expand Down
11 changes: 7 additions & 4 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,17 @@ clang:
LDFLAGS='$(LDFLAGS) -flto -fuse-linker-plugin' \
leelaz

DYNAMIC_LIBS = -lboost_program_options -lpthread -lz
DYNAMIC_LIBS = -lboost_program_options -lpthread -lz -lcuda -lcudart -lcudnn
LIBS =

ifeq ($(THE_OS),Linux)
# for Linux with OpenBLAS
CXXFLAGS += -I/usr/include/openblas
CUDNN = /usr/local/cuda/include
CUDART = /usr/local/cuda/lib64
CXXFLAGS += -I/usr/include/openblas -I$(CUDNN)
DYNAMIC_LIBS += -lopenblas
DYNAMIC_LIBS += -lOpenCL
#DYNAMIC_LIBS += -lOpenCL
DYNAMIC_LIBS += -L$(CUDART)
endif
ifeq ($(THE_OS),Darwin)
# for macOS (comment out the Linux part)
Expand All @@ -50,7 +53,7 @@ sources = Network.cpp FullBoard.cpp KoState.cpp Training.cpp \
SGFParser.cpp Timing.cpp Utils.cpp FastBoard.cpp \
SGFTree.cpp Zobrist.cpp FastState.cpp GTP.cpp Random.cpp \
SMP.cpp UCTNode.cpp OpenCL.cpp OpenCLScheduler.cpp \
NNCache.cpp Tuner.cpp
NNCache.cpp Tuner.cpp CuDNN.cpp

objects = $(sources:.cpp=.o)
deps = $(sources:%.cpp=%.d)
Expand Down
Loading