Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cuDNN #6

Open
wants to merge 6 commits into
base: br2gpu
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
498 changes: 498 additions & 0 deletions src/CuDNN.cpp

Large diffs are not rendered by default.

149 changes: 149 additions & 0 deletions src/CuDNN.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
/*
This file is part of Leela Zero.
Copyright (C) 2017 Henrik Forsten

Leela Zero is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Leela Zero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Leela Zero. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef CUDNN_H_INCLUDED
#define CUDNN_H_INCLUDED

#include "config.h"
#include <cstddef>
#include <memory>
#include <string>
#include <vector>
#include <mutex>
#include <cudnn.h>

// Bundle of cuDNN descriptor handles describing one convolution:
// tensor shapes, filter, bias, activation, and the forward algorithm
// selected for it. Created/destroyed elsewhere (presumably in
// CuDNN::convolve_init — confirm in CuDNN.cpp); this struct only
// carries the opaque handles.
struct conv_descriptor {
    cudnnTensorDescriptor_t input_descriptor;        // NCHW input tensor shape
    cudnnTensorDescriptor_t output_descriptor;       // NCHW output tensor shape
    cudnnTensorDescriptor_t bias_descriptor;         // per-channel bias tensor
    cudnnFilterDescriptor_t kernel_descriptor;       // convolution filter layout
    cudnnActivationDescriptor_t activation_descriptor; // activation fused after conv
    cudnnConvolutionDescriptor_t convolution_descriptor; // pad/stride/dilation settings
    cudnnConvolutionFwdAlgo_t convolution_algorithm; // chosen forward algorithm
};

class CuDNN;

// One layer of the network as staged for cuDNN execution. Pure data
// holder: filled in by OpenCL_Network's push_* methods (a friend),
// consumed during forward().
class Layer {
    friend class OpenCL_Network;
private:
    unsigned int channels{0};        // input channel count
    unsigned int outputs{0};         // output channel count
    unsigned int filter_size{0};     // spatial filter size (e.g. 3 for 3x3)
    // Exactly one of these layer-kind flags is expected to be set by
    // the corresponding push_* call — TODO confirm against CuDNN.cpp.
    bool is_input_convolution{false};
    bool is_residual_block{false};
    bool is_convolve1{false};
    conv_descriptor conv_desc;       // cuDNN descriptors for this layer
    std::vector<net_t*> weights;     // device pointers to weight/bias blobs
    // Was previously left uninitialized (unlike every sibling member);
    // zero-init so max()-style accumulation over layers is well-defined.
    size_t workspace_size{0};        // cuDNN workspace bytes needed
};

// Per-thread scratch state (see the thread_local instance declared at
// the bottom of this header). Buffers are device allocations managed
// by the friend classes; this struct only tracks the pointers/flags.
class ThreadData {
    friend class CuDNN;
    friend class OpenCL_Network;
private:
    // Raw pointers were previously uninitialized — any "already
    // allocated?" inspection before the first allocation would read
    // indeterminate values. Zero-init them to nullptr.
    float *m_workspace{nullptr};       // cuDNN convolution workspace
    float *m_InBuffer{nullptr};        // network input buffer
    float *m_OutBuffer{nullptr};       // network output buffer
    float *m_ResidualBuffer{nullptr};  // residual tower intermediate buffer
    bool m_is_initialized{false};      // per-thread init completed
    bool m_buffers_allocated{false};   // the four buffers above are live
};

// Network graph builder/runner on top of a CuDNN context. Despite the
// legacy name (kept for compatibility with the OpenCL code path this
// replaces), this class drives cuDNN. Layers are appended via the
// push_* methods and executed in order by forward().
class OpenCL_Network {
public:
    // explicit: a single-argument constructor would otherwise allow an
    // accidental implicit CuDNN& -> OpenCL_Network conversion.
    explicit OpenCL_Network(CuDNN & opencl) : m_cudnn(opencl) {}

    // Access the underlying cuDNN context.
    CuDNN & getCuDNN() {
        return m_cudnn;
    }

    // Append the input convolution layer (weights + biases).
    void push_input_convolution(unsigned int filter_size,
                                unsigned int channels,
                                unsigned int outputs,
                                const std::vector<float>& weights,
                                const std::vector<float>& biases);

    // Append one residual block (two convolutions with biases).
    void push_residual(unsigned int filter_size,
                       unsigned int channels,
                       unsigned int outputs,
                       const std::vector<float>& weights_1,
                       const std::vector<float>& biases_1,
                       const std::vector<float>& weights_2,
                       const std::vector<float>& biases_2);

    // Append a 1x1 convolution head (no bias parameter).
    void push_convolve1(unsigned int channels,
                        unsigned int outputs,
                        const std::vector<float>& weights);

    // Number of layers pushed so far.
    size_t get_layer_count() const {
        return m_layers.size();
    }

    // Run the network on `input`, producing the policy and value head
    // outputs in the caller-supplied vectors.
    void forward(const std::vector<net_t>& input,
                 std::vector<net_t>& output_pol,
                 std::vector<net_t>& output_val);

private:
    // Copy `weights` into layer `layer` (defined in CuDNN.cpp).
    void push_weights(size_t layer, const std::vector<float>& weights) {
        add_weights(layer, weights.size(), weights.data());
    }
    void add_weights(size_t layer, size_t size, const float* weights);

    CuDNN & m_cudnn;             // non-owning reference to the context
    std::vector<Layer> m_layers; // layers in execution order
};

// Owns the cuDNN handle and implements the low-level convolution
// primitives used by OpenCL_Network. Method bodies live in CuDNN.cpp.
class CuDNN {
    friend class OpenCL_Network;
public:
    // Set up cuDNN for the given channel count and GPU list.
    // `silent` suppresses diagnostic output.
    void initialize(const int channels, const std::vector<int> & gpus,
                    bool silent = false);
    // Ensure the calling thread's state (see opencl_thread_data) is
    // initialized — presumably per-thread buffers/handle; confirm in
    // CuDNN.cpp.
    void ensure_thread_initialized(void);
    std::string get_device_name();

    // Kept for interface parity with the OpenCL backend's tuner;
    // semantics under cuDNN — TODO confirm against CuDNN.cpp.
    std::vector<size_t> get_sgemm_tuners(void);

private:

    // Plain convolution: bufferIn -> bufferOut using `weights` and the
    // prebuilt descriptors; `workspace` provides scratch space of
    // `workspace_bytes`.
    void convolve(net_t *bufferIn,
                  net_t *bufferOut,
                  net_t *weights,
                  net_t *workspace,
                  size_t workspace_bytes,
                  const conv_descriptor& conv_desc);

    // Convolution with bias and activation; `residualBuffer` (may be
    // null? — verify in CuDNN.cpp) supplies a skip-connection input.
    void convolveActivation(net_t *bufferIn,
                            net_t *bufferOut,
                            net_t *weights,
                            net_t *residualBuffer,
                            net_t *biases,
                            net_t *workspace,
                            size_t workspace_bytes,
                            const conv_descriptor& conv_desc);

    // Build the cuDNN descriptors for one convolution shape; returns
    // the workspace size in bytes required by the chosen algorithm.
    size_t convolve_init(int channels, int outputs, int kernel_size,
                         conv_descriptor& conv_desc);

    cudnnHandle_t m_handle;   // library handle, created in initialize()
    bool m_init_ok{false};    // initialize() completed successfully
};

extern thread_local ThreadData opencl_thread_data;
#endif
4 changes: 2 additions & 2 deletions src/GTP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ int cfg_noise;
int cfg_random_cnt;
std::uint64_t cfg_rng_seed;
bool cfg_dumbpass;
#ifdef USE_OPENCL
#ifdef USE_GPU
std::vector<int> cfg_gpus;
bool cfg_sgemm_exhaustive;
bool cfg_tune_only;
Expand All @@ -79,7 +79,7 @@ void GTP::setup_default_parameters() {
cfg_max_visits = std::numeric_limits<decltype(cfg_max_visits)>::max();
cfg_timemanage = TimeManagement::AUTO;
cfg_lagbuffer_cs = 100;
#ifdef USE_OPENCL
#ifdef USE_GPU
cfg_gpus = { };
cfg_sgemm_exhaustive = false;
cfg_tune_only = false;
Expand Down
2 changes: 1 addition & 1 deletion src/GTP.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ extern int cfg_noise;
extern int cfg_random_cnt;
extern std::uint64_t cfg_rng_seed;
extern bool cfg_dumbpass;
#ifdef USE_OPENCL
#ifdef USE_GPU
extern std::vector<int> cfg_gpus;
extern bool cfg_sgemm_exhaustive;
extern bool cfg_tune_only;
Expand Down
4 changes: 2 additions & 2 deletions src/Leela.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ static void parse_commandline(int argc, char *argv[]) {
("logfile,l", po::value<std::string>(), "File to log input/output to.")
("quiet,q", "Disable all diagnostic output.")
("noponder", "Disable thinking on opponent's time.")
#ifdef USE_OPENCL
#ifdef USE_GPU
("gpu", po::value<std::vector<int> >(),
"ID of the OpenCL device(s) to use (disables autodetection).")
("full-tuner", "Try harder to find an optimal OpenCL tuning.")
Expand Down Expand Up @@ -244,7 +244,7 @@ static void parse_commandline(int argc, char *argv[]) {
}
}

#ifdef USE_OPENCL
#ifdef USE_GPU
if (vm.count("gpu")) {
cfg_gpus = vm["gpu"].as<std::vector<int> >();
}
Expand Down
11 changes: 7 additions & 4 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,17 @@ clang:
LDFLAGS='$(LDFLAGS) -flto -fuse-linker-plugin' \
leelaz

DYNAMIC_LIBS = -lboost_program_options -lpthread -lz
DYNAMIC_LIBS = -lboost_program_options -lpthread -lz -lcuda -lcudart -lcudnn
LIBS =

ifeq ($(THE_OS),Linux)
# for Linux with OpenBLAS
CXXFLAGS += -I/usr/include/openblas
CUDNN = /usr/local/cuda/include
CUDART = /usr/local/cuda/lib64
CXXFLAGS += -I/usr/include/openblas -I$(CUDNN)
DYNAMIC_LIBS += -lopenblas
DYNAMIC_LIBS += -lOpenCL
#DYNAMIC_LIBS += -lOpenCL
DYNAMIC_LIBS += -L$(CUDART)
endif
ifeq ($(THE_OS),Darwin)
# for macOS (comment out the Linux part)
Expand All @@ -50,7 +53,7 @@ sources = Network.cpp FullBoard.cpp KoState.cpp Training.cpp \
SGFParser.cpp Timing.cpp Utils.cpp FastBoard.cpp \
SGFTree.cpp Zobrist.cpp FastState.cpp GTP.cpp Random.cpp \
SMP.cpp UCTNode.cpp OpenCL.cpp OpenCLScheduler.cpp \
NNCache.cpp Tuner.cpp
NNCache.cpp Tuner.cpp CuDNN.cpp

objects = $(sources:.cpp=.o)
deps = $(sources:%.cpp=%.d)
Expand Down
Loading