Skip to content

Commit

Permalink
added option to select gpu device based on free memory
Browse files Browse the repository at this point in the history
  • Loading branch information
brodyh committed Feb 12, 2015
1 parent 9384dd8 commit f7da4ba
Showing 1 changed file with 38 additions and 2 deletions.
40 changes: 38 additions & 2 deletions src/caffe/util/mpi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
#include <mpi.h>
#endif
#include <cstdlib>
#include <vector>
#include <utility>
#include <algorithm>

#include "caffe/common.hpp"
#include "caffe/util/mpi.hpp"


#ifdef USE_MPI
// MPI: various checks for different function calls.
#define MPI_CHECK(condition) \
Expand All @@ -23,6 +25,7 @@ namespace caffe {

static bool distributed = false;

static int get_free_deviceid(int local_rank);

void caffe_init_mpi(int* pargc, char*** pargv) {
const char* local_rank_env = std::getenv("MV2_COMM_WORLD_LOCAL_RANK");
Expand All @@ -31,7 +34,8 @@ void caffe_init_mpi(int* pargc, char*** pargv) {
if (local_rank_env) {
#ifdef USE_MPI
distributed = true;
int local_rank = std::atoi(local_rank_env);
int local_rank = (std::atoi(local_rank_env));
local_rank = get_free_deviceid(local_rank);
Caffe::SetDevice(local_rank);
int provided, requested = MPI_THREAD_MULTIPLE;
MPI_CHECK(MPI_Init_thread(pargc, pargv, requested, &provided));
Expand Down Expand Up @@ -60,6 +64,38 @@ void caffe_init_mpi(int* pargc, char*** pargv) {
}


struct sort_pred {
bool operator()(const std::pair<size_t, int>& left,
const std::pair<size_t, int>& right) {
return left.first > right.first;
}
};


// This is somewhat experimental because there is no guarantee all
// ranks will return the same info for free GPUs
static int get_free_deviceid(int local_rank) {
int num_devices, orig_device;
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaGetDeviceCount(&num_devices));
std::vector<std::pair<size_t, int> > device_mems(num_devices);
for (int devid = 0; devid < num_devices; ++devid) {
Caffe::SetDevice(devid);
size_t free_mem = Caffe::DeviceMemoryFree();
device_mems[devid] = std::pair<int, size_t>(free_mem, devid);
}
std::sort(device_mems.begin(), device_mems.end(), sort_pred());
Caffe::SetDevice(orig_device);
std::ostringstream info;
for (int devid = 0; devid < num_devices; ++devid) {
info << "(" << device_mems[devid].second << ", "
<< device_mems[devid].first / (1024.*1024.) << ") ";
}
LOG(INFO) << "Rank "<< local_rank << " " << info.str();
return device_mems[local_rank].second;
}


void caffe_finalize_mpi() {
#ifdef USE_MPI
if (distributed)
Expand Down

0 comments on commit f7da4ba

Please sign in to comment.