Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

One (cuda) kernel to run them all #13

Open
wants to merge 2 commits into
base: fpga
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 0 additions & 19 deletions cpp-cuda/makefile

This file was deleted.

84 changes: 8 additions & 76 deletions cpp-cuda/cpp-cuda.cu → cuda/cpp-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
#include <sys/time.h>

// Cuda includes
#include <curand.h>
#include <curand_kernel.h>
#include "kernel.cuh"

#define BINS_MAX 30
#define ALPHA 0.1

double gpu_double_reduction(const int array_size, const double *target_array) {
Expand All @@ -21,77 +19,6 @@ double gpu_double_reduction(const int array_size, const double *target_array) {
return res;
}

__global__
void events_kernel(double *all_randoms, double *all_xwgts, int n, int n_events, double xjac, double *all_res, double *all_res2) {
int block_id = blockIdx.x;
int thread_id = threadIdx.x;
int block_size = blockDim.x;

int index = block_id*block_size + thread_id;
int grid_dim = gridDim.x;
int stride = block_size * grid_dim;

// shared lepage
double a = 0.1;
double pref = pow(1.0/a/sqrt(M_PI), n);

for (int i = index; i < n_events; i += stride) {
double wgt = all_xwgts[i]*xjac;

double coef = 0.0;
for (int j = 0; j < n; j++) {
coef += pow( (all_randoms[i*n + j] - 1.0/2.0)/a, 2 );
}
double lepage = pref*exp(-coef);

double tmp = wgt*lepage;
all_res[i] = tmp;
all_res2[i] = pow(tmp,2);
}
}

__global__
void generate_random_array_kernel(const int n_events, const int n_dim, const double *divisions, double *all_randoms, double *all_wgts, int *all_div_indexes) {
double reg_i = 0.0;
double reg_f = 1.0;

int block_id = blockIdx.x;
int thread_id = threadIdx.x;
int block_size = blockDim.x;

int index = block_id*block_size + thread_id;
int grid_dim = gridDim.x;
int stride = block_size * grid_dim;

// Use curandState_t to keep track of the seed, which is thread dependent
curandState_t state;
// seed-sequence-offset
curand_init(index, 0, 0, &state);

for (int j = index; j < n_events; j+= stride) {
double wgt = 1.0;
for (int i = 0; i < n_dim; i++) {
double rn = (double) curand(&state)/RAND_MAX/2; //unsigned?
double xn = BINS_MAX*(1.0 - rn);
int int_xn = MAX(0, MIN( (int) xn, BINS_MAX));
double aux_rand = xn - int_xn;
double x_ini = 0.0;
if (int_xn > 0) {
x_ini = divisions[BINS_MAX*i + int_xn-1];
}
double xdelta = divisions[BINS_MAX*i + int_xn] - x_ini;
double rand_x = x_ini + xdelta*aux_rand;
wgt *= xdelta*BINS_MAX;
// Now we need to add an offset to the arrays
all_randoms[j*n_dim + i] = reg_i + rand_x*(reg_f - reg_i);
all_div_indexes[j*n_dim + i] = int_xn;
}
all_wgts[j] = wgt;
}
}



void rebin(const double *rw, const double rc, double *subdivisions) {
int k = -1;
double dr = 0.0;
Expand Down Expand Up @@ -237,8 +164,13 @@ int vegas(const int warmup, const int n_dim, const int n_iter, const int n_event
}

int main(int argc, char *argv[]) {
int n_dim = 7;
int n_events = (int) 1e6;

if (argc < 2) {
fprintf(stderr, "usage %s number_of_events number_of_dimensions\n", argv[0]);
exit(0);
}
int n_events = atoi(argv[1]);
int n_dim = atoi(argv[2]);
int n_iter = 5;

double res, sigma;
Expand Down
77 changes: 77 additions & 0 deletions cuda/kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#include <curand.h>
#include <curand_kernel.h>

#define BINS_MAX 30
#define ALPHA 0.1

extern "C" {
__global__
void events_kernel(double *all_randoms, double *all_xwgts, int n, int n_events, double xjac, double *all_res, double *all_res2) {
int block_id = blockIdx.x;
int thread_id = threadIdx.x;
int block_size = blockDim.x;

int index = block_id*block_size + thread_id;
int grid_dim = gridDim.x;
int stride = block_size * grid_dim;

// shared lepage
double a = 0.1;
double pref = pow(1.0/a/sqrt(M_PI), n);

for (int i = index; i < n_events; i += stride) {
double wgt = all_xwgts[i]*xjac;

double coef = 0.0;
for (int j = 0; j < n; j++) {
coef += pow( (all_randoms[i*n + j] - 1.0/2.0)/a, 2 );
}
double lepage = pref*exp(-coef);

double tmp = wgt*lepage;
all_res[i] = tmp;
all_res2[i] = pow(tmp,2);
}
}

__global__
void generate_random_array_kernel(const int n_events, const int n_dim, const double *divisions, double *all_randoms, double *all_wgts, int *all_div_indexes) {
double reg_i = 0.0;
double reg_f = 1.0;

int block_id = blockIdx.x;
int thread_id = threadIdx.x;
int block_size = blockDim.x;

int index = block_id*block_size + thread_id;
int grid_dim = gridDim.x;
int stride = block_size * grid_dim;

// Use curandState_t to keep track of the seed, which is thread dependent
curandState_t state;
// seed-sequence-offset
curand_init(index, 0, 0, &state);

for (int j = index; j < n_events; j+= stride) {
double wgt = 1.0;
for (int i = 0; i < n_dim; i++) {
double rn = curand_uniform_double(&state);
double xn = BINS_MAX*(1.0 - rn);
int int_xn = max(0, min( (int) xn, BINS_MAX));
double aux_rand = xn - int_xn;
double x_ini = 0.0;
if (int_xn > 0) {
x_ini = divisions[BINS_MAX*i + int_xn-1];
}
double xdelta = divisions[BINS_MAX*i + int_xn] - x_ini;
double rand_x = x_ini + xdelta*aux_rand;
wgt *= xdelta*BINS_MAX;
// Now we need to add an offset to the arrays
all_randoms[j*n_dim + i] = reg_i + rand_x*(reg_f - reg_i);
all_div_indexes[j*n_dim + i] = int_xn;
}
all_wgts[j] = wgt;
}
}

}
9 changes: 9 additions & 0 deletions cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#define BINS_MAX 30

extern "C" {
__global__
void events_kernel(double *all_randoms, double *all_xwgts, int n, int n_events, double xjac, double *all_res, double *all_res2);

__global__
void generate_random_array_kernel(const int n_events, const int n_dim, const double *divisions, double *all_randoms, double *all_wgts, int *all_div_indexes);
}
24 changes: 24 additions & 0 deletions cuda/makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
NC = nvcc
NF = -O3

EVENTS=1000000
DIMENSIONS=7

TARGET = cpp-cuda
SRC = cpp-cuda.cu kernel.cu

.PHONY: all clean run-cpp run-python

all: $(TARGET)

$(TARGET): $(SRC)
$(NC) $(NF) $^ -o $@

run-cpp: $(TARGET)
./$(TARGET) $(EVENTS) $(DIMENSIONS)

run-python:
python vegas_mc_pycuda.py -n $(EVENTS) -d $(DIMENSIONS)

clean:
rm -f cpp-cuda
105 changes: 21 additions & 84 deletions python/vegas_mc_pycuda.py → cuda/vegas_mc_pycuda.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/env python
import time
from integrand import MC_INTEGRAND, setup
import numpy as np
import numba as nb
import pycuda.driver as drv
Expand All @@ -10,86 +9,16 @@

BINS_MAX = 30
ALPHA = 0.1

mod = SourceModule("""
#include <curand.h>
#include <curand_kernel.h>

#define BINS_MAX 30
#define ALPHA 0.1
# Read the cuda kernel
f = open('kernel.cu', 'r')
kernel_str = f.read()
f.close()
kernel = SourceModule(kernel_str, no_extern_c=True)

extern "C" __global__
void events_kernel(double *all_randoms, double *all_xwgts, int n, int n_events, double xjac, double *all_res, double *all_res2) {
int block_id = blockIdx.x;
int thread_id = threadIdx.x;
int block_size = blockDim.x;

int index = block_id*block_size + thread_id;
int grid_dim = gridDim.x;
int stride = block_size * grid_dim;

// shared lepage
double a = 0.1;
double pref = pow(1.0/a/sqrt(M_PI), n);

for (int i = index; i < n_events; i += stride) {
double wgt = all_xwgts[i]*xjac;

double coef = 0.0;
for (int j = 0; j < n; j++) {
coef += pow( (all_randoms[i*n + j] - 1.0/2.0)/a, 2 );
}
double lepage = pref*exp(-coef);

double tmp = wgt*lepage;
all_res[i] = tmp;
all_res2[i] = pow(tmp,2);
}
}

extern "C" __global__
void generate_random_array_kernel(const int n_events, const int n_dim, const double *divisions, double *all_randoms, double *all_wgts, int *all_div_indexes) {
double reg_i = 0.0;
double reg_f = 1.0;

int block_id = blockIdx.x;
int thread_id = threadIdx.x;
int block_size = blockDim.x;

int index = block_id*block_size + thread_id;
int grid_dim = gridDim.x;
int stride = block_size * grid_dim;

// Use curandState_t to keep track of the seed, which is thread dependent
curandState_t state;
// seed-sequence-offset
curand_init(index, 0, 0, &state);

for (int j = index; j < n_events; j+= stride) {
double wgt = 1.0;
for (int i = 0; i < n_dim; i++) {
double rn = (double) curand(&state)/RAND_MAX/2; //unsigned?
double xn = BINS_MAX*(1.0 - rn);
int int_xn = max(0, min( (int) xn, BINS_MAX));
double aux_rand = xn - int_xn;
double x_ini = 0.0;
if (int_xn > 0) {
x_ini = divisions[BINS_MAX*i + int_xn-1];
}
double xdelta = divisions[BINS_MAX*i + int_xn] - x_ini;
double rand_x = x_ini + xdelta*aux_rand;
wgt *= xdelta*BINS_MAX;
// Now we need to add an offset to the arrays
all_randoms[j*n_dim + i] = reg_i + rand_x*(reg_f - reg_i);
all_div_indexes[j*n_dim + i] = int_xn;
}
all_wgts[j] = wgt;
}
}
""", no_extern_c=True)

events_kernel = mod.get_function("events_kernel")
generate_random_array_kernel = mod.get_function("generate_random_array_kernel")
events_kernel = kernel.get_function("events_kernel")
generate_random_array_kernel = kernel.get_function("generate_random_array_kernel")

@nb.njit(nb.float64())
def internal_rand():
Expand Down Expand Up @@ -275,12 +204,20 @@ def integrate(self, iters = 5, calls = 1e4):

if __name__ == '__main__':
"""Testing a basic integration"""
ncalls = setup['ncalls']
xlow = setup['xlow']
xupp = setup['xupp']
dim = setup['dim']

print(f'VEGAS MC numba, ncalls={ncalls}:')
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument("-n", "--ncalls", help = "Number of events", default = int(1e6), type = int)
parser.add_argument("-d", "--dimensions", help = "Number of dimensions", default = 7, type = int)
# Not used at the moment
parser.add_argument("--xlow", default = 0.0, type = float)
parser.add_argument("--xupp", default = 1.0, type = float)
args = parser.parse_args()
ncalls = args.ncalls
xlow = args.xlow
xupp = args.xupp
dim = args.dimensions

print(f'VEGAS MC numba, ncalls={ncalls}, dimensions={dim}:')
start = time.time()
v = make_vegas(dim=dim, xl = xlow, xu = xupp)
r = v.integrate(calls=ncalls)
Expand Down