Skip to content

Commit

Permalink
Improved PAPI monitoring.
Browse files Browse the repository at this point in the history
  • Loading branch information
aowenson committed May 30, 2019
1 parent bd9bbd8 commit 951a272
Show file tree
Hide file tree
Showing 15 changed files with 101 additions and 28 deletions.
21 changes: 13 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ NVCCFLAGS += $(CODE_GEN_CUDA) -m64 -Xptxas -dlcm=ca -Xptxas=-v -use_fast_math -O


MGCFD_INCS := -Isrc -Isrc/Kernels
## Optional PAPI support: when the PAPI make variable is defined, add -DPAPI
## to the compile flags carried in MGCFD_INCS and set MGCFD_LIBS so the link
## commands pull in libpapi and libpfm.
## NOTE(review): MGCFD_LIBS is assigned with ':=' and only inside this
## conditional in the lines shown here -- presumably it is empty otherwise;
## confirm it is not set elsewhere in the Makefile.
ifdef PAPI
MGCFD_INCS += -DPAPI
MGCFD_LIBS := -lpapi -lpfm
endif


## Enable VERIFY_OP2_TIMING to perform timing measurements external to
## those performed by OP2 internally. Intended to verify whether OP2 timers
Expand Down Expand Up @@ -237,7 +242,7 @@ $(OBJ_DIR)/mgcfd_seq_kernels.o:
-c -o $@ $(SRC_DIR)/../seq/_seqkernels.cpp
$(BIN_DIR)/mgcfd_seq: $(OP2_SEQ_OBJECTS)
mkdir -p $(BIN_DIR)
$(MPICPP) $(CPPFLAGS) $(OPTIMISE) $^ \
$(MPICPP) $(CPPFLAGS) $(OPTIMISE) $(MGCFD_LIBS) $^ \
-lm $(OP2_LIB) -lop2_seq -lop2_hdf5 $(HDF5_LIB) $(PARMETIS_LIB) $(PTSCOTCH_LIB) \
-o $@

Expand All @@ -255,7 +260,7 @@ $(OBJ_DIR)/mgcfd_openmp_kernels.o:
-c -o $@ $(SRC_DIR)/../openmp/_kernels.cpp
$(BIN_DIR)/mgcfd_openmp: $(OP2_OMP_OBJECTS)
mkdir -p $(BIN_DIR)
$(MPICPP) $(CPPFLAGS) $(OMPFLAGS) $^ $(OPTIMISE) \
$(MPICPP) $(CPPFLAGS) $(OMPFLAGS) $^ $(OPTIMISE) $(MGCFD_LIBS) \
-lm $(OP2_LIB) -lop2_openmp -lop2_hdf5 $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) \
-o $@

Expand All @@ -273,7 +278,7 @@ $(OBJ_DIR)/mgcfd_mpi_main.o:
-c -o $@ $(OP2_MAIN_SRC)
$(BIN_DIR)/mgcfd_mpi: $(OP2_MPI_OBJECTS)
mkdir -p $(BIN_DIR)
$(MPICPP) $(CPPFLAGS) $^ $(OPTIMISE) \
$(MPICPP) $(CPPFLAGS) $^ $(OPTIMISE) $(MGCFD_LIBS) \
-lm $(OP2_LIB) -lop2_mpi $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) \
-o $@

Expand All @@ -291,7 +296,7 @@ $(OBJ_DIR)/mgcfd_mpi_openmp_main.o:
-c -o $@ $(OP2_MAIN_SRC)
$(BIN_DIR)/mgcfd_mpi_openmp: $(OP2_MPI_OMP_OBJECTS)
mkdir -p $(BIN_DIR)
$(MPICPP) $(CPPFLAGS) $(OMPFLAGS) $^ $(OPTIMISE) \
$(MPICPP) $(CPPFLAGS) $(OMPFLAGS) $^ $(OPTIMISE) $(MGCFD_LIBS) \
-lm $(OP2_LIB) -lop2_mpi $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) \
-o $@

Expand All @@ -307,7 +312,7 @@ $(OBJ_DIR)/mgcfd_cuda_main.o:
-c -o $@ $(OP2_MAIN_SRC)
$(BIN_DIR)/mgcfd_cuda: $(OP2_CUDA_OBJECTS)
mkdir -p $(BIN_DIR)
$(MPICPP) $(CFLAGS) $^ $(OPTIMISE) \
$(MPICPP) $(CFLAGS) $^ $(OPTIMISE) $(MGCFD_LIBS) \
$(CUDA_LIB) -lcudart $(OP2_LIB) -lop2_cuda $(HDF5_LIB) -lop2_hdf5 \
-o $@

Expand All @@ -327,7 +332,7 @@ $(OBJ_DIR)/mgcfd_omp4_main.o:
-Iopenmp4/ -c -o $@ $(OP2_MAIN_SRC)
$(BIN_DIR)/mgcfd_openmp4: $(OP2_OMP4_OBJECTS)
mkdir -p $(BIN_DIR)
$(MPICPP) $(CPPFLAGS) $(OMPOFFLOAD) $^ $(OPTIMISE) \
$(MPICPP) $(CPPFLAGS) $(OMPOFFLOAD) $^ $(OPTIMISE) $(MGCFD_LIBS) \
$(OP2_LIB) -lop2_openmp4 $(CUDA_LIB) -lcudart \
$(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -lop2_hdf5 \
-o $@
Expand All @@ -347,7 +352,7 @@ $(OBJ_DIR)/mgcfd_openacc_main.o:
-c -o $@ $(OP2_MAIN_SRC)
$(BIN_DIR)/mgcfd_openacc: $(OP2_OPENACC_OBJECTS)
mkdir -p $(BIN_DIR)
$(MPICPP) $(CPPFLAGS) $(ACCFLAGS) $(OMPFLAGS) $(OPTIMISE) $^ \
$(MPICPP) $(CPPFLAGS) $(ACCFLAGS) $(OMPFLAGS) $(OPTIMISE) $(MGCFD_LIBS) $^ \
$(CUDA_LIB) -lcudart $(OP2_LIB) -lop2_cuda $(HDF5_LIB) -lop2_hdf5 \
-o $@

Expand All @@ -364,7 +369,7 @@ $(OBJ_DIR)/mgcfd_mpi_cuda_main.o:
-c -o $@ $(OP2_MAIN_SRC)
$(BIN_DIR)/mgcfd_mpi_cuda: $(OP2_MPI_CUDA_OBJECTS)
mkdir -p $(BIN_DIR)
$(MPICPP) $(CFLAGS) $(OPTIMISE) $^ \
$(MPICPP) $(CFLAGS) $(OPTIMISE) $(MGCFD_LIBS) $^ \
$(CUDA_LIB) -lcudart $(OP2_LIB) -lop2_mpi_cuda $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) \
-o $@

Expand Down
4 changes: 4 additions & 0 deletions cuda/compute_flux_edge_kernel_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#include "global.h"
#include "config.h"

#ifdef PAPI
#include <papi.h>
#endif

//user function
__device__ void compute_flux_edge_kernel_gpu(
const double *variables_a,
Expand Down
4 changes: 4 additions & 0 deletions openacc/compute_flux_edge_kernel_acckernel.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
#include "global.h"
#include "config.h"

#ifdef PAPI
#include <papi.h>
#endif

//user function
//#pragma acc routine
inline void compute_flux_edge_kernel_openacc(
Expand Down
4 changes: 4 additions & 0 deletions openmp/compute_flux_edge_kernel_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
//user function
#include ".././src/Kernels/flux.h"

#ifdef PAPI
#include <papi.h>
#endif

// host stub function
void op_par_loop_compute_flux_edge_kernel(char const *name, op_set set,
op_arg arg0,
Expand Down
4 changes: 4 additions & 0 deletions openmp4/compute_flux_edge_kernel_omp4kernel_func.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#include "global.h"
#include "config.h"

#ifdef PAPI
#include <papi.h>
#endif

void compute_flux_edge_kernel_omp4_kernel(
int *map0,
int map0size,
Expand Down
3 changes: 3 additions & 0 deletions run-inputs/annotated.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
"cpp wrapper": "",
"mpicpp wrapper": "",

"papi comment": "Toggle whether to monitor PAPI events. Currently only implemented in 'seq' and 'mpi' codes.",
"papi": true,

"general parallelism comment": "If possible, enabled modes below will be combined; if not, gen_job.py will complain",
"mpi comment": "Toggle whether to use MPI",
"mpi": true,
Expand Down
2 changes: 1 addition & 1 deletion run-scripts/aggregate-output-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def clean_pd_read_csv(filepath):
return df

# Return the subset of known data column names that are present in df.columns.
# The result is built from a set intersection, so this code does not define
# the order of the returned list.
def get_data_colnames(df):
# NOTE(review): the next two assignments are the pre-change and post-change
# versions of the same line in this diff view; the second one adds "count".
mg_cfd_data_colnames = ["iters", "computeTime", "syncTime"]
mg_cfd_data_colnames = ["iters", "computeTime", "syncTime", "count"]
# Column names presumably emitted by OP2's own timing output -- TODO confirm.
op2_data_colnames = ["count", "total time", "plan time", "mpi time", "GB used", "GB total"]
# Keep only the candidate names that actually occur in the DataFrame.
data_colnames = list(Set(mg_cfd_data_colnames+op2_data_colnames).intersection(Set(df.columns.values)))
return data_colnames
Expand Down
11 changes: 11 additions & 0 deletions run-scripts/gen_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
defaults = {}
# Compilation:
defaults["compiler"] = "intel"
defaults["papi"] = False
defaults["cpp wrapper"] = ""
defaults["mpicpp wrapper"] = ""
defaults["openmp"] = False
Expand Down Expand Up @@ -102,6 +103,7 @@ def delete_folder_contents(dirpath):
js = get_key_value(profile, "setup", "job scheduler")

compiler = get_key_value(profile, "compile", "compiler")
use_papi = get_key_value(profile, "compile", "papi")
cpp_wrapper = get_key_value(profile, "compile", "cpp wrapper")
mpicpp_wrapper = get_key_value(profile, "compile", "mpicpp wrapper")
use_mpi = get_key_value(profile, "compile", "mpi")
Expand Down Expand Up @@ -130,6 +132,14 @@ def delete_folder_contents(dirpath):
if use_mpi:
raise Exception("Cannot combine OpenACC and MPI")

# PAPI monitoring is only kept when neither OpenMP nor an accelerator backend
# is enabled; otherwise print a warning and switch it off instead of failing.
if use_papi:
# OpenMP builds: no PAPI instrumentation exists yet, per the warning below.
if use_openmp:
print("WARNING: PAPI monitoring not yet implemented in OpenMP codes. Disabling PAPI.")
use_papi = False
# OpenMP 4 offload, OpenACC and CUDA builds are all treated as accelerator codes.
# This check is independent of the one above, so both warnings can be printed.
if use_openmp4 or use_openacc or use_cuda:
print("WARNING: PAPI monitoring with accelerator codes is nonsense. Disabling PAPI.")
use_papi = False

if use_mpi:
if use_cuda:
bin_filename = "mgcfd_mpi_cuda"
Expand Down Expand Up @@ -251,6 +261,7 @@ def delete_folder_contents(dirpath):

## - Compilation:
py_sed(batch_filepath, "<COMPILER>", compiler)
py_sed(batch_filepath, "<PAPI>", str(use_papi).lower())
py_sed(batch_filepath, "<CPP_WRAPPER>", cpp_wrapper)
py_sed(batch_filepath, "<MPICPP_WRAPPER>", mpicpp_wrapper)
py_sed(batch_filepath, "<MPI>", str(use_mpi).lower())
Expand Down
9 changes: 8 additions & 1 deletion run-templates/run-mgcfd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ set -u

# Compilation variables:
compiler=<COMPILER>
papi=<PAPI>
cpp_wrapper="<CPP_WRAPPER>"
mpicpp_wrapper="<MPICPP_WRAPPER>"
mpi=<MPI>
Expand Down Expand Up @@ -71,6 +72,9 @@ bin_filepath="${app_dirpath}/bin/${bin_filename}"
if [ "$mpicpp_wrapper" != "" ]; then
make_cmd+="MPICPP_WRAPPER=$mpicpp_wrapper "
fi
# $papi holds the literal word "true" or "false" (gen_job.py substitutes the
# lower-cased Python bool into the template), so it is executed directly as a
# command here. When true, prefix the make invocation with PAPI=1 so the
# Makefile's 'ifdef PAPI' branch is taken.
if $papi ; then
make_cmd+="PAPI=1 "
fi
make_cmd+="make -j4 $bin_filename"
eval "$make_cmd"
chmod a+x "$bin_filepath"
Expand Down Expand Up @@ -99,7 +103,10 @@ else
exec_command=""
fi
fi
exec_command+=" $bin_filepath OP_MAPS_BASE_INDEX=1 -i input.dat -p ${run_outdir}/papi.conf -o ${run_outdir}/ -g $mg_cycles -m $partitioner"
exec_command+=" $bin_filepath OP_MAPS_BASE_INDEX=1 -i input.dat -o ${run_outdir}/ -g $mg_cycles -m $partitioner"
# Only pass the PAPI configuration file (-p) to the binary when PAPI
# monitoring is enabled; $papi is the literal word "true" or "false".
if $papi ; then
exec_command+=" -p ${run_outdir}/papi.conf"
fi
if $validate_solution ; then
exec_command+=" -v"
fi
Expand Down
27 changes: 27 additions & 0 deletions seq/compute_flux_edge_kernel_seqkernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
//user function
#include ".././src/Kernels/flux.h"

#ifdef PAPI
#include "papi_funcs.h"
#endif

// host stub function
void op_par_loop_compute_flux_edge_kernel(char const *name, op_set set,
op_arg arg0,
Expand Down Expand Up @@ -51,14 +55,31 @@ void op_par_loop_compute_flux_edge_kernel(char const *name, op_set set,

if (set->size >0) {

#ifdef PAPI
// Init and start PAPI
long_long* temp_count_stores = (long_long*)malloc(sizeof(long_long)*num_events);
for (int e=0; e<num_events; e++) temp_count_stores[e] = 0;
my_papi_start(event_set);
#endif

op_timers_core(&inner_cpu_t1, &inner_wall_t1);
for ( int n=0; n<set_size; n++ ){
if (n==set->core_size) {
#ifdef PAPI
my_papi_stop(event_counts, temp_count_stores, event_set, num_events);
#endif

op_timers_core(&inner_cpu_t2, &inner_wall_t2);
compute_time += inner_wall_t2 - inner_wall_t1;
op_mpi_wait_all(nargs, args);
op_timers_core(&inner_cpu_t1, &inner_wall_t1);
sync_time += inner_wall_t1 - inner_wall_t2;

#ifdef PAPI
// Restart PAPI
for (int e=0; e<num_events; e++) temp_count_stores[e] = 0;
my_papi_start(event_set);
#endif
}
int map0idx = arg0.map_data[n * arg0.map->dim + 0];
int map1idx = arg0.map_data[n * arg0.map->dim + 1];
Expand All @@ -74,6 +95,12 @@ void op_par_loop_compute_flux_edge_kernel(char const *name, op_set set,
op_timers_core(&inner_cpu_t2, &inner_wall_t2);
compute_time += inner_wall_t2 - inner_wall_t1;
iter_counts += set_size;

#ifdef PAPI
my_papi_stop(event_counts, temp_count_stores, event_set, num_events);
for (int e=0; e<num_events; e++) temp_count_stores[e] = 0;
free(temp_count_stores);
#endif
}

op_timers_core(&inner_cpu_t1, &inner_wall_t1);
Expand Down
1 change: 0 additions & 1 deletion src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,6 @@ inline bool parse_arguments(int argc, char** argv) {
case 'h':
print_help();
return false;
break;
case 'i':
set_config_param("input_file", strdup(optarg));
break;
Expand Down
7 changes: 4 additions & 3 deletions src/euler3d_cpu_double.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@

#include "hdf5.h"

#ifdef PAPI
#include "papi_funcs.h"
#endif

// OP2:
#include "op_seq.h"
#include "op_hdf5.h"
Expand All @@ -32,9 +36,6 @@
#include "utils.h"
#include "io.h"
#include "timer.h"
#ifdef PAPI
#include "papi_funcs.h"
#endif

// Global scalars:
double smoothing_coefficient = double(0.2f);
Expand Down
23 changes: 13 additions & 10 deletions src/papi_funcs.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,9 @@ inline void dump_papi_counters_to_file(
header << "Rank";
header << ",Partitioner";
header << ",PAPI counter";
for (int l=0; l<num_levels; l++) {
header << "," << "flux" << l ;
}
header << ",kernel";
header << ",level";
header << ",count";
}

std::ofstream outfile;
Expand All @@ -238,17 +238,20 @@ inline void dump_papi_counters_to_file(
exit(EXIT_FAILURE);
}

std::ostringstream event_data_line;
event_data_line << rank;
event_data_line << "," << conf.partitioner_string;
event_data_line << "," << eventName;

for (int l=0; l<num_levels; l++) {
std::ostringstream event_data_line;
event_data_line << rank;
event_data_line << "," << conf.partitioner_string;
event_data_line << "," << eventName;

event_data_line << "," << "compute_flux_edge_kernel";
event_data_line << "," << l;

const int idx = l*num_events + eid;
event_data_line << ',' << flux_kernel_event_counts[idx];
}

outfile << event_data_line.str() << std::endl;
outfile << event_data_line.str() << std::endl;
}
}
outfile.close();
}
Expand Down
2 changes: 1 addition & 1 deletion src/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ inline std::string& trim(std::string &s) {

// Returns a newly allocated copy of the NUL-terminated string `src`.
// The caller receives the pointer that strncpy returns, i.e. the new buffer.
static char *copy_str(char const *src) {
// +1 so that the terminating NUL is part of the copied length.
const size_t len = strlen(src) + 1;
// NOTE(review): the next two declarations are the pre-change (op_calloc) and
// post-change (malloc) versions of the same line in this diff view; only one
// of them exists in the real file.
char *dest = (char *)op_calloc(len, sizeof(char));
char *dest = (char*)malloc(len * sizeof(char));
// NOTE(review): the allocation result is not checked before strncpy writes
// through it -- confirm whether a failed allocation needs handling here.
return strncpy(dest, src, len);
}

Expand Down
7 changes: 4 additions & 3 deletions src_op/euler3d_cpu_double_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@

#include "hdf5.h"

#ifdef PAPI
#include "papi_funcs.h"
#endif

// OP2:
#include "op_lib_cpp.h"

Expand Down Expand Up @@ -188,9 +192,6 @@ void op_par_loop_count_non_zeros(char const *, op_set,
#include "utils.h"
#include "io.h"
#include "timer.h"
#ifdef PAPI
#include "papi_funcs.h"
#endif

// Global scalars:
double smoothing_coefficient = double(0.2f);
Expand Down

0 comments on commit 951a272

Please sign in to comment.