diff --git a/pyop2/host.py b/pyop2/host.py index 2869f40c9..373fdac5b 100644 --- a/pyop2/host.py +++ b/pyop2/host.py @@ -85,11 +85,13 @@ def _iaca_ast_to_c(self, ast, opts={}): iaca_ast, last, nest, loop_count = coffee.utils.insert_iaca(ast, iakify) if not last: iakify += 1 - ast_handler = ASTKernel(iaca_ast, self._include_dirs) - ast_handler.plan_cpu(opts) - self._applied_blas = ast_handler.blas - self._applied_ap = ast_handler.ap - iaca_kernels.append([ast_handler.gencode(), nest, loop_count]) + # ast_handler = ASTKernel(iaca_ast, self._include_dirs) + # ast_handler.plan_cpu(opts) + # self._applied_blas = ast_handler.blas + # self._applied_ap = ast_handler.ap + # iaca_kernels.append([ast_handler.gencode(), nest, loop_count]) + # from IPython import embed; embed() + iaca_kernels.append([iaca_ast.gencode(), nest, loop_count]) return iaca_kernels @@ -777,18 +779,25 @@ def sum_nested_flop_values(self, iaca_kernels, val_pos): for ind, ic in enumerate(reversed(iaca_kernels)): if ic[1] == prev_nest: i_flops += (ic[2] if ic[3] else 1) * ic[val_pos] + # print "same nest: ", i_flops elif ic[1] < prev_nest: iaca_block_flops = 0 for ic_prev in iaca_kernels[len(iaca_kernels) - ind:]: if prev_nest == ic_prev[1]: iaca_block_flops += ic_prev[val_pos] - else: - break - i_flops = (ic[2] if ic[3] else 1) * (i_flops + ic[val_pos] - iaca_block_flops) + # else: + # break + curr_flops = (ic[2] if ic[3] else 1) * (i_flops + ic[val_pos] - iaca_block_flops) + if ic[6] == 1: + i_flops = min(curr_flops, (ic[2] if ic[3] else 1) * ic[val_pos]) + else: + i_flops = curr_flops + # print "prev nest greater: ", i_flops else: # Not tested yet. Might never apply to FFC kernels. total_flops += i_flops i_flops = (ic[2] if ic[3] else 1) * ic[val_pos] + # print "prev nest smaller: ", i_flops prev_nest = ic[1] total_flops += i_flops return total_flops @@ -807,12 +816,14 @@ def sum_nested_cycle_values(self, iaca_kernels, val_pos): if ic[1] == prev_nest: # Add the cycles or flops and multiply by loop counter if the loop is not unrolled i_flops += (ic[2] if ic[3] else 1) * ic[val_pos] + # print "same nest: ", i_flops # if the current loop contains the previous one (has a lower nest number). elif ic[1] < prev_nest: # if there is only one jump instruction and the loop is not unrolled (i.e. inner loops unrolled but current not unrolled) if ic[6] == 1 and ic[3]: # loop counter times the number of cycles i_flops = ic[2] * ic[val_pos] + # print "greater nest 1: ", i_flops else: # sum up the values for the static cycle counts for the loops contaied in the current loop # regardless of their unrolled/not unrolled status @@ -820,19 +831,22 @@ def sum_nested_cycle_values(self, iaca_kernels, val_pos): for ic_prev in iaca_kernels[len(iaca_kernels) - ind:]: if prev_nest == ic_prev[1]: iaca_block_flops += ic_prev[val_pos] - else: - break + # else: + # break # if there are less cycles to be done in the current loop then return that count if iaca_block_flops > ic[val_pos]: i_flops = (ic[2] if ic[3] else 1) * ic[val_pos] + # print "greater nest 2: ", i_flops else: # iflops contains the flop count for the inner loops, add to that the flops generated by the rest of the code # in the current loop i_flops = (ic[2] if ic[3] else 1) * (i_flops + ic[val_pos] - iaca_block_flops) + # print "greater nest 3: ", i_flops else: # Not tested yet. Might never apply to FFC kernels. total_flops += i_flops i_flops = (ic[2] if ic[3] else 1) * ic[val_pos] + # print "smaller nest: ", i_flops prev_nest = ic[1] total_flops += i_flops return total_flops @@ -985,6 +999,7 @@ def compile(self, argtypes=None, restype=None): wrapper_code['iaca_end'] = "" for ind in range(1, len(iaca_kernels)): iaca_kernels[ind][0] = self.get_c_code(iaca_kernels[ind][0], wrapper_code) + for ind in range(1, len(iaca_cycle_kernels)): iaca_cycle_kernels[ind][0] = self.get_c_code(iaca_cycle_kernels[ind][0], wrapper_code) iaca_path = path_to_iaca_file + region_name + "_" + self._kernel._md5 + ".txt" self.build_loop_nest_reports(iaca_kernels, wrapper_code, iaca_path, compilation, extension, cppargs, ldargs, argtypes, restype, compiler) diff --git a/pyop2/record.py b/pyop2/record.py index 73b0ac8ab..0b585a5ba 100644 --- a/pyop2/record.py +++ b/pyop2/record.py @@ -119,7 +119,7 @@ def __init__(self, self.folds["iaca_flops"] = fold_stats # IACA reported cycles for the loop over the columns (extruded mesh) self._cycles = [] - self.folds["cycles"] = fold_stats + self.folds["cycles"] = SUM if self._is_proc else AVG def _ncalls(self): """Number of calls per process""" @@ -218,7 +218,7 @@ def plot_list(self, frequency): self.v_volume, self.m_volume, self.mv_volume, self.vbw, self.mbw, self.mvbw, self.rvbw, self.iaca_flops, self.papi_flops, - self.iaca_mflops, self.papi_mflops, self.cycles / frequency, self.c_runtime] + self.iaca_mflops, self.papi_mflops, self.cycles * 1.0 / frequency, self.c_runtime] @property def name(self): @@ -278,6 +278,8 @@ def iaca_flops(self): @property def cycles(self): + if not self._is_proc: + return self._reduce("cycles", self._cycles) return self._reduce("cycles", self._cycles) / 1e9 ################################################# @@ -358,10 +360,14 @@ def cycles(self, value): @property def runtime(self): + if not self._is_proc: + return self.end_time - self.start_time return self._ncalls() * (self.end_time - self.start_time) @property def rv_runtime(self): + if not self._is_proc: + return self.rv_end_time - self.rv_start_time return self._ncalls() * (self.rv_end_time - self.rv_start_time) @property