Various fixes #20

Draft · wants to merge 5 commits into base: main
@@ -7,7 +7,7 @@ def kernel(A):
 
     A[0, 0] = np.sqrt(A[0, 0])
     for i in range(1, A.shape[0]):
-        for j in nb.prange(i):
+        for j in range(i):
             A[i, j] -= np.dot(A[i, :j], A[j, :j])
             A[i, j] /= A[j, j]
         A[i, i] -= np.dot(A[i, :i], A[i, :i])
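The kernel fix above serializes the inner Cholesky loop: within one row `i`, iteration `j` reads `A[i, :j]`, which earlier `j`-iterations of the same loop just wrote, so `nb.prange` here raced on its own inputs. A minimal sketch of the corrected kernel, assuming the numpy/numba imports and a decorator in the style npbench uses (the final diagonal square root is the standard Cholesky step, elided by the diff):

```python
import numpy as np
import numba as nb


@nb.njit  # decorator assumed; npbench wraps its Numba kernels similarly
def kernel(A):
    A[0, 0] = np.sqrt(A[0, 0])
    for i in range(1, A.shape[0]):
        # Must stay sequential: A[i, :j] holds values written by earlier
        # j-iterations of this same row.
        for j in range(i):
            A[i, j] -= np.dot(A[i, :j], A[j, :j])
            A[i, j] /= A[j, j]
        A[i, i] -= np.dot(A[i, :i], A[i, :i])
        A[i, i] = np.sqrt(A[i, i])  # assumed continuation (standard Cholesky)
```

The outer `i` loop carries a true dependency as well (row `i` needs all previous rows), so dropping `prange` from the inner loop is the safe choice.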
25 changes: 13 additions & 12 deletions npbench/infrastructure/dace_framework.py
@@ -185,14 +185,14 @@ def parallelize(sdfg):
         try:
 
             def autoopt(sdfg, device, symbols):  #, nofuse):
-                # Mark arrays as on the GPU
-                if device == dtypes.DeviceType.GPU:
-                    for k, v in sdfg.arrays.items():
-                        if not v.transient and type(v) == dace.data.Array:
-                            v.storage = dace.dtypes.StorageType.GPU_Global
+                # # Mark arrays as on the GPU
+                # if device == dtypes.DeviceType.GPU:
+                #     for k, v in sdfg.arrays.items():
+                #         if not v.transient and type(v) == dace.data.Array:
+                #             v.storage = dace.dtypes.StorageType.GPU_Global
 
                 # Auto-optimize SDFG
-                opt.auto_optimize(auto_opt_sdfg, device, symbols=symbols)
+                opt.auto_optimize(auto_opt_sdfg, device, symbols=symbols, use_gpu_storage=True)
 
             auto_opt_sdfg = copy.deepcopy(strict_sdfg)
             auto_opt_sdfg._name = 'auto_opt'
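With `use_gpu_storage=True`, DaCe's auto-optimizer itself places non-transient arrays in GPU global memory, which is why the hand-rolled marking loop is retired (kept only as a comment). A minimal sketch of the new call shape, with a toy program standing in for the benchmark SDFGs (requires a CUDA-capable DaCe setup):

```python
import dace
from dace import dtypes
from dace.transformation.auto import auto_optimize as opt


@dace.program
def scale(A: dace.float64[128, 128]):
    return A * 2.0


sdfg = scale.to_sdfg()
# One flag replaces the manual loop that set v.storage = GPU_Global on
# every non-transient array before auto-optimizing.
opt.auto_optimize(sdfg, dtypes.DeviceType.GPU, use_gpu_storage=True)
```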
@@ -229,9 +229,10 @@ def vectorize(sdfg, vec_len=None):
             dace.Config.set('library', 'blas', 'default_implementation', value='cuBLAS')
 
         def copy_to_gpu(sdfg):
-            for k, v in sdfg.arrays.items():
-                if not v.transient and isinstance(v, dace.data.Array):
-                    v.storage = dace.dtypes.StorageType.GPU_Global
+            opt.apply_gpu_storage(sdfg)
+            # for k, v in sdfg.arrays.items():
+            #     if not v.transient and isinstance(v, dace.data.Array):
+            #         v.storage = dace.dtypes.StorageType.GPU_Global
 
         if self.info["arch"] == "gpu":
             import cupy as cp
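Same cleanup one level down: `copy_to_gpu` now delegates to the library helper `opt.apply_gpu_storage`, which is expected to mirror the commented-out loop (marking non-transient arrays as `GPU_Global`). A sketch of that assumption, checked on a toy SDFG:

```python
import dace
from dace.transformation.auto import auto_optimize as opt


@dace.program
def double(A: dace.float64[64]):
    return A + A


sdfg = double.to_sdfg()
opt.apply_gpu_storage(sdfg)  # replaces the manual storage-marking loop
for name, desc in sdfg.arrays.items():
    if not desc.transient and isinstance(desc, dace.data.Array):
        # If the helper mirrors the removed loop, non-transient arrays
        # now live in GPU global memory.
        assert desc.storage == dace.dtypes.StorageType.GPU_Global
```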
@@ -242,9 +243,9 @@ def copy_to_gpu(sdfg):
                 fe_time = t
                 if sdfg._name != 'auto_opt':
                     device = dtypes.DeviceType.GPU if self.info["arch"] == "gpu" else dtypes.DeviceType.CPU
-                    if self.info["arch"] == "cpu":
-                        # GPUTransform will set GPU schedules by itself
-                        opt.set_fast_implementations(sdfg, device)
+                    # if self.info["arch"] == "cpu":
+                    #     # GPUTransform will set GPU schedules by itself
+                    opt.set_fast_implementations(sdfg, device)
                 if self.info["arch"] == "gpu":
                     if sdfg._name in ['strict', 'parallel', 'fusion']:
                         _, gpu_time1 = util.benchmark("copy_to_gpu(sdfg)",
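Dropping the `arch == "cpu"` guard means fast library implementations are now selected for every non-auto-optimized SDFG on both targets, instead of leaving GPU scheduling entirely to GPUTransform (the old comment's rationale). A minimal CPU-side sketch of the call, with an illustrative program in place of the benchmarks:

```python
import dace
from dace import dtypes
from dace.transformation.auto import auto_optimize as opt


@dace.program
def matmul(A: dace.float64[64, 64], B: dace.float64[64, 64]):
    return A @ B


sdfg = matmul.to_sdfg()
# In the PR this is GPU when self.info["arch"] == "gpu"; the call now runs
# unconditionally and picks fast expansions (e.g. MKL, cuBLAS) per device.
opt.set_fast_implementations(sdfg, dtypes.DeviceType.CPU)
```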
4 changes: 2 additions & 2 deletions npbench/infrastructure/test.py
@@ -43,8 +43,8 @@ def _execute(self, frmwrk: Framework, impl: Callable, impl_name: str, mode: str,
             out = [out]
         else:
             out = []
-        if "out_args" in self.bench.info.keys():
-            out += [ldict[a] for a in self.frmwrk.args(self.bench)]
+        if "output_args" in self.bench.info.keys():
+            out += [ldict[a] for a in frmwrk.args(self.bench)]
         return out, timelist
 
     def run(self, preset: str, validate: bool, repeat: int, timeout: float = 200.0, ignore_errors: bool = True):
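Two fixes share this hunk: the benchmark metadata key is `output_args` (the old `out_args` lookup never matched, so declared outputs were silently skipped during validation), and the argument names come from the `frmwrk` parameter passed to `_execute` rather than the instance's `self.frmwrk`. A hypothetical standalone reproduction of the key fix; the dict shapes are assumptions, not npbench internals:

```python
bench_info = {"output_args": ["C"]}   # assumed benchmark metadata
ldict = {"C": [1.0, 2.0]}             # locals captured after running the impl
args = ["C"]                          # stand-in for frmwrk.args(bench)

out = []
if "out_args" in bench_info:          # old check: never true, outputs dropped
    out += [ldict[a] for a in args]
print(out)                            # -> []

out = []
if "output_args" in bench_info:       # fixed check
    out += [ldict[a] for a in args]
print(out)                            # -> [[1.0, 2.0]]
```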
14 changes: 9 additions & 5 deletions npbench/infrastructure/utilities.py
@@ -134,16 +134,20 @@ def inner(_it, _timer{init}):
 
 def benchmark(stmt, setup="pass", out_text="", repeat=1, context={}, output=None, verbose=True):
 
-    timeit.template = timeit_tmpl.format(init='{init}', setup='{setup}', stmt='{stmt}', output=output)
-
     ldict = {**context}
-    output = timeit.repeat(stmt, setup=setup, repeat=repeat, number=1, globals=ldict)
-    res = output[0][1]
-    raw_time_list = [a for a, _ in output]
+    raw_time_list = timeit.repeat(stmt, setup=setup, repeat=repeat, number=1, globals=ldict)
     raw_time = np.median(raw_time_list)
     ms_time = time_to_ms(raw_time)
     if verbose:
         print("{}: {}ms".format(out_text, ms_time))
 
+    if output is not None:
+        exec(setup, context)
+        exec(stmt, context)
+        res = context[output]
+    else:
+        res = None
+
     return res, raw_time_list
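The rewrite decouples timing from output capture. Previously a custom `timeit.template` made each `timeit.repeat` sample return a `(time, value)` pair; now `timeit.repeat` returns plain floats, and the output value (when requested) comes from one extra, untimed `exec` of `setup` and `stmt` into `context`. A runnable sketch of the revised helper; `time_to_ms` is assumed to match npbench's millisecond conversion:

```python
import timeit

import numpy as np


def time_to_ms(raw: float) -> int:
    return int(round(raw * 1000))  # assumed npbench conversion


def benchmark(stmt, setup="pass", out_text="", repeat=1, context={},
              output=None, verbose=True):
    ldict = {**context}
    # Timed runs: plain wall-clock floats, no custom template needed.
    raw_time_list = timeit.repeat(stmt, setup=setup, repeat=repeat,
                                  number=1, globals=ldict)
    if verbose:
        print("{}: {}ms".format(out_text, time_to_ms(np.median(raw_time_list))))
    if output is not None:
        # One extra, untimed run solely to materialize the named output.
        exec(setup, context)
        exec(stmt, context)
        res = context[output]
    else:
        res = None
    return res, raw_time_list


# Example: time a reduction and also retrieve the computed value.
res, times = benchmark("s = np.sum(a)", setup="a = np.ones(10**6)",
                       out_text="numpy sum", repeat=3,
                       context={"np": np}, output="s")
```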

