From e542965a98de63a74eb4a3276ba5c0cd801b5f62 Mon Sep 17 00:00:00 2001 From: Jan Kleine Date: Sat, 16 Mar 2024 17:26:15 +0100 Subject: [PATCH] Fix Profiler + Minor improvements (#1548) There is currently a bug in the profiler, that the first run recorded includes the runtime of all the warmup runs, making one ugly outlier. This is because `times[0]` is set before the warmups and is never properly reset. This fixes that and makes the surrounding code a bit cleaner (e.g. using `np.diff` instead of list comprehension). I also added some options that make the profiler more usable when it's called inside of other `tqdm` progress bars (allows setting `tqdm`'s `leave=False` and stop it from printing results). List of changes: - Fix: timing error - Add: `tqdm_leave` parameter - Add: `print_results` parameter - Cleanup: Only print `Profiling...` if `tqdm` is not used - Cleanup: use `warnings.warn()` instead of `print("WARNING: ...")` - Cleanup: remove unused variable for return value (didn't seem to be used) - Cleanup: make diff calculation more concise --------- Co-authored-by: Tal Ben-Nun --- dace/builtin_hooks.py | 13 ++++++-- dace/frontend/operations.py | 62 +++++++++++++++++++++++++------------ 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/dace/builtin_hooks.py b/dace/builtin_hooks.py index 6af346e033..65b5c9b1a2 100644 --- a/dace/builtin_hooks.py +++ b/dace/builtin_hooks.py @@ -15,7 +15,12 @@ @contextmanager -def profile(repetitions: int = 100, warmup: int = 0): +def profile( + repetitions: int = 100, + warmup: int = 0, + tqdm_leave: bool = True, + print_results: bool = True, +): """ Context manager that enables profiling of each called DaCe program. If repetitions is greater than 1, the program is run multiple times and the average execution time is reported. @@ -35,6 +40,10 @@ def profile(repetitions: int = 100, warmup: int = 0): :param repetitions: The number of times to run each DaCe program. :param warmup: Number of additional repetitions to run the program without measuring time. + :param tqdm_leave: Sets the ``leave`` parameter of the ``tqdm`` progress bar (useful + for nested progress bars). Ignored if tqdm progress bar is not used. + :param print_results: Whether or not to print the median execution time after + all repetitions. :note: Running functions multiple times may affect the results of the program. """ from dace.frontend.operations import CompiledSDFGProfiler # Avoid circular import @@ -51,7 +60,7 @@ def profile(repetitions: int = 100, warmup: int = 0): yield hook return - profiler = CompiledSDFGProfiler(repetitions, warmup) + profiler = CompiledSDFGProfiler(repetitions, warmup, tqdm_leave, print_results) with on_compiled_sdfg_call(context_manager=profiler): yield profiler diff --git a/dace/frontend/operations.py b/dace/frontend/operations.py index 98dff2ba1e..11df057ee5 100644 --- a/dace/frontend/operations.py +++ b/dace/frontend/operations.py @@ -1,6 +1,7 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. from __future__ import print_function from functools import partial +from itertools import chain, repeat from contextlib import contextmanager from timeit import default_timer as timer @@ -10,6 +11,7 @@ import sympy import os import sys +import warnings from dace import dtypes from dace.config import Config @@ -28,12 +30,20 @@ class CompiledSDFGProfiler: times: List[Tuple['SDFG', List[float]]] #: The list of SDFGs and times for each SDFG called within the context. - def __init__(self, repetitions: int = 0, warmup: int = 0) -> None: + def __init__( + self, + repetitions: int = 0, + warmup: int = 0, + tqdm_leave: bool = True, + print_results: bool = True, + ) -> None: # Avoid import loop from dace.codegen.instrumentation import report self.repetitions = repetitions or int(Config.get('treps')) self.warmup = warmup + self.tqdm_leave = tqdm_leave + self.print_results = print_results if self.repetitions < 1: raise ValueError('Number of repetitions must be at least 1') if self.warmup < 0: @@ -47,34 +57,45 @@ def __init__(self, repetitions: int = 0, warmup: int = 0) -> None: def __call__(self, compiled_sdfg: 'CompiledSDFG', args: Tuple[Any, ...]): from dace.codegen.instrumentation import report # Avoid import loop - start = timer() + # zeros to overwrite start time, followed by indices for each repetition + iterator = chain(repeat(0, self.warmup), range(1, self.repetitions + 1)) - times = [start] * (self.repetitions + 1) - ret = None - print('\nProfiling...') - - iterator = range(self.warmup + self.repetitions) if Config.get_bool('profiling_status'): try: from tqdm import tqdm - iterator = tqdm(iterator, desc="Profiling", file=sys.stdout) + + iterator = tqdm( + iterator, + desc='Profiling', + total=(self.warmup + self.repetitions), + file=sys.stdout, + leave=self.tqdm_leave, + ) except ImportError: - print('WARNING: Cannot show profiling progress, missing optional ' - 'dependency tqdm...\n\tTo see a live progress bar please install ' - 'tqdm (`pip install tqdm`)\n\tTo disable this feature (and ' - 'this warning) set `profiling_status` to false in the dace ' - 'config (~/.dace.conf).') + warnings.warn( + 'Cannot show profiling progress, missing optional dependency ' + 'tqdm...\n\tTo see a live progress bar please install tqdm ' + '(`pip install tqdm`)\n\tTo disable this feature (and this ' + 'warning) set `profiling_status` to false in the dace config ' + '(~/.dace.conf).' + ) + print('\nProfiling...') + else: + print('\nProfiling...') - offset = 1 - self.warmup start_time = int(time.time()) + + times = np.ndarray(self.repetitions + 1, dtype=np.float64) times[0] = timer() + for i in iterator: # Call function compiled_sdfg._cfunc(compiled_sdfg._libhandle, *args) - if i >= self.warmup: - times[i + offset] = timer() - diffs = np.array([(times[i] - times[i - 1])*1e3 for i in range(1, self.repetitions + 1)]) + times[i] = timer() + + # compute pairwise differences and convert to milliseconds + diffs = np.diff(times) * 1e3 # Add entries to the instrumentation report self.report.name = self.report.name or start_time @@ -88,8 +109,9 @@ def __call__(self, compiled_sdfg: 'CompiledSDFG', args: Tuple[Any, ...]): self.report.durations[(0, -1, -1)][f'Python call to {compiled_sdfg.sdfg.name}'][-1].extend(diffs) # Print profiling results - time_msecs = np.median(diffs) - print(compiled_sdfg.sdfg.name, time_msecs, 'ms') + if self.print_results: + time_msecs = np.median(diffs) + print(compiled_sdfg.sdfg.name, time_msecs, 'ms') # Save every call separately self.times.append((compiled_sdfg.sdfg, diffs)) @@ -105,7 +127,7 @@ def __call__(self, compiled_sdfg: 'CompiledSDFG', args: Tuple[Any, ...]): # Restore state after skipping contents compiled_sdfg.do_not_execute = old_dne - return ret + return None def detect_reduction_type(wcr_str, openmp=False):