From bfe69232c82b7608608ab51cc5d5c5c2646461c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20M=C3=BCller?= <147368808+philip-paul-mueller@users.noreply.github.com> Date: Thu, 4 Jan 2024 02:16:12 +0100 Subject: [PATCH] Faster Call for `CompiledSDFG` (#1467) This PR adds a new way to call a `CompiledSDFG`. Previously, `__call__()` performed a lot of operations, which can be summarized as: - Ensuring that the arguments are ordered correctly. - Transforming them to the right type (`ndarray` to pointers). Especially when benchmarking smaller SDFGs, this is a lot of overhead, which actually dominates the execution time. Furthermore, the runtime heavily depends on the number of arguments. To solve this, this PR introduces the `fast_call()` function, which expects that its arguments are already in the right order and cast to the right type. In addition, it does some refactoring and splits `_construct_args()` into multiple parts. --------- Co-authored-by: Philip Mueller Co-authored-by: Tal Ben-Nun --- dace/codegen/compiled_sdfg.py | 228 +++++++++++++++++++++------------- 1 file changed, 142 insertions(+), 86 deletions(-) diff --git a/dace/codegen/compiled_sdfg.py b/dace/codegen/compiled_sdfg.py index 7de385cead..fdf68214ed 100644 --- a/dace/codegen/compiled_sdfg.py +++ b/dace/codegen/compiled_sdfg.py @@ -33,7 +33,7 @@ def __init__(self, library_filename, program_name): """ self._stub_filename = os.path.join( os.path.dirname(os.path.realpath(library_filename)), - 'libdacestub_%s.%s' % (program_name, Config.get('compiler', 'library_extension'))) + f'libdacestub_{program_name}.{Config.get("compiler", "library_extension")}') self._library_filename = os.path.realpath(library_filename) self._stub = None self._lib = None @@ -47,7 +47,7 @@ def get_symbol(self, name, restype=ctypes.c_int): func = self._stub.get_symbol(self._lib, ctypes.c_char_p(name.encode())) if func is None: - raise KeyError('Function %s not found in library %s' % (name, 
os.path.basename(self._library_filename))) + raise KeyError(f'Function {name} not found in library {os.path.basename(self._library_filename)}') return ctypes.CFUNCTYPE(restype)(func) @@ -105,15 +105,14 @@ def load(self): is_loaded = self._stub.is_library_loaded(lib_cfilename) if is_loaded == 1: - warnings.warn('Library %s already loaded, renaming file' % self._library_filename) + warnings.warn(f'Library {self._library_filename} already loaded, renaming file') try: shutil.copyfile(self._library_filename, self._library_filename + '_') self._library_filename += '_' except shutil.Error: - raise cgx.DuplicateDLLError('Library %s is already loaded somewhere else ' % - os.path.basename(self._library_filename) + - 'and cannot be unloaded. Please use a different name ' + - 'for the SDFG/program.') + raise cgx.DuplicateDLLError(f'Library {os.path.basename(self._library_filename)}' + 'is already loaded somewhere else and cannot be unloaded. ' + 'Please use a different name for the SDFG/program.') # Actually load the library self._lib = ctypes.c_void_p(self._stub.load_library(lib_cfilename)) @@ -126,7 +125,7 @@ def load(self): result = subprocess.run(['ld', self._library_filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) stderr = result.stderr.decode('utf-8') reason = 'Reason:\n' + '\n'.join([l for l in stderr.split('\n') if '_start' not in l]) - raise RuntimeError('Could not load library %s. %s' % (os.path.basename(self._library_filename), reason)) + raise RuntimeError(f'Could not load library {os.path.basename(self._library_filename)}. {reason}') def unload(self): """ Unloads the internal library using the stub. 
""" @@ -212,6 +211,7 @@ def __init__(self, sdfg, lib: ReloadableDLL, argnames: List[str] = None): self.has_gpu_code = True break + def get_exported_function(self, name: str, restype=None) -> Optional[Callable[..., Any]]: """ Tries to find a symbol by name in the compiled SDFG, and convert it to a callable function @@ -225,6 +225,7 @@ def get_exported_function(self, name: str, restype=None) -> Optional[Callable[.. except KeyError: # Function not found return None + def get_state_struct(self) -> ctypes.Structure: """ Attempt to parse the SDFG source code and extract the state struct. This method will parse the first consecutive entries in the struct that are pointers. As soon as a non-pointer or other unparseable field is @@ -238,6 +239,7 @@ def get_state_struct(self) -> ctypes.Structure: return ctypes.cast(self._libhandle, ctypes.POINTER(self._try_parse_state_struct())).contents + def _try_parse_state_struct(self) -> Optional[Type[ctypes.Structure]]: from dace.codegen.targets.cpp import mangle_dace_state_struct_name # Avoid import cycle # the path of the main sdfg file containing the state struct @@ -365,25 +367,71 @@ def _get_error_text(self, result: Union[str, int]) -> str: else: return result + def __call__(self, *args, **kwargs): - # Update arguments from ordered list - if len(args) > 0 and self.argnames is not None: - kwargs.update({aname: arg for aname, arg in zip(self.argnames, args)}) + """ + Forwards the Python call to the compiled ``SDFG``. + + The order of the positional arguments is expected to be the same as in + the ``argnames`` member. The function will roughly perform the + following tasks: + - Change the order of the Python arguments into the one required by + the binary. + - Performing some basic sanity checks. + - Transforming the Python arguments into their ``C`` equivalents. + - Allocate the memory for the return values. + - Call the ``C` function. 
+ + :note: The memory for the return values is only allocated the first + time this function is called. Thus, this function will always + return the same objects. To force the allocation of new memory + you can call ``clear_return_values()`` in advance. + """ + if self.argnames is None and len(args) != 0: + raise KeyError(f"Passed positional arguments to an SDFG that does not accept them.") + elif len(args) > 0 and self.argnames is not None: + kwargs.update( + # `_construct_args` will handle all of its arguments as kwargs. + {aname: arg for aname, arg in zip(self.argnames, args)} + ) + argtuple, initargtuple = self._construct_args(kwargs) # Missing arguments will be detected here. + # Return values are cached in `self._lastargs`. + return self.fast_call(argtuple, initargtuple, do_gpu_check=True) + + + def fast_call( + self, + callargs: Tuple[Any, ...], + initargs: Tuple[Any, ...], + do_gpu_check: bool = False, + ) -> Union[Tuple[Any, ...], Any]: + """ + Calls the underlying binary functions directly, bypassing + argument sanitization. - try: - argtuple, initargtuple = self._construct_args(kwargs) + This is a faster, but less user-friendly version of ``__call__()``. + While ``__call__()`` will transform its Python arguments such that + they can be forwarded, this function assumes that this processing + was already done by the user. + :param callargs: Arguments passed to the actual computation. + :param initargs: Arguments passed to the initialization function. + :param do_gpu_check: Check if errors happened on the GPU. + + :note: You may use `_construct_args()` to generate the processed arguments. 
+ """ + try: # Call initializer function if necessary, then SDFG if self._initialized is False: self._lib.load() - self._initialize(initargtuple) + self._initialize(initargs) - with hooks.invoke_compiled_sdfg_call_hooks(self, argtuple): + with hooks.invoke_compiled_sdfg_call_hooks(self, callargs): if self.do_not_execute is False: - self._cfunc(self._libhandle, *argtuple) + self._cfunc(self._libhandle, *callargs) - if self.has_gpu_code: - # Optionally get errors from call + # Optionally get errors from call + if do_gpu_check and self.has_gpu_code: try: lasterror = common.get_gpu_runtime().get_last_error_string() except RuntimeError as ex: @@ -399,6 +447,7 @@ def __call__(self, *args, **kwargs): self._lib.unload() raise + def __del__(self): if self._initialized is True: self.finalize() @@ -406,24 +455,30 @@ def __del__(self): self._libhandle = ctypes.c_void_p(0) self._lib.unload() + def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: - """ Main function that controls argument construction for calling - the C prototype of the SDFG. + """ + Main function that controls argument construction for calling + the C prototype of the SDFG. + + Organizes arguments first by ``sdfg.arglist``, then data descriptors + by alphabetical order, then symbols by alphabetical order. - Organizes arguments first by `sdfg.arglist`, then data descriptors - by alphabetical order, then symbols by alphabetical order. + :note: If not initialized this function will initialize the memory for + the return values, however, it might also reallocate said memory. + :note: This function will also update the internal argument cache. """ - # Return value initialization (for values that have not been given) self._initialize_return_values(kwargs) + + # Add the return values to the arguments, since they are part of the C signature. 
for desc, arr in zip(self._retarray_shapes, self._return_arrays): kwargs[desc[0]] = arr - # Argument construction sig = self._sig typedict = self._typedict if len(kwargs) > 0: # Construct mapping from arguments to signature - arglist = [] + arglist = [] argtypes = [] argnames = [] for a in sig: @@ -433,38 +488,58 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: argnames.append(a) except KeyError: raise KeyError("Missing program argument \"{}\"".format(a)) + else: arglist = [] argtypes = [] argnames = [] sig = [] + # Type checking + no_view_arguments = not Config.get_bool('compiler', 'allow_view_arguments') for i, (a, arg, atype) in enumerate(zip(argnames, arglist, argtypes)): - if not dtypes.is_array(arg) and isinstance(atype, dt.Array): + is_array = dtypes.is_array(arg) + is_ndarray = isinstance(arg, np.ndarray) + is_dtArray = isinstance(atype, dt.Array) + if not is_array and is_dtArray: if isinstance(arg, list): - print('WARNING: Casting list argument "%s" to ndarray' % a) + print(f'WARNING: Casting list argument "{a}" to ndarray') elif arg is None: if atype.optional is False: # If array cannot be None raise TypeError(f'Passing a None value to a non-optional array in argument "{a}"') # Otherwise, None values are passed as null pointers below else: - raise TypeError('Passing an object (type %s) to an array in argument "%s"' % - (type(arg).__name__, a)) - elif dtypes.is_array(arg) and not isinstance(atype, dt.Array): + raise TypeError(f'Passing an object (type {type(arg).__name__}) to an array in argument "{a}"') + elif is_array and not is_dtArray: # GPU scalars and return values are pointers, so this is fine if atype.storage != dtypes.StorageType.GPU_Global and not a.startswith('__return'): - raise TypeError('Passing an array to a scalar (type %s) in argument "%s"' % (atype.dtype.ctype, a)) + raise TypeError(f'Passing an array to a scalar (type {atype.dtype.ctype}) in argument "{a}"') + elif (is_dtArray and is_ndarray and not 
isinstance(atype, dt.StructArray) + and atype.dtype.as_numpy_dtype() != arg.dtype): + # Make exception for vector types + if (isinstance(atype.dtype, dtypes.vector) and atype.dtype.vtype.as_numpy_dtype() == arg.dtype): + pass + else: + print(f'WARNING: Passing {arg.dtype} array argument "{a}" to a {atype.dtype.type.__name__} array') + elif is_dtArray and is_ndarray and arg.base is not None and not '__return' in a and no_view_arguments: + raise TypeError(f'Passing a numpy view (e.g., sub-array or "A.T") "{a}" to DaCe ' + 'programs is not allowed in order to retain analyzability. ' + 'Please make a copy with "numpy.copy(...)". If you know what ' + 'you are doing, you can override this error in the ' + 'configuration by setting compiler.allow_view_arguments ' + 'to True.') elif (not isinstance(atype, (dt.Array, dt.Structure)) and not isinstance(atype.dtype, dtypes.callback) and not isinstance(arg, (atype.dtype.type, sp.Basic)) and not (isinstance(arg, symbolic.symbol) and arg.dtype == atype.dtype)): - if isinstance(arg, int) and atype.dtype.type == np.int64: + is_int = isinstance(arg, int) + if is_int and atype.dtype.type == np.int64: pass - elif isinstance(arg, float) and atype.dtype.type == np.float64: + elif (is_int and atype.dtype.type == np.int32 and abs(arg) <= (1 << 31) - 1): pass - elif (isinstance(arg, int) and atype.dtype.type == np.int32 and abs(arg) <= (1 << 31) - 1): + elif (is_int and atype.dtype.type == np.uint32 and arg >= 0 and arg <= (1 << 32) - 1): pass - elif (isinstance(arg, int) and atype.dtype.type == np.uint32 and arg >= 0 and arg <= (1 << 32) - 1): + elif isinstance(arg, float) and atype.dtype.type == np.float64: pass elif (isinstance(arg, str) or arg is None) and atype.dtype == dtypes.string: if arg is None: @@ -475,24 +550,7 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: else: warnings.warn(f'Casting scalar argument "{a}" from {type(arg).__name__} to {atype.dtype.type}') arglist[i] = atype.dtype.type(arg) - elif 
(isinstance(atype, dt.Array) and isinstance(arg, np.ndarray) and not isinstance(atype, dt.StructArray) - and atype.dtype.as_numpy_dtype() != arg.dtype): - # Make exception for vector types - if (isinstance(atype.dtype, dtypes.vector) and atype.dtype.vtype.as_numpy_dtype() == arg.dtype): - pass - else: - print('WARNING: Passing %s array argument "%s" to a %s array' % - (arg.dtype, a, atype.dtype.type.__name__)) - elif (isinstance(atype, dt.Array) and isinstance(arg, np.ndarray) and arg.base is not None - and not '__return' in a and not Config.get_bool('compiler', 'allow_view_arguments')): - raise TypeError(f'Passing a numpy view (e.g., sub-array or "A.T") "{a}" to DaCe ' - 'programs is not allowed in order to retain analyzability. ' - 'Please make a copy with "numpy.copy(...)". If you know what ' - 'you are doing, you can override this error in the ' - 'configuration by setting compiler.allow_view_arguments ' - 'to True.') - # Explicit casting for index, (arg, argtype) in enumerate(zip(arglist, argtypes)): # Call a wrapper function to make NumPy arrays from pointers. 
if isinstance(argtype.dtype, dtypes.callback): @@ -505,52 +563,47 @@ def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]: arglist[index] = ctypes.c_void_p(0) # Retain only the element datatype for upcoming checks and casts - arg_ctypes = [t.dtype.as_ctypes() for t in argtypes] - - sdfg = self._sdfg - - # Obtain SDFG constants - constants = sdfg.constants + arg_ctypes = tuple(at.dtype.as_ctypes() for at in argtypes) + + constants = self.sdfg.constants + callparams = tuple( + (actype(arg.get()) + if isinstance(arg, symbolic.symbol) + else arg, actype, atype, aname + ) + for arg, actype, atype, aname in zip(arglist, arg_ctypes, argtypes, argnames) + if not (symbolic.issymbolic(arg) and (hasattr(arg, 'name') and arg.name in constants)) + ) - # Remove symbolic constants from arguments - callparams = tuple((arg, actype, atype, aname) - for arg, actype, atype, aname in zip(arglist, arg_ctypes, argtypes, argnames) - if not symbolic.issymbolic(arg) or (hasattr(arg, 'name') and arg.name not in constants)) - - # Replace symbols with their values - callparams = tuple((actype(arg.get()) if isinstance(arg, symbolic.symbol) else arg, actype, atype, aname) - for arg, actype, atype, aname in callparams) - - # Construct init args, which only consist of the symbols symbols = self._free_symbols initargs = tuple( actype(arg) if not isinstance(arg, ctypes._SimpleCData) else arg - for arg, actype, atype, aname in callparams if aname in symbols) - - # Replace arrays with their base host/device pointers - newargs = tuple((ctypes.c_void_p(_array_interface_ptr(arg, atype.storage)), actype, - atype) if dtypes.is_array(arg) else (arg, actype, atype) - for arg, actype, atype, _ in callparams) + for arg, actype, atype, aname in callparams + if aname in symbols + ) try: - newargs = tuple( - actype(arg) if not isinstance(arg, (ctypes._SimpleCData)) else arg - for arg, actype, atype in newargs) - except TypeError: - # Pinpoint bad argument - for i, (arg, actype, _) in 
enumerate(newargs): - try: - if not isinstance(arg, ctypes._SimpleCData): - actype(arg) - except TypeError as ex: - raise TypeError(f'Invalid type for scalar argument "{callparams[i][3]}": {ex}') + # Replace arrays with their base host/device pointers + newargs = [None] * len(callparams) + for i, (arg, actype, atype, _) in enumerate(callparams): + if dtypes.is_array(arg): + newargs[i] = ctypes.c_void_p(_array_interface_ptr(arg, atype.storage)) # `c_void_p` is subclass of `ctypes._SimpleCData`. + elif not isinstance(arg, (ctypes._SimpleCData)): + newargs[i] = actype(arg) + else: + newargs[i] = arg + + except TypeError as ex: + raise TypeError(f'Invalid type for scalar argument "{callparams[i][3]}": {ex}') self._lastargs = newargs, initargs return self._lastargs + def clear_return_values(self): self._create_new_arrays = True + def _create_array(self, _: str, dtype: np.dtype, storage: dtypes.StorageType, shape: Tuple[int], strides: Tuple[int], total_size: int): ndarray = np.ndarray @@ -575,10 +628,12 @@ def ndarray(*args, buffer=None, **kwargs): # Create an array with the properties of the SDFG array return ndarray(shape, dtype, buffer=zeros(total_size, dtype), strides=strides) + def _initialize_return_values(self, kwargs): # Obtain symbol values from arguments and constants syms = dict() - syms.update({k: v for k, v in kwargs.items() if k not in self.sdfg.arrays}) + sdfg_arrays = self.sdfg.arrays + syms.update({k: v for k, v in kwargs.items() if k not in sdfg_arrays}) syms.update(self.sdfg.constants) # Clear references from last call (allow garbage collection) @@ -624,6 +679,7 @@ def _initialize_return_values(self, kwargs): arr = self._create_array(*shape_desc) self._return_arrays.append(arr) + def _convert_return_values(self): # Return the values as they would be from a Python function if self._return_arrays is None or len(self._return_arrays) == 0: