diff --git a/alpaca/decorator.py b/alpaca/decorator.py index 76cc3f2..29d25e0 100644 --- a/alpaca/decorator.py +++ b/alpaca/decorator.py @@ -4,6 +4,7 @@ """ from functools import wraps +import itertools from collections.abc import Iterable from importlib.metadata import version, PackageNotFoundError import inspect @@ -68,18 +69,55 @@ class Provenance(object): data structures used by the function). Alpaca will track and identify the elements inside the container, instead of the container itself. Default: None - container_output : bool or int, optional + container_output : bool or int or tuple, optional The function outputs data inside a container (e.g., a list). + If True, Alpaca will track and identify the elements inside the container, instead of the container itself. It will iterate over the function output object and identify the individual elements. However, for dictionary outputs, the dictionary object is identified together with its elements, to retain information on the keys. For other containers, the container object is not identified. + If an integer, this defines a multiple-level (nested) container. The number defines the depth for which to identify and serialize the objects. In this case, the function output object will always be - identified together with the element tree. + identified together with the element tree. For instance, consider + the two-level list `L = [[obj1, obj2], [obj3, obj4]]`. With + `container_output=0`, there will be a single function output node for + list `L`. Starting from `L`, there will be two additional nodes for + each of the inner lists (`L[0]` and `L[1]`, i.e., all elements from + level zero). With `container_output=1`, there will be a single + function output node for list `L`. Starting from `L`, there + will be two additional nodes for each of the inner lists (`L[0]` and + `L[1]`). 
Finally, starting from each inner list, there will be output + nodes for `obj1` and `obj2` (linked to `L[0]`) and for `obj3` and + `obj4` (linked to `L[1]`). Therefore, all elements from level one are + identified, and linked to the respective elements from level zero. + + If a tuple, this defines a range of the levels in a nested container + to consider when identifying the objects output by the function. For + example, taking the same list above, `container_output=(0, 1)` will + start from level zero and stop at the elements from level + one (similar to `container_output=1`). With `container_output=(1, 1)`, + the first level will be ignored as function output. The function will + have two output nodes (directly for `L[0]` and `L[1]`). Starting from + each inner list, there will be output nodes for `obj1` and `obj2` + (linked to `L[0]`) and for `obj3` and `obj4` (linked to `L[1]`). + Therefore, the first level (zero) of the container is ignored, and only + elements from level one are described. The range feature is useful for + functions where the relevant outputs are containers whose elements + should also be described, but those containers are grouped inside a + single return list instead of the function returning a tuple with the + containers. + + It is important to note that all levels identified as integers or + range tuples should point to levels in the nested container that + contain iterables. For example, in the list `L` above, level 2 + comprises the objects `objX`. If `container_output=2`, Alpaca will try to + iterate over each `objX` and describe its elements. If they are + not iterable, an error will be raised. 
+ Default: False Attributes @@ -174,10 +212,17 @@ def __init__(self, inputs, file_input=None, file_output=None, # Store the names of arguments that are inputs self.inputs = inputs - self.container_output = container_output - self._tracking_container_output = \ - ((isinstance(container_output, bool) and container_output) or - (not isinstance(container_output, bool) and container_output >= 0)) + self.container_output = False + self._tracking_container_output = False + if isinstance(container_output, bool): + self._tracking_container_output = container_output + self.container_output = container_output + elif isinstance(container_output, tuple): + self._tracking_container_output = len(container_output) == 2 + self.container_output = container_output + elif isinstance(container_output, int): + self._tracking_container_output = container_output >= 0 + self.container_output = (0, container_output) def _insert_static_information(self, tree, data_info, function, time_stamp): @@ -486,7 +531,7 @@ def _add_container_relationships(self, container, data_info, level, # This will work whether the main container is a dictionary or # other iterable. 
if (level is not None and - level < self.container_output and + level < max(self.container_output) and (isinstance(element, Iterable) or hasattr(container, "__getitem__"))): self._add_container_relationships(element, data_info, @@ -500,11 +545,28 @@ def _capture_container_output(self, function_output, data_info, time_stamp_start, execution_id): level = None if isinstance(self.container_output, bool) else 0 - if isinstance(function_output, dict) or level is not None: + if isinstance(function_output, dict): container_info = self._add_container_relationships( function_output, data_info, level, time_stamp_start, execution_id) return {0: container_info} + elif level is not None: + if not self.container_output or min(self.container_output) == 0: + # Starting from zero + container_info = self._add_container_relationships( + function_output, data_info, level, time_stamp_start, + execution_id) + return {0: container_info} + else: + # Process range starting from other level + elements = function_output + start_level = min(self.container_output) - 1 + for level in range(start_level): + # Unpack all elements until the requested start level + elements = itertools.chain(*elements) + return {idx: self._add_container_relationships( + element, data_info, start_level + 1, time_stamp_start, + execution_id) for idx, element in enumerate(elements)} # Process simple container. # The container object will not be identified. diff --git a/alpaca/test/test_decorator.py b/alpaca/test/test_decorator.py index 7c8ea89..0625631 100644 --- a/alpaca/test/test_decorator.py +++ b/alpaca/test/test_decorator.py @@ -78,29 +78,88 @@ def multiple_inputs_function(array_1, array_2, param1, param2): @Provenance(inputs=['array'], container_output=True) def container_output_function(array, param1, param2): - """ Takes a single input and outputs multiple elements in a container""" + """ + Takes a single input and outputs multiple elements in a container. 
+ This function will have as tracked outputs all elements inside the first + level, i.e., the two NumPy arrays. + """ return [array + i for i in range(3, 5)] +@Provenance(inputs=['array'], container_output=0) +def container_output_function_level_0(array, param1, param2): + """ + Takes a single input and outputs multiple elements in a container. + This function will have as tracked output the return list. + Additional nodes from the list to each element inside the first level + (i.e., the two NumPy arrays) will be added. + """ + return [array + i for i in range(7, 9)] + + +@Provenance(inputs=['array'], container_output=1) +def container_output_function_level_1(array, param1, param2): + """ + Takes a single input and outputs multiple elements in a container. + This function will have as tracked output the return list. + Additional nodes from the list to each element inside the first level + (i.e., the two NumPy arrays) and from each array to all its elements + (i.e., the integers in the second level) will be added. + """ + return [array + i for i in range(2, 4)] + + +@Provenance(inputs=['array'], container_output=(0, 0)) +def container_output_function_level_range_0_0(array, param1, param2): + """ + Takes a single input and outputs multiple elements in a container. + As we are requesting to track from output level zero to zero, this + function will have as tracked output the return list. Additional nodes + from the list to each element inside the first level + (i.e., the two NumPy arrays) will be added. + """ + return [array + i for i in range(1, 3)] + + +@Provenance(inputs=['array'], container_output=(0, 1)) +def container_output_function_level_range_0_1(array, param1, param2): + """ + Takes a single input and outputs multiple elements in a container. + As we are requesting to track from output level zero to one, this function + will have as tracked output the return list. 
Additional nodes from the + list to each element inside the first level (i.e., the two NumPy arrays) + and from each array to all its elements (i.e., the integers in the second + level) will be added. + """ + return [array + i for i in range(5, 7)] + + +@Provenance(inputs=['array'], container_output=(1, 1)) +def container_output_function_level_range_1_1(array, param1, param2): + """ + Takes a single input and outputs multiple elements in a container. + As we are requesting to track from output level one to one, this function + will have as tracked outputs all elements inside the first + level, i.e., the two NumPy arrays. Additional nodes from each array to all + its elements (i.e., the integers in the second level) will be added. + This option skips adding an output node for the first level, i.e., the + list that contains the two arrays. + """ + return [array + i for i in range(4, 6)] + + +@Provenance(inputs=['array'], container_output=True) def dict_output_function(array, param1, param2): """ Takes as single input and outputs multiple elements in a dictionary """ return {f"key.{i}": array + i + 3 for i in range(0, 2)} -@Provenance(inputs=['array'], container_output=1) -def container_output_function_level(array, param1, param2): - """ Takes a single input and outputs multiple elements in a container""" - return [array + i for i in range(3, 5)] - - @Provenance(inputs=['array'], container_output=1) def dict_output_function_level(array, param1, param2): """ Takes as single input and outputs multiple elements in a dictionary """ return {f"key.{i}": array + i + 3 for i in range(0, 2)} - class NonIterableContainer(object): def __init__(self, start): @@ -126,9 +185,12 @@ def __init__(self, start): def __getitem__(self, item): return self._data[item] + + NonIterableContainerOutputObject.__init__ = \ Provenance(inputs=[], container_output=0)(NonIterableContainerOutputObject.__init__) + # Function to help verifying FunctionExecution tuples def _check_function_execution(actual, 
exp_function, exp_input, exp_params, exp_output, exp_arg_map, exp_kwarg_map, @@ -634,9 +696,74 @@ def test_container_output_function(self): exp_order=1, test_case=self) - def test_container_output_function_level(self): + def test_container_output_function_level_0(self): + activate(clear=True) + res = container_output_function_level_0(TEST_ARRAY, 3, 6) + deactivate() + + self.assertEqual(len(Provenance.history), 3) + + expected_output = DataObject( + hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", + type="builtins.list", id=id(res), details={}) + + expected_container_1 = DataObject( + hash=joblib.hash(TEST_ARRAY + 7, hash_name='sha1'), + hash_method="joblib_SHA1", + type="numpy.ndarray", id=id(res[0]), + details={'shape': (3,), 'dtype': np.int64}) + + expected_container_2 = DataObject( + hash=joblib.hash(TEST_ARRAY + 8, hash_name='sha1'), + hash_method="joblib_SHA1", + type="numpy.ndarray", id=id(res[1]), + details={'shape': (3,), 'dtype': np.int64}) + + # Check the subscript of each array with respect to the list returned + _check_function_execution( + actual=Provenance.history[0], + exp_function=FunctionInfo('subscript', '', ''), + exp_input={0: expected_output}, + exp_params={'index': 0}, + exp_output={0: expected_container_1}, + exp_arg_map=None, + exp_kwarg_map=None, + exp_code_stmnt=None, + exp_return_targets=[], + exp_order=None, + test_case=self) + + _check_function_execution( + actual=Provenance.history[1], + exp_function=FunctionInfo('subscript', '', ''), + exp_input={0: expected_output}, + exp_params={'index': 1}, + exp_output={0: expected_container_2}, + exp_arg_map=None, + exp_kwarg_map=None, + exp_code_stmnt=None, + exp_return_targets=[], + exp_order=None, + test_case=self) + + # Main function execution + _check_function_execution( + actual=Provenance.history[2], + exp_function=FunctionInfo('container_output_function_level_0', + 'test_decorator', ''), + exp_input={'array': TEST_ARRAY_INFO}, + exp_params={'param1': 3, 'param2': 6}, + 
exp_output={0: expected_output}, + exp_arg_map=['array', 'param1', 'param2'], + exp_kwarg_map=[], + exp_code_stmnt="res = container_output_function_level_0(TEST_ARRAY, 3, 6)", + exp_return_targets=['res'], + exp_order=1, + test_case=self) + + def test_container_output_function_level_1(self): activate(clear=True) - res = container_output_function_level(TEST_ARRAY, 4, 6) + res = container_output_function_level_1(TEST_ARRAY, 4, 6) deactivate() self.assertEqual(len(Provenance.history), 9) @@ -656,13 +783,175 @@ def test_container_output_function_level(self): type="builtins.list", id=id(res), details={}) expected_container_1 = DataObject( + hash=joblib.hash(TEST_ARRAY + 2, hash_name='sha1'), + hash_method="joblib_SHA1", + type="numpy.ndarray", id=id(res[0]), + details={'shape': (3,), 'dtype': np.int64}) + + expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 3, hash_name='sha1'), hash_method="joblib_SHA1", + type="numpy.ndarray", id=id(res[1]), + details={'shape': (3,), 'dtype': np.int64}) + + # Check subscript of each element with respect to the array + containers = [expected_container_1, expected_container_2] + for history_index, element_index in zip( + (1, 2, 3, 5, 6, 7), + ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2))): + container = element_index[0] + element = element_index[1] + _check_function_execution( + actual=Provenance.history[history_index], + exp_function=FunctionInfo('subscript', '', ''), + exp_input={0: containers[container]}, + exp_params={'index': element}, + exp_output={0: elements[container][element]}, + exp_arg_map=None, + exp_kwarg_map=None, + exp_code_stmnt=None, + exp_return_targets=[], + exp_order=None, + test_case=self) + + # Check the subscript of each array with respect to the list returned + _check_function_execution( + actual=Provenance.history[0], + exp_function=FunctionInfo('subscript', '', ''), + exp_input={0: expected_output}, + exp_params={'index': 0}, + exp_output={0: expected_container_1}, + exp_arg_map=None, + 
exp_kwarg_map=None, + exp_code_stmnt=None, + exp_return_targets=[], + exp_order=None, + test_case=self) + + _check_function_execution( + actual=Provenance.history[4], + exp_function=FunctionInfo('subscript', '', ''), + exp_input={0: expected_output}, + exp_params={'index': 1}, + exp_output={0: expected_container_2}, + exp_arg_map=None, + exp_kwarg_map=None, + exp_code_stmnt=None, + exp_return_targets=[], + exp_order=None, + test_case=self) + + # Main function execution + _check_function_execution( + actual=Provenance.history[8], + exp_function=FunctionInfo('container_output_function_level_1', + 'test_decorator', ''), + exp_input={'array': TEST_ARRAY_INFO}, + exp_params={'param1': 4, 'param2': 6}, + exp_output={0: expected_output}, + exp_arg_map=['array', 'param1', 'param2'], + exp_kwarg_map=[], + exp_code_stmnt="res = container_output_function_level_1(TEST_ARRAY, 4, 6)", + exp_return_targets=['res'], + exp_order=1, + test_case=self) + + def test_container_output_function_level_range_0_0(self): + # Should be similar to `container_output=0` + activate(clear=True) + res = container_output_function_level_range_0_0(TEST_ARRAY, 3, 6) + deactivate() + + self.assertEqual(len(Provenance.history), 3) + + expected_output = DataObject( + hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", + type="builtins.list", id=id(res), details={}) + + expected_container_1 = DataObject( + hash=joblib.hash(TEST_ARRAY + 1, hash_name='sha1'), + hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), details={'shape': (3,), 'dtype': np.int64}) expected_container_2 = DataObject( - hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), + hash=joblib.hash(TEST_ARRAY + 2, hash_name='sha1'), + hash_method="joblib_SHA1", + type="numpy.ndarray", id=id(res[1]), + details={'shape': (3,), 'dtype': np.int64}) + + # Check the subscript of each array with respect to the list returned + _check_function_execution( + actual=Provenance.history[0], + exp_function=FunctionInfo('subscript', 
'', ''), + exp_input={0: expected_output}, + exp_params={'index': 0}, + exp_output={0: expected_container_1}, + exp_arg_map=None, + exp_kwarg_map=None, + exp_code_stmnt=None, + exp_return_targets=[], + exp_order=None, + test_case=self) + + _check_function_execution( + actual=Provenance.history[1], + exp_function=FunctionInfo('subscript', '', ''), + exp_input={0: expected_output}, + exp_params={'index': 1}, + exp_output={0: expected_container_2}, + exp_arg_map=None, + exp_kwarg_map=None, + exp_code_stmnt=None, + exp_return_targets=[], + exp_order=None, + test_case=self) + + # Main function execution + _check_function_execution( + actual=Provenance.history[2], + exp_function=FunctionInfo('container_output_function_level_range_0_0', + 'test_decorator', ''), + exp_input={'array': TEST_ARRAY_INFO}, + exp_params={'param1': 3, 'param2': 6}, + exp_output={0: expected_output}, + exp_arg_map=['array', 'param1', 'param2'], + exp_kwarg_map=[], + exp_code_stmnt="res = container_output_function_level_range_0_0(TEST_ARRAY, 3, 6)", + exp_return_targets=['res'], + exp_order=1, + test_case=self) + + def test_container_output_function_level_range_0_1(self): + # Should be similar to `container_output=1` + activate(clear=True) + res = container_output_function_level_range_0_1(TEST_ARRAY, 4, 6) + deactivate() + + self.assertEqual(len(Provenance.history), 9) + + elements = [[], []] + for idx, container in enumerate(res): + for el_idx, element in enumerate(container): + element_info = DataObject( + hash=joblib.hash(element, hash_name="sha1"), + hash_method="joblib_SHA1", + type="numpy.int64", id=None, + details={'shape': (), 'dtype': np.int64}) + elements[idx].append(element_info) + + expected_output = DataObject( + hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", + type="builtins.list", id=id(res), details={}) + + expected_container_1 = DataObject( + hash=joblib.hash(TEST_ARRAY + 5, hash_name='sha1'), + hash_method="joblib_SHA1", + type="numpy.ndarray", id=id(res[0]), 
+ details={'shape': (3,), 'dtype': np.int64}) + + expected_container_2 = DataObject( + hash=joblib.hash(TEST_ARRAY + 6, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), details={'shape': (3,), 'dtype': np.int64}) @@ -717,14 +1006,79 @@ def test_container_output_function_level(self): # Main function execution _check_function_execution( actual=Provenance.history[8], - exp_function=FunctionInfo('container_output_function_level', + exp_function=FunctionInfo('container_output_function_level_range_0_1', 'test_decorator', ''), exp_input={'array': TEST_ARRAY_INFO}, exp_params={'param1': 4, 'param2': 6}, exp_output={0: expected_output}, exp_arg_map=['array', 'param1', 'param2'], exp_kwarg_map=[], - exp_code_stmnt="res = container_output_function_level(TEST_ARRAY, 4, 6)", + exp_code_stmnt="res = container_output_function_level_range_0_1(TEST_ARRAY, 4, 6)", + exp_return_targets=['res'], + exp_order=1, + test_case=self) + + def test_container_output_function_level_range_1_1(self): + activate(clear=True) + res = container_output_function_level_range_1_1(TEST_ARRAY, 4, 6) + deactivate() + + self.assertEqual(len(Provenance.history), 7) + + elements = [[], []] + for idx, container in enumerate(res): + for el_idx, element in enumerate(container): + element_info = DataObject( + hash=joblib.hash(element, hash_name="sha1"), + hash_method="joblib_SHA1", + type="numpy.int64", id=None, + details={'shape': (), 'dtype': np.int64}) + elements[idx].append(element_info) + + expected_container_1 = DataObject( + hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), + hash_method="joblib_SHA1", + type="numpy.ndarray", id=id(res[0]), + details={'shape': (3,), 'dtype': np.int64}) + + expected_container_2 = DataObject( + hash=joblib.hash(TEST_ARRAY + 5, hash_name='sha1'), + hash_method="joblib_SHA1", + type="numpy.ndarray", id=id(res[1]), + details={'shape': (3,), 'dtype': np.int64}) + + # Check subscript of each element with respect to the array + containers = 
[expected_container_1, expected_container_2] + for history_index, element_index in zip( + (0, 1, 2, 3, 4, 5), + ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2))): + container = element_index[0] + element = element_index[1] + _check_function_execution( + actual=Provenance.history[history_index], + exp_function=FunctionInfo('subscript', '', ''), + exp_input={0: containers[container]}, + exp_params={'index': element}, + exp_output={0: elements[container][element]}, + exp_arg_map=None, + exp_kwarg_map=None, + exp_code_stmnt=None, + exp_return_targets=[], + exp_order=None, + test_case=self) + + # Main function execution + # There is no single list return directly from the function + _check_function_execution( + actual=Provenance.history[6], + exp_function=FunctionInfo('container_output_function_level_range_1_1', + 'test_decorator', ''), + exp_input={'array': TEST_ARRAY_INFO}, + exp_params={'param1': 4, 'param2': 6}, + exp_output={0: expected_container_1, 1: expected_container_2}, + exp_arg_map=['array', 'param1', 'param2'], + exp_kwarg_map=[], + exp_code_stmnt="res = container_output_function_level_range_1_1(TEST_ARRAY, 4, 6)", exp_return_targets=['res'], exp_order=1, test_case=self) @@ -944,7 +1298,6 @@ def test_non_iterable_container_output(self): exp_order=1, test_case=self) - def test_comprehensions(self): activate(clear=True) num_list = [comprehension_function(i) for i in range(3)]