From e228585353e8491ee180432b29a47348000279d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= <42555442+kohlerca@users.noreply.github.com> Date: Wed, 6 Dec 2023 10:16:13 +0100 Subject: [PATCH] Capture and serialization of object values (#27) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Implemented capture of values of strings, booleans, complex and numeric types. * Added serialization of values, with unit tests * Implemented capture of values of user-requested types, using the settings function. They are captured as strings. * Added option to display stored value for the node in the visualization graph --------- Co-authored-by: Cristiano Köhler --- alpaca/alpaca_types.py | 2 +- alpaca/data_information.py | 35 ++++- alpaca/decorator.py | 18 ++- alpaca/graph.py | 26 ++- alpaca/serialization/prov.py | 40 +++++ alpaca/settings.py | 13 +- alpaca/test/res/values.ttl | 111 +++++++++++++ alpaca/test/test_code_analysis.py | 63 +++++--- alpaca/test/test_data_information.py | 29 ++++ alpaca/test/test_decorator.py | 226 +++++++++++++++++++-------- alpaca/test/test_graph.py | 26 +++ alpaca/test/test_serialization.py | 94 +++++++++-- 12 files changed, 563 insertions(+), 120 deletions(-) create mode 100644 alpaca/test/res/values.ttl diff --git a/alpaca/alpaca_types.py b/alpaca/alpaca_types.py index aacd846..6610cbe 100644 --- a/alpaca/alpaca_types.py +++ b/alpaca/alpaca_types.py @@ -61,6 +61,6 @@ # the disk. 
DataObject = namedtuple('DataObject', ('hash', 'hash_method', 'type', 'id', - 'details')) + 'details', 'value')) File = namedtuple('File', ('hash', 'hash_type', 'path')) diff --git a/alpaca/data_information.py b/alpaca/data_information.py index f6eedbb..03dc8d3 100644 --- a/alpaca/data_information.py +++ b/alpaca/data_information.py @@ -19,6 +19,7 @@ import joblib import numpy as np +from numbers import Number from dill._dill import save_function from alpaca.alpaca_types import DataObject, File @@ -118,6 +119,11 @@ class _ObjectInformation(object): :func:`hash` function, depending on the `use_builtin_hash` parameter set during initialization. + The values of objects of the builtin types `str`, `bool`, `int`, `complex` + and `float` as well as the NumPy numeric types (e.g., `np.float64`) will + be stored. Additional object types to be stored (e.g., `builtins.dict`) + can be defined with the `store_values` parameter. + The method `info` is called to obtain the provenance information associated with the object during tracking, as the `DataObject` named tuple. The relevant metadata attributes are also stored in the tuple. @@ -132,6 +138,13 @@ class _ObjectInformation(object): List of package names whose object hashes will be computed using the Python builtin `hash` function, instead of `joblib.hash` function. Default: None + store_values : list, optional + List of object types whose values will be stored in the provenance + information (e.g., `builtins.dict`). This is in addition to the + builtin types `str`, `bool`, `int`, `complex` and `float` as well as + the NumPy numeric types (e.g., `np.float64`). The values of these are + always stored. 
+ Default: None """ # This is a list of object attributes that provide relevant provenance @@ -140,10 +153,12 @@ class _ObjectInformation(object): 'id', 'nix_name', 'dimensionality', 'pid', 'create_time') - def __init__(self, use_builtin_hash=None): + def __init__(self, use_builtin_hash=None, store_values=None): self._hash_memoizer = dict() self._use_builtin_hash = copy(use_builtin_hash) \ if use_builtin_hash is not None else [] + self._store_values = copy(store_values)\ + if store_values is not None else [] @staticmethod def _get_object_package(obj): @@ -258,6 +273,11 @@ def info(self, obj): Reference for the object. * details : dict Extended information (metadata) on the object. + * value : object + For builtin objects (`str`, `int`, `float`, `bool`, `complex`) + or equivalent objects (e.g. `numpy.float64`), the value is + stored. Additional object types specified with the + :attr:`store_values` list will also be stored. """ type_information = type(obj) obj_type = f"{type_information.__module__}.{type_information.__name__}" @@ -267,7 +287,8 @@ def info(self, obj): if obj is None: unique_id = uuid.uuid4() return DataObject(hash=unique_id, hash_method="UUID", - type=obj_type, id=obj_id, details={}) + type=obj_type, id=obj_id, details={}, + value=None) # Here we can extract specific metadata to record details = {} @@ -290,5 +311,13 @@ def info(self, obj): obj_id=obj_id, package=package) + # Store object value + obj_value = None + if isinstance(obj, (str, bool, Number)): + obj_value = obj + elif obj_type in self._store_values: + obj_value = str(obj) + return DataObject(hash=obj_hash, hash_method=hash_method, - type=obj_type, id=obj_id, details=details) + type=obj_type, id=obj_id, details=details, + value=obj_value) diff --git a/alpaca/decorator.py b/alpaca/decorator.py index 29d25e0..da16640 100644 --- a/alpaca/decorator.py +++ b/alpaca/decorator.py @@ -422,7 +422,8 @@ def _capture_code_and_function_provenance(self, lineno, function): return source_line, ast_tree, 
return_targets, function_info def _capture_input_and_parameters_provenance(self, function, args, kwargs, - ast_tree, function_info, time_stamp_start, builtin_object_hash): + ast_tree, function_info, time_stamp_start, builtin_object_hash, + store_values): # 1. Extract the parameters passed to the function and store them in # the `input_data` dictionary. @@ -446,7 +447,8 @@ def _capture_input_and_parameters_provenance(self, function, args, kwargs, # After this step, all hashes and metadata of input parameters/files # are going to be stored in the dictionary `inputs`. - data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash) + data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash, + store_values=store_values) # Initialize parameter list with all default arguments that were not # passed to the function @@ -576,11 +578,12 @@ def _capture_container_output(self, function_output, data_info, def _capture_output_provenance(self, function_output, return_targets, input_data, builtin_object_hash, time_stamp_start, execution_id, - constructed_object=None): + store_values, constructed_object=None): # In case in-place operations were performed, lets not use # memoization - data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash) + data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash, + store_values=store_values) # 6. Create hash for the output using `_ObjectInformation` to follow # individual returns. 
The hashes will be stored in the `outputs` @@ -626,7 +629,8 @@ def wrapped(*args, **kwargs): builtin_object_hash = _ALPACA_SETTINGS[ 'use_builtin_hash_for_module'] - logger.debug(f"Builtin object hash: {builtin_object_hash}") + store_values = _ALPACA_SETTINGS['store_values'] + logger.debug(f"Builtin object hash: {builtin_object_hash}") lineno = None @@ -667,7 +671,8 @@ def wrapped(*args, **kwargs): function=function, args=args, kwargs=kwargs, ast_tree=ast_tree, function_info=function_info, time_stamp_start=time_stamp_start, - builtin_object_hash=builtin_object_hash) + builtin_object_hash=builtin_object_hash, + store_values=store_values) # Call the function function_output = function(*args, **kwargs) @@ -694,6 +699,7 @@ def wrapped(*args, **kwargs): builtin_object_hash=builtin_object_hash, time_stamp_start=time_stamp_start, execution_id=execution_id, + store_values=store_values, constructed_object=constructed_object) # Get the end time stamp diff --git a/alpaca/graph.py b/alpaca/graph.py index 5983273..198e366 100644 --- a/alpaca/graph.py +++ b/alpaca/graph.py @@ -98,7 +98,7 @@ def _get_name_value_pair(graph, bnode): def _get_entity_data(graph, entity, annotations=None, attributes=None, - strip_namespace=True): + strip_namespace=True, value_attribute=None): filter_map = defaultdict(list) filter_map.update( @@ -118,6 +118,12 @@ def _get_entity_data(graph, entity, annotations=None, attributes=None, _add_attribute(data, attr_name, attr_type, attr_value, strip_namespace) + # Get the stored value if requested and present + if value_attribute: + value = graph.value(entity, PROV.value) + if value is not None: + data[value_attribute] = value.toPython() + if data['type'] == NSS_FILE: file_path = str(list(graph.objects(entity, ALPACA.filePath))[0]) data["File_path"] = file_path @@ -223,6 +229,15 @@ class name of the object (e.g., `ndarray`). The `Python_name` node time interval strings in the format supported by the Gephi timeline feature. If False, the attribute is not included. 
Default: True + value_attribute : str, optional + If provided, an attribute named `value_attribute` will be added to + the node attributes to show the values stored in the provenance + information. Alpaca stores the values of objects of the builtin types + `str`, `bool`, `int`, `float` and `complex`, as well as the NumPy + numeric types (e.g. `numpy.float64`) by default. The values of + additional types can be defined using the + :func:`alpaca.settings.alpaca_setting` function. + Default: None Attributes ---------- @@ -235,7 +250,7 @@ class name of the object (e.g., `ndarray`). The `Python_name` node def __init__(self, *prov_file, annotations=None, attributes=None, strip_namespace=True, remove_none=True, use_name_in_parameter=True, use_class_in_method_name=True, - time_intervals=True): + time_intervals=True, value_attribute=None): # Load PROV records from the file(s) doc = AlpacaProvDocument() @@ -250,7 +265,7 @@ def __init__(self, *prov_file, annotations=None, attributes=None, strip_namespace=strip_namespace, remove_none=remove_none, use_name_in_parameter=use_name_in_parameter, use_class_in_method_name=use_class_in_method_name, - time_intervals=time_intervals + time_intervals=time_intervals, value_attribute=value_attribute ) if time_intervals: @@ -319,7 +334,7 @@ def _transform_graph(graph, annotations=None, attributes=None, strip_namespace=True, remove_none=True, use_name_in_parameter=True, use_class_in_method_name=True, - time_intervals=True): + time_intervals=True, value_attribute=None): # Transform an RDFlib graph obtained from the PROV data, so that the # visualization is simplified. A new `nx.DiGraph` object is created # and returned. 
Annotations and attributes of the entities stored in @@ -341,7 +356,8 @@ def _transform_graph(graph, annotations=None, attributes=None, data = _get_entity_data(graph, entity, annotations=annotations, attributes=attributes, - strip_namespace=strip_namespace) + strip_namespace=strip_namespace, + value_attribute=value_attribute) transformed.add_node(node_id, **data) # Add all the edges. diff --git a/alpaca/serialization/prov.py b/alpaca/serialization/prov.py index 8aa0036..7b4f188 100644 --- a/alpaca/serialization/prov.py +++ b/alpaca/serialization/prov.py @@ -10,6 +10,8 @@ """ from itertools import product +import numpy as np +import numbers from rdflib import Graph, URIRef, BNode, Literal from rdflib.namespace import RDF, PROV, XSD @@ -59,6 +61,14 @@ class AlpacaProvDocument(object): control the serialization. """ + XSD_TYPES = { + numbers.Integral: XSD.integer, + numbers.Real: XSD.double, + numbers.Complex: XSD.string, + str: XSD.string, + bool: XSD.boolean, + } + def __init__(self): self.graph = Graph() self.graph.namespace_manager.bind('alpaca', ALPACA) @@ -142,6 +152,30 @@ def _add_FunctionExecution(self, script_info, session_id, execution_id, return uri # Entity methods + @classmethod + def _get_entity_value_datatype(cls, info): + value = info.value + if value is None: + return None + + # Check if builtin type or NumPy dtype + value_class = value.__class__ if not isinstance(value, np.number) \ + else value.dtype.type + if value_class in cls.XSD_TYPES: + return cls.XSD_TYPES[value_class] + + # Check if object is included in the `store_values` setting. 
+ # In this case, they are always stored as strings + obj_type = info.type + if obj_type in _ALPACA_SETTINGS['store_values']: + return XSD.string + + for possible_type in (numbers.Integral, numbers.Real, numbers.Complex): + if issubclass(value_class, possible_type): + return cls.XSD_TYPES[possible_type] + + # Type not found + return None def _add_DataObjectEntity(self, info): # Adds a DataObjectEntity from the Alpaca PROV model @@ -152,6 +186,12 @@ def _add_DataObjectEntity(self, info): return uri self.graph.add((uri, RDF.type, ALPACA.DataObjectEntity)) self.graph.add((uri, ALPACA.hashSource, Literal(info.hash_method))) + + value_datatype = self._get_entity_value_datatype(info) + if value_datatype: + self.graph.add((uri, PROV.value, + Literal(info.value, datatype=value_datatype))) + self._add_entity_metadata(uri, info) self._entity_uris.add(uri) return uri diff --git a/alpaca/settings.py b/alpaca/settings.py index 6860755..0545989 100644 --- a/alpaca/settings.py +++ b/alpaca/settings.py @@ -51,6 +51,16 @@ Default: "my-authority" +* **store_values**: list of str + The values of the objects from the types in the list will be stored + together with the provenance information. Note that objects of the + builtin types `str`, `bool`, `int`, `float` and `complex`, as well as + the NumPy numeric types (e.g. `numpy.float64`) are stored by default. + This option should be used to store values of more complex types, such + as dictionaries. In this case, the list in this setting should have + the `builtins.dict` entry. The strings are the full path to the Python + object, i.e., `[module].[...].[object_class]`. + To set/read a setting, use the function :func:`alpaca_setting`. @@ -61,7 +71,8 @@ # Should be modified only through the `alpaca_setting` function. 
_ALPACA_SETTINGS = {'use_builtin_hash_for_module': [], - 'authority': "my-authority"} + 'authority': "my-authority", + 'store_values': []} def alpaca_setting(name, value=None): diff --git a/alpaca/test/res/values.ttl b/alpaca/test/res/values.ttl new file mode 100644 index 0000000..4483803 --- /dev/null +++ b/alpaca/test/res/values.ttl @@ -0,0 +1,111 @@ +@prefix alpaca: . +@prefix prov: . +@prefix xsd: . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 1 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 1.1e+00 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value "test"^^xsd:string . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value "(3+5j)"^^xsd:string . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value true . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 1.2e+00 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 1.3e+00 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 2 . 
+ + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 3 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value -4 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value "{'id': [1, 2, 3], 'value': {4, 5, 6}}"^^xsd:string . + + a alpaca:DataObjectEntity ; + prov:wasAttributedTo ; + alpaca:hashSource "joblib_SHA1" . + + + a alpaca:FunctionExecution ; + prov:startedAtTime "2022-05-02T12:34:56.123456"^^xsd:dateTime ; + prov:endedAtTime "2022-05-02T12:35:56.123456"^^xsd:dateTime ; + prov:used ; + prov:wasAssociatedWith ; + alpaca:codeStatement "test_function(input_1, 5)" ; + alpaca:executionOrder 1 ; + alpaca:usedFunction ; + alpaca:hasParameter [ a alpaca:NameValuePair ; + alpaca:pairName "param_1" ; + alpaca:pairValue 5 ] . + + a alpaca:Function ; + alpaca:functionName "test_function" ; + alpaca:implementedIn "test" ; + alpaca:functionVersion "0.0.1" . + + a alpaca:ScriptAgent ; + alpaca:scriptPath "/script.py" . 
diff --git a/alpaca/test/test_code_analysis.py b/alpaca/test/test_code_analysis.py index f98a947..957de08 100644 --- a/alpaca/test/test_code_analysis.py +++ b/alpaca/test/test_code_analysis.py @@ -28,28 +28,32 @@ TEST_ARRAY_INFO = DataObject(hash=joblib_hash(TEST_ARRAY), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) ELEMENT_0_INFO = DataObject(hash=joblib_hash(TEST_ARRAY[0]), hash_method="joblib_SHA1", type="numpy.int64", id=id(TEST_ARRAY[0]), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=1) ELEMENT_1_INFO = DataObject(hash=joblib_hash(TEST_ARRAY[1]), hash_method="joblib_SHA1", type="numpy.int64", id=id(TEST_ARRAY[1]), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=2) ELEMENT_2_INFO = DataObject(hash=joblib_hash(TEST_ARRAY[2]), hash_method="joblib_SHA1", type="numpy.int64", id=id(TEST_ARRAY[2]), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=3) TEST_DICT = {'numbers': TEST_ARRAY} TEST_DICT_INFO = DataObject(hash=joblib_hash(TEST_DICT), hash_method="joblib_SHA1", type="builtins.dict", id=id(TEST_DICT), - details={}) + details={}, value=None) # To test attributes @@ -93,7 +97,8 @@ def _check_function_execution(actual, exp_function, exp_input, exp_params, exp_code_stmnt, exp_return_targets, exp_order, test_case): - data_object_attributes = ('hash', 'hash_method', 'type', 'details') + data_object_attributes = ('hash', 'hash_method', 'type', 'details', + 'value') # Check function test_case.assertTupleEqual(actual.function, exp_function) @@ -155,7 +160,7 @@ def test_subscript_index(self): hash=joblib_hash(TEST_ARRAY[0]+TEST_ARRAY[1]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=3) 
_check_function_execution( actual=Provenance.history[0], @@ -203,7 +208,7 @@ def test_subscript_negative_index(self): hash=joblib_hash(TEST_ARRAY[-1]+TEST_ARRAY[-2]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=5) _check_function_execution( actual=Provenance.history[0], @@ -251,12 +256,12 @@ def test_subscript_slice(self): hash=joblib_hash(TEST_ARRAY[0]+TEST_ARRAY[1]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=3) expected_slice_output = DataObject( hash=joblib_hash(TEST_ARRAY[0:2]), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY[0:2]), - details={'shape': (2,), 'dtype': np.int64}) + details={'shape': (2,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -294,12 +299,12 @@ def test_subscript_slice_no_start(self): hash=joblib_hash(TEST_ARRAY[0]+TEST_ARRAY[1]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=3) expected_slice_output = DataObject( hash=joblib_hash(TEST_ARRAY[:2]), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY[:2]), - details={'shape': (2,), 'dtype': np.int64}) + details={'shape': (2,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -337,12 +342,12 @@ def test_subscript_slice_no_stop(self): hash=joblib_hash(TEST_ARRAY[1]+TEST_ARRAY[2]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=5) expected_slice_output = DataObject( hash=joblib_hash(TEST_ARRAY[1:]), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY[1:]), - details={'shape': (2,), 'dtype': np.int64}) + details={'shape': (2,), 'dtype': 
np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -380,12 +385,12 @@ def test_subscript_slice_step(self): hash=joblib_hash(TEST_ARRAY[0]+TEST_ARRAY[2]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=4) expected_slice_output = DataObject( hash=joblib_hash(TEST_ARRAY[::2]), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY[::2]), - details={'shape': (2,), 'dtype': np.int64}) + details={'shape': (2,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -423,7 +428,7 @@ def test_subscript_index_str(self): hash=joblib_hash(np.sum(TEST_ARRAY)), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=6) _check_function_execution( actual=Provenance.history[0], @@ -462,7 +467,7 @@ def test_subscript_index_from_variable(self): hash=joblib_hash(np.sum(TEST_ARRAY)), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=6) _check_function_execution( actual=Provenance.history[0], @@ -500,12 +505,13 @@ def test_attribute(self): hash=joblib_hash(np.sum(TEST_ARRAY)), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=6) expected_container_info = DataObject( hash=joblib_hash(container_of_array), hash_method="joblib_SHA1", type="test_code_analysis.ContainerOfArray", - id=id(container_of_array), details={'array': TEST_ARRAY}) + id=id(container_of_array), details={'array': TEST_ARRAY}, + value=None) _check_function_execution( actual=Provenance.history[0], @@ -544,19 +550,22 @@ def test_attribute_method_call(self): hash=joblib_hash(np.sum(TEST_ARRAY)), hash_method="joblib_SHA1", type="numpy.int64", 
id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=6) object_info = DataObject( hash=joblib_hash(object_with_method), hash_method="joblib_SHA1", type="test_code_analysis.ObjectWithMethod", id=id(object_with_method), - details={}) + details={}, + value=None) expected_container_info = DataObject( hash=joblib_hash(container_of_array), hash_method="joblib_SHA1", type="test_code_analysis.ContainerOfArray", - id=id(container_of_array), details={'array': TEST_ARRAY}) + id=id(container_of_array), details={'array': TEST_ARRAY}, + value=None) _check_function_execution( actual=Provenance.history[0], @@ -596,17 +605,19 @@ def test_subscript_initializer(self): hash=joblib_hash(custom_object), hash_method="joblib_SHA1", type="test_code_analysis.CustomObject", id=id(custom_object), - details={'data': list_1}) + details={'data': list_1}, value=None) source_list_info = DataObject( hash=joblib_hash(source_data), hash_method="joblib_SHA1", - type="builtins.list", id=id(source_data), details={}) + type="builtins.list", id=id(source_data), details={}, + value=None) element_info = DataObject( hash=joblib_hash(list_1), hash_method="joblib_SHA1", - type="builtins.list", id=id(list_1), details={}) + type="builtins.list", id=id(list_1), details={}, + value=None) _check_function_execution( actual=Provenance.history[0], diff --git a/alpaca/test/test_data_information.py b/alpaca/test/test_data_information.py index 3d6e738..6b47032 100644 --- a/alpaca/test/test_data_information.py +++ b/alpaca/test/test_data_information.py @@ -106,6 +106,35 @@ def test_none(self): self.assertEqual(info.hash_method, "UUID") self.assertDictEqual(info.details, {}) + def test_store_value_requested(self): + object_info = _ObjectInformation(store_values=['builtins.dict']) + test_dict = dict(key=['3', '4']) + info = object_info.info(test_dict) + self.assertEqual(info.hash, joblib.hash(test_dict, hash_name='sha1')) + self.assertEqual(info.type, "builtins.dict") + 
self.assertEqual(info.hash_method, "joblib_SHA1") + self.assertDictEqual(info.details, {}) + self.assertEqual(info.value, "{'key': ['3', '4']}") + + def test_store_value_not_requested(self): + object_info = _ObjectInformation() + test_dict = dict(key=['3', '4']) + info = object_info.info(test_dict) + self.assertEqual(info.hash, joblib.hash(test_dict, hash_name='sha1')) + self.assertEqual(info.type, "builtins.dict") + self.assertEqual(info.hash_method, "joblib_SHA1") + self.assertDictEqual(info.details, {}) + self.assertEqual(info.value, None) + + def test_store_value_builtins(self): + object_info = _ObjectInformation() + info = object_info.info(5) + self.assertEqual(info.hash, joblib.hash(5, hash_name='sha1')) + self.assertEqual(info.type, "builtins.int") + self.assertEqual(info.hash_method, "joblib_SHA1") + self.assertDictEqual(info.details, {}) + self.assertEqual(info.value, 5) + def test_custom_class(self): custom_object_1 = ObjectClass(param=4) custom_object_2 = ObjectClass(param=3) diff --git a/alpaca/test/test_decorator.py b/alpaca/test/test_decorator.py index 0625631..2ea6dd3 100644 --- a/alpaca/test/test_decorator.py +++ b/alpaca/test/test_decorator.py @@ -15,7 +15,7 @@ import neo from alpaca import (Provenance, activate, deactivate, save_provenance, - print_history) + print_history, alpaca_setting) from alpaca.alpaca_types import (FunctionInfo, Container, DataObject, File) # Define some data and expected values test tracking @@ -24,14 +24,16 @@ TEST_ARRAY_INFO = DataObject(hash=joblib.hash(TEST_ARRAY, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) TEST_ARRAY_2 = np.array([4, 5, 6]) TEST_ARRAY_2_INFO = DataObject(hash=joblib.hash(TEST_ARRAY_2, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY_2), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': 
np.int64}, + value=None) CONTAINER = [TEST_ARRAY, TEST_ARRAY_2] @@ -388,7 +390,7 @@ def test_simple_function(self): hash=joblib.hash(TEST_ARRAY+3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -420,7 +422,7 @@ def test_simple_function_no_target(self): hash=joblib.hash(TEST_ARRAY+3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=output_id, - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -447,7 +449,7 @@ def test_kwargs_params(self): hash=joblib.hash(TEST_ARRAY+3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -474,7 +476,7 @@ def test_kwargs_params_default(self): hash=joblib.hash(TEST_ARRAY+5, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -501,7 +503,7 @@ def test_kwargs_params_default_override(self): hash=joblib.hash(TEST_ARRAY+5, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -529,7 +531,7 @@ def test_container_input_function(self): hash=joblib.hash(np.float64(3.5), hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.float64", id=id(avg), - details={'shape': (), 'dtype': np.float64}) + details={'shape': (), 'dtype': np.float64}, value=3.5) 
_check_function_execution( actual=Provenance.history[0], @@ -558,7 +560,7 @@ def test_varargs_input_function(self): hash=joblib.hash(np.float64(3.5), hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.float64", id=id(avg), - details={'shape': (), 'dtype': np.float64}) + details={'shape': (), 'dtype': np.float64}, value=3.5) _check_function_execution( actual=Provenance.history[0], @@ -586,7 +588,7 @@ def test_multiple_inputs_function(self): hash=joblib.hash(TEST_ARRAY+TEST_ARRAY_2, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -614,13 +616,13 @@ def test_multiple_outputs_function_elements(self): hash=joblib.hash(TEST_ARRAY+3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res1), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) expected_output_2 = DataObject( hash=joblib.hash(TEST_ARRAY+4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res2), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -647,7 +649,7 @@ def test_multiple_outputs_function_tuple(self): hash=joblib.hash((TEST_ARRAY+3, TEST_ARRAY+4), hash_name='sha1'), hash_method="joblib_SHA1", type="builtins.tuple", id=id(res), - details={}) + details={}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -674,13 +676,13 @@ def test_container_output_function(self): hash=joblib.hash(TEST_ARRAY + 3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) expected_output_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), 
hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -705,19 +707,19 @@ def test_container_output_function_level_0(self): expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.list", id=id(res), details={}) + type="builtins.list", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 7, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 8, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) # Check the subscript of each array with respect to the list returned _check_function_execution( @@ -770,29 +772,30 @@ def test_container_output_function_level_1(self): elements = [[], []] for idx, container in enumerate(res): - for el_idx, element in enumerate(container): + for element in container: element_info = DataObject( hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=element) elements[idx].append(element_info) expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.list", id=id(res), details={}) + type="builtins.list", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 2, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 'dtype': np.int64}) + 
details={'shape': (3,), 'dtype': np.int64}, value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) # Check subscript of each element with respect to the array containers = [expected_container_1, expected_container_2] @@ -866,19 +869,19 @@ def test_container_output_function_level_range_0_0(self): expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.list", id=id(res), details={}) + type="builtins.list", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 1, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 2, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) # Check the subscript of each array with respect to the list returned _check_function_execution( @@ -937,24 +940,24 @@ def test_container_output_function_level_range_0_1(self): hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=element) elements[idx].append(element_info) expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.list", id=id(res), details={}) + type="builtins.list", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 5, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - 
details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 6, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) # Check subscript of each element with respect to the array containers = [expected_container_1, expected_container_2] @@ -1032,20 +1035,23 @@ def test_container_output_function_level_range_1_1(self): hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=element) elements[idx].append(element_info) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 5, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) # Check subscript of each element with respect to the array containers = [expected_container_1, expected_container_2] @@ -1092,19 +1098,21 @@ def test_dict_output_function(self): expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.dict", id=id(res), details={}) + type="builtins.dict", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res['key.0']), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) expected_container_2 = DataObject( 
hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res['key.1']), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) _check_function_execution( actual=Provenance.history[0], @@ -1156,29 +1164,32 @@ def test_dict_output_function_level(self): elements = {'key.0': [], 'key.1': []} for key, container in res.items(): - for el_idx, element in enumerate(container): + for element in container: element_info = DataObject( hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=element) elements[key].append(element_info) expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.dict", id=id(res), details={}) + type="builtins.dict", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res['key.0']), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res['key.1']), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) # Check subscript of each element with respect to the array containers = { @@ -1254,18 +1265,19 @@ def test_non_iterable_container_output(self): self.assertEqual(len(Provenance.history), 4) elements = [] - for el_idx, element in enumerate(res): - element_info = DataObject( - hash=joblib.hash(element, hash_name="sha1"), - hash_method="joblib_SHA1", - type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) - elements.append(element_info) + for element in res: + element_info = 
DataObject( + hash=joblib.hash(element, hash_name="sha1"), + hash_method="joblib_SHA1", + type="numpy.int64", id=None, + details={'shape': (), 'dtype': np.int64}, + value=element) + elements.append(element_info) expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", type="test_decorator.NonIterableContainer", id=id(res), - details={'data': res.data}) + details={'data': res.data}, value=None) # Check subscript of each element with respect to the container for history_index in (0, 1, 2): @@ -1316,7 +1328,8 @@ def test_comprehensions(self): hash=joblib.hash(element, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.float64", id=id(element), - details={'shape': (), 'dtype': np.float64}) + details={'shape': (), 'dtype': np.float64}, + value=element) _check_function_execution( actual=Provenance.history[history], @@ -1338,7 +1351,8 @@ def test_comprehensions(self): hash=joblib.hash(element, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.float64", id=id(element), - details={'shape': (), 'dtype': np.float64}) + details={'shape': (), 'dtype': np.float64}, + value=element) _check_function_execution( actual=Provenance.history[history], @@ -1360,7 +1374,8 @@ def test_comprehensions(self): hash=joblib.hash(element, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.float64", id=id(element), - details={'shape': (), 'dtype': np.float64}) + details={'shape': (), 'dtype': np.float64}, + value=element) _check_function_execution( actual=Provenance.history[history], @@ -1409,7 +1424,7 @@ def test_file_input(self): expected_output = DataObject( hash=joblib.hash(expected_list, hash_name='sha1'), hash_method="joblib_SHA1", - type="builtins.list", id=id(res), details={}) + type="builtins.list", id=id(res), details={}, value=None) expected_file = File("96ccc1380e069667069acecea3e2ab559441657807e0a86d14f49028710ddb3a", hash_type="sha256", path=file_name) @@ -1441,14 +1456,14 @@ def test_file_output(self): expected_input = 
DataObject( hash=joblib.hash(input_list, hash_name='sha1'), hash_method="joblib_SHA1", - type="builtins.list", id=id(input_list), details={}) + type="builtins.list", id=id(input_list), details={}, value=None) # As None has its own UUID, let's get what was generated self.assertEqual(len(Provenance.history), 1) output_uuid = Provenance.history[0].output[0].hash expected_none_output = DataObject(hash=output_uuid, hash_method="UUID", - type="builtins.NoneType", id=id(res), details={}) + type="builtins.NoneType", id=id(res), details={}, value=None) expected_file = File("00d20b4831b0dadded2c633bdfc3dde3926fc17baaed51dacdab3e52a3b0d419", hash_type="sha256", path=Path(file_name)) @@ -1509,13 +1524,15 @@ def test_static_method(self): hash_method="joblib_SHA1", type="test_decorator.ObjectWithMethod", id=id(obj), - details={'coefficient': 2}) + details={'coefficient': 2}, + value=None) expected_output = DataObject( hash=joblib.hash(TEST_ARRAY+4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) _check_function_execution( actual=Provenance.history[0], @@ -1549,7 +1566,8 @@ def test_method_descriptor(self): 'file_origin': None, 'description': None, 'segment': None, 'units': pq.mV.units, 'shape': (3, 1), 'dtype': np.int64, 't_start': 0 * pq.s, 't_stop': 3 * pq.s, - 'dimensionality': pq.mV.dimensionality}) + 'dimensionality': pq.mV.dimensionality}, + value=None) expected_output = DataObject( hash=joblib.hash(reshaped, hash_name='sha1'), @@ -1561,7 +1579,8 @@ def test_method_descriptor(self): 'file_origin': None, 'description': None, 'segment': None, 'units': pq.mV.units, 'shape': (1, 3), 'dtype': np.int64, 't_start': 0 * pq.s, 't_stop': 1 * pq.s, - 'dimensionality': pq.mV.dimensionality}) + 'dimensionality': pq.mV.dimensionality}, + value=None) _check_function_execution( actual=Provenance.history[0], @@ -1589,7 +1608,8 @@ def 
test_class_constructor(self): hash_method="joblib_SHA1", type="test_decorator.ObjectWithMethod", id=id(obj), - details={'coefficient': 2}) + details={'coefficient': 2}, + value=None) _check_function_execution( actual=Provenance.history[0], @@ -1618,13 +1638,13 @@ def test_object_method(self): hash_method="joblib_SHA1", type="test_decorator.ObjectWithMethod", id=id(obj), - details={'coefficient': 2}) + details={'coefficient': 2}, value=None) expected_output = DataObject( hash=joblib.hash(TEST_ARRAY+2, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -1653,7 +1673,8 @@ def test_class_constructor_container_output(self): hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=element) elements.append(element_info) expected_output = DataObject( @@ -1661,7 +1682,8 @@ def test_class_constructor_container_output(self): hash_method="joblib_SHA1", type="test_decorator.NonIterableContainerOutputObject", id=id(obj), - details={'_data': obj._data}) + details={'_data': obj._data}, + value=None) # Check subscript of each element with respect to the container for history_index in (0, 1, 2): @@ -1695,5 +1717,79 @@ def test_class_constructor_container_output(self): test_case=self) +@Provenance(inputs=['source']) +def use_dict(source): + return 3 + + +class ProvenanceDecoratorStoreValuesTestCase(unittest.TestCase): + + def setUp(self): + alpaca_setting('store_values', []) + + def test_capture_dict(self): + # This should have values for both the input dictionary and the + # integer return + alpaca_setting('store_values', ['builtins.dict']) + activate(clear=True) + test_dict = dict(id=[1, 2, 3], value={4, 5, 6}) + res = use_dict(test_dict) + deactivate() + 
+ dict_info = DataObject(hash=joblib.hash(test_dict, hash_name='sha1'), + hash_method="joblib_SHA1", + type="builtins.dict", id=id(test_dict), + details={}, + value="{'id': [1, 2, 3], 'value': {4, 5, 6}}") + + expected_output = DataObject(hash=joblib.hash(3, hash_name='sha1'), + hash_method="joblib_SHA1", + type="builtins.int", id=id(res), + details={}, value=3) + + _check_function_execution( + actual=Provenance.history[0], + exp_function=FunctionInfo('use_dict', 'test_decorator', ''), + exp_input={'source': dict_info}, + exp_params={}, + exp_output={0: expected_output}, + exp_arg_map=['source'], + exp_kwarg_map=[], + exp_code_stmnt="res = use_dict(test_dict)", + exp_return_targets=['res'], + exp_order=1, + test_case=self) + + def test_capture_builtins_only(self): + # This should have values only for the integer return + activate(clear=True) + test_dict = dict(id=[1, 2, 3], value={4, 5, 6}) + res = use_dict(test_dict) + deactivate() + + dict_info = DataObject(hash=joblib.hash(test_dict, hash_name='sha1'), + hash_method="joblib_SHA1", + type="builtins.dict", id=id(test_dict), + details={}, value=None) + + expected_output = DataObject(hash=joblib.hash(3, hash_name='sha1'), + hash_method="joblib_SHA1", + type="builtins.int", id=id(res), + details={}, value=3) + + _check_function_execution( + actual=Provenance.history[0], + exp_function=FunctionInfo('use_dict', 'test_decorator', ''), + exp_input={'source': dict_info}, + exp_params={}, + exp_output={0: expected_output}, + exp_arg_map=['source'], + exp_kwarg_map=[], + exp_code_stmnt="res = use_dict(test_dict)", + exp_return_targets=['res'], + exp_order=1, + test_case=self) + + if __name__ == "__main__": unittest.main() diff --git a/alpaca/test/test_graph.py b/alpaca/test/test_graph.py index 46bb78b..f07314e 100644 --- a/alpaca/test/test_graph.py +++ b/alpaca/test/test_graph.py @@ -440,6 +440,32 @@ def test_remove_multiple_attributes_aggregation(self): self.assertTrue("Time Interval" not in node_attrs) 
self.assertTrue("sua" not in node_attrs) + def test_value_attribute(self): + input_file = self.ttl_path / "values.ttl" + graph = ProvenanceGraph(input_file, attributes='all', + annotations='all', value_attribute='value') + + node_values_by_id = { + "urn:fz-juelich.de:alpaca:object:Python:builtins.int:543211": 1, + "urn:fz-juelich.de:alpaca:object:Python:builtins.float:543212": 1.1, + "urn:fz-juelich.de:alpaca:object:Python:builtins.str:543213": "test", + "urn:fz-juelich.de:alpaca:object:Python:builtins.complex:543214": "(3+5j)", + "urn:fz-juelich.de:alpaca:object:Python:builtins.bool:543215": True, + "urn:fz-juelich.de:alpaca:object:Python:numpy.float32:543216": 1.2, + "urn:fz-juelich.de:alpaca:object:Python:numpy.float64:543217": 1.3, + "urn:fz-juelich.de:alpaca:object:Python:numpy.int64:543218": 2, + "urn:fz-juelich.de:alpaca:object:Python:numpy.int32:543219": 3, + "urn:fz-juelich.de:alpaca:object:Python:numpy.int16:5432110": -4, + "urn:fz-juelich.de:alpaca:object:Python:builtins.dict:5432111": "{'id': [1, 2, 3], 'value': {4, 5, 6}}", + "urn:fz-juelich.de:alpaca:object:Python:test.InputObject:12345": None, + "urn:fz-juelich.de:alpaca:object:Python:test.OutputObject:54321": None, + } + + for node, node_attrs in graph.graph.nodes(data=True): + if node_attrs['type'] == 'object': + expected_value = node_values_by_id[node] + self.assertEqual(expected_value, node_attrs.get('value', None)) + class GraphTimeIntervalTestCase(unittest.TestCase): diff --git a/alpaca/test/test_serialization.py b/alpaca/test/test_serialization.py index 79fc12f..96304b5 100644 --- a/alpaca/test/test_serialization.py +++ b/alpaca/test/test_serialization.py @@ -23,14 +23,15 @@ TEST_FUNCTION = FunctionInfo("test_function", "test", "0.0.1") # Object without metadata -INPUT = DataObject("12345", "joblib_SHA1", "test.InputObject", 12345, {}) +INPUT = DataObject("12345", "joblib_SHA1", "test.InputObject", 12345, {}, None) # Object with all main types of metadata INPUT_METADATA = DataObject("12345", 
"joblib_SHA1", "test.InputObject", 12345, details={'metadata_1': "value1", 'metadata_2': 5, 'metadata_3': 5.0, - 'metadata_4': True}) + 'metadata_4': True}, + value=None) OUTPUT_METADATA_NEO = DataObject("54321", "joblib_SHA1", "neo.core.SpikeTrain", 54321, @@ -42,7 +43,8 @@ [0, 1, 2, 3]), 'event': np.array( [True, False, False])} - }) + }, + value=None) # Object with special metadata @@ -51,15 +53,20 @@ OUTPUT_FILE = File("98765", "sha256", "/test_file_output") # Simple objects to test multiple inputs/outputs handling -INPUT_2 = DataObject("212345", "joblib_SHA1", "test.InputObject", 212345, {}) -OUTPUT = DataObject("54321", "joblib_SHA1", "test.OutputObject", 54321, {}) -OUTPUT_2 = DataObject("254321", "joblib_SHA1", "test.OutputObject", 254321, {}) +INPUT_2 = DataObject("212345", "joblib_SHA1", "test.InputObject", 212345, {}, + None) +OUTPUT = DataObject("54321", "joblib_SHA1", "test.OutputObject", 54321, {}, + None) +OUTPUT_2 = DataObject("254321", "joblib_SHA1", "test.OutputObject", 254321, {}, + None) # None output -NONE_OUTPUT = DataObject("777777", "UUID", "builtins.NoneType", 777777, {}) +NONE_OUTPUT = DataObject("777777", "UUID", "builtins.NoneType", 777777, {}, + None) # Object collections -COLLECTION = DataObject("888888", "joblib_SHA1", "builtins.list", 888888, {}) +COLLECTION = DataObject("888888", "joblib_SHA1", "builtins.list", 888888, {}, + None) # General information. 
Will be fixed across the tests TIMESTAMP_START = "2022-05-02T12:34:56.123456" @@ -83,6 +90,67 @@ def setUpClass(cls): cls.ttl_path = Path(__file__).parent / "res" alpaca_setting('authority', "fz-juelich.de") + def setUp(self): + alpaca_setting('store_values', []) + + def test_value_serialization(self): + # DataObject tuples for each type that should be captured + # They are output of the simulated output + alpaca_setting('store_values', ['builtins.dict']) + + INT = DataObject("543211", "joblib_SHA1", "builtins.int", 543211, + {}, 1) + FLOAT = DataObject("543212", "joblib_SHA1", "builtins.float", 543212, + {}, 1.1) + STR = DataObject("543213", "joblib_SHA1", "builtins.str", 543213, + {}, "test") + COMPLEX = DataObject("543214", "joblib_SHA1", "builtins.complex", + 543214, {}, 3+5j) + BOOL = DataObject("543215", "joblib_SHA1", "builtins.bool", 543215, + {}, True) + NUMPY_FLOAT32 = DataObject("543216", "joblib_SHA1", "numpy.float32", + 543216, {}, np.float32(1.2)) + NUMPY_FLOAT64 = DataObject("543217", "joblib_SHA1", "numpy.float64", + 543217, {}, np.float64(1.3)) + NUMPY_INT64 = DataObject("543218", "joblib_SHA1", "numpy.int64", + 543218, {}, np.int64(2)) + NUMPY_INT32 = DataObject("543219", "joblib_SHA1", "numpy.int32", + 543219, {}, np.int32(3)) + NUMPY_INT16 = DataObject("5432110", "joblib_SHA1", "numpy.int16", + 5432110, {}, np.int16(-4)) + + DICT = DataObject("5432111", "joblib_SHA1", "builtins.dict", + 5432111, {}, + str(dict(id=[1, 2, 3], value={4, 5, 6}))) + + function_execution = FunctionExecution( + function=TEST_FUNCTION, + input={'input_1': INPUT}, params={'param_1': 5}, + output={0: OUTPUT, 1: INT, 2: FLOAT, 3: STR, 4: COMPLEX, + 5: BOOL, 6: NUMPY_FLOAT32, 7: NUMPY_FLOAT64, + 8: NUMPY_INT64, 9: NUMPY_INT32, 10: NUMPY_INT16, + 11: DICT}, + call_ast=None, + arg_map=['input_1', 'param_1'], kwarg_map=[], return_targets=[], + time_stamp_start=TIMESTAMP_START, time_stamp_end=TIMESTAMP_END, + execution_id="12345", order=1, + 
code_statement="test_function(input_1, 5)" + ) + + # Load expected RDF graph + expected_graph_file = self.ttl_path / "values.ttl" + expected_graph = rdflib.Graph() + expected_graph.parse(expected_graph_file, format='turtle') + + # Serialize the history using AlpacaProv document + alpaca_prov = AlpacaProvDocument() + alpaca_prov.add_history(SCRIPT_INFO, SCRIPT_SESSION_ID, + history=[function_execution]) + + # Check if graphs are equal + self.assertTrue(assert_rdf_graphs_equal(alpaca_prov.graph, + expected_graph)) + def test_input_output_serialization(self): function_execution = FunctionExecution( function=TEST_FUNCTION, @@ -166,7 +234,7 @@ def test_class_method_serialization(self): hash_method="joblib_SHA1", type="test.ObjectWithMethod", id=232323, - details={}) + details={}, value=None) function_execution = FunctionExecution( function=FunctionInfo('ObjectWithMethod.process', @@ -437,16 +505,16 @@ def test_multiple_memberships(self): self.ttl_path = Path(__file__).parent / "res" super_container = DataObject("2333333", "joblib_SHA1", - "test.SuperContainer", 2333333, {}) + "test.SuperContainer", 2333333, {}, None) super_container_list = DataObject("23333332", "joblib_SHA1", - "builtins.list", 23333332, {}) + "builtins.list", 23333332, {}, None) container = DataObject("333333", "joblib_SHA1", "test.Container", 333333, - {}) + {}, None) container_list = DataObject("3333332", "joblib_SHA1", "builtins.list", - 3333332, {}) + 3333332, {}, None) attribute_access_container = FunctionExecution( function=FunctionInfo(name='attribute', module="", version=""),