From c86e8a43da36cceefb2070e50899e2cf9b9d3f8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= Date: Fri, 14 Jul 2023 14:28:01 +0200 Subject: [PATCH 1/9] Implemented capture of values of strings, booleans, complex and numeric types. --- alpaca/alpaca_types.py | 2 +- alpaca/data_information.py | 13 +++++++-- alpaca/test/test_code_analysis.py | 46 +++++++++++++++++-------------- alpaca/test/test_decorator.py | 42 ++++++++++++++-------------- alpaca/test/test_serialization.py | 33 +++++++++++++--------- 5 files changed, 80 insertions(+), 56 deletions(-) diff --git a/alpaca/alpaca_types.py b/alpaca/alpaca_types.py index aacd846..6610cbe 100644 --- a/alpaca/alpaca_types.py +++ b/alpaca/alpaca_types.py @@ -61,6 +61,6 @@ # the disk. DataObject = namedtuple('DataObject', ('hash', 'hash_method', 'type', 'id', - 'details')) + 'details', 'value')) File = namedtuple('File', ('hash', 'hash_type', 'path')) diff --git a/alpaca/data_information.py b/alpaca/data_information.py index f6eedbb..9cf3bd3 100644 --- a/alpaca/data_information.py +++ b/alpaca/data_information.py @@ -258,6 +258,10 @@ def info(self, obj): Reference for the object. * details : dict Extended information (metadata) on the object. + * value : object + For builtin objects (`str`, `int`, `float`, `bool`) or + equivalent objects (e.g. `numpy.float64`), the value is + stored. 
""" type_information = type(obj) obj_type = f"{type_information.__module__}.{type_information.__name__}" @@ -267,7 +271,8 @@ def info(self, obj): if obj is None: unique_id = uuid.uuid4() return DataObject(hash=unique_id, hash_method="UUID", - type=obj_type, id=obj_id, details={}) + type=obj_type, id=obj_id, details={}, + value=None) # Here we can extract specific metadata to record details = {} @@ -290,5 +295,9 @@ def info(self, obj): obj_id=obj_id, package=package) + obj_value = obj if isinstance(obj, (str, int, bool, complex, float, + np.number)) else None + return DataObject(hash=obj_hash, hash_method=hash_method, - type=obj_type, id=obj_id, details=details) + type=obj_type, id=obj_id, details=details, + value=obj_value) diff --git a/alpaca/test/test_code_analysis.py b/alpaca/test/test_code_analysis.py index 0060f20..b0ac144 100644 --- a/alpaca/test/test_code_analysis.py +++ b/alpaca/test/test_code_analysis.py @@ -28,28 +28,32 @@ TEST_ARRAY_INFO = DataObject(hash=joblib_hash(TEST_ARRAY), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) ELEMENT_0_INFO = DataObject(hash=joblib_hash(TEST_ARRAY[0]), hash_method="joblib_SHA1", type="numpy.int64", id=id(TEST_ARRAY[0]), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=1) ELEMENT_1_INFO = DataObject(hash=joblib_hash(TEST_ARRAY[1]), hash_method="joblib_SHA1", type="numpy.int64", id=id(TEST_ARRAY[1]), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=2) ELEMENT_2_INFO = DataObject(hash=joblib_hash(TEST_ARRAY[2]), hash_method="joblib_SHA1", type="numpy.int64", id=id(TEST_ARRAY[2]), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=3) TEST_DICT = {'numbers': TEST_ARRAY} TEST_DICT_INFO = DataObject(hash=joblib_hash(TEST_DICT), hash_method="joblib_SHA1", 
type="builtins.dict", id=id(TEST_DICT), - details={}) + details={}, value=None) # To test attributes @@ -95,7 +99,8 @@ def _check_function_execution(actual, exp_function, exp_input, exp_params, exp_code_stmnt, exp_return_targets, exp_order, test_case): - data_object_attributes = ('hash', 'hash_method', 'type', 'details') + data_object_attributes = ('hash', 'hash_method', 'type', 'details', + 'value') # Check function test_case.assertTupleEqual(actual.function, exp_function) @@ -157,7 +162,7 @@ def test_subscript_index(self): hash=joblib_hash(TEST_ARRAY[0]+TEST_ARRAY[1]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=3) _check_function_execution( actual=Provenance.history[0], @@ -205,7 +210,7 @@ def test_subscript_negative_index(self): hash=joblib_hash(TEST_ARRAY[-1]+TEST_ARRAY[-2]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=5) _check_function_execution( actual=Provenance.history[0], @@ -253,12 +258,12 @@ def test_subscript_slice(self): hash=joblib_hash(TEST_ARRAY[0]+TEST_ARRAY[1]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=3) expected_slice_output = DataObject( hash=joblib_hash(TEST_ARRAY[0:2]), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY[0:2]), - details={'shape': (2,), 'dtype': np.int64}) + details={'shape': (2,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -296,12 +301,12 @@ def test_subscript_slice_no_start(self): hash=joblib_hash(TEST_ARRAY[0]+TEST_ARRAY[1]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=3) expected_slice_output = DataObject( 
hash=joblib_hash(TEST_ARRAY[:2]), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY[:2]), - details={'shape': (2,), 'dtype': np.int64}) + details={'shape': (2,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -339,12 +344,12 @@ def test_subscript_slice_no_stop(self): hash=joblib_hash(TEST_ARRAY[1]+TEST_ARRAY[2]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=5) expected_slice_output = DataObject( hash=joblib_hash(TEST_ARRAY[1:]), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY[1:]), - details={'shape': (2,), 'dtype': np.int64}) + details={'shape': (2,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -382,12 +387,12 @@ def test_subscript_slice_step(self): hash=joblib_hash(TEST_ARRAY[0]+TEST_ARRAY[2]), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=4) expected_slice_output = DataObject( hash=joblib_hash(TEST_ARRAY[::2]), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY[::2]), - details={'shape': (2,), 'dtype': np.int64}) + details={'shape': (2,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -425,7 +430,7 @@ def test_subscript_index_str(self): hash=joblib_hash(np.sum(TEST_ARRAY)), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=6) _check_function_execution( actual=Provenance.history[0], @@ -464,7 +469,7 @@ def test_subscript_index_from_variable(self): hash=joblib_hash(np.sum(TEST_ARRAY)), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=6) _check_function_execution( 
actual=Provenance.history[0], @@ -502,12 +507,13 @@ def test_attribute(self): hash=joblib_hash(np.sum(TEST_ARRAY)), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=6) expected_container_info = DataObject( hash=joblib_hash(container_of_array), hash_method="joblib_SHA1", type="test_code_analysis.ContainerOfArray", - id=id(container_of_array), details={'array': TEST_ARRAY}) + id=id(container_of_array), details={'array': TEST_ARRAY}, + value=None) _check_function_execution( actual=Provenance.history[0], diff --git a/alpaca/test/test_decorator.py b/alpaca/test/test_decorator.py index de1769e..1445931 100644 --- a/alpaca/test/test_decorator.py +++ b/alpaca/test/test_decorator.py @@ -24,14 +24,16 @@ TEST_ARRAY_INFO = DataObject(hash=joblib.hash(TEST_ARRAY, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) TEST_ARRAY_2 = np.array([4, 5, 6]) TEST_ARRAY_2_INFO = DataObject(hash=joblib.hash(TEST_ARRAY_2, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(TEST_ARRAY_2), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) CONTAINER = [TEST_ARRAY, TEST_ARRAY_2] @@ -327,7 +329,7 @@ def test_simple_function(self): hash=joblib.hash(TEST_ARRAY+3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -359,7 +361,7 @@ def test_simple_function_no_target(self): hash=joblib.hash(TEST_ARRAY+3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=output_id, - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) 
_check_function_execution( actual=Provenance.history[0], @@ -386,7 +388,7 @@ def test_kwargs_params(self): hash=joblib.hash(TEST_ARRAY+3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -413,7 +415,7 @@ def test_kwargs_params_default(self): hash=joblib.hash(TEST_ARRAY+5, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -440,7 +442,7 @@ def test_kwargs_params_default_override(self): hash=joblib.hash(TEST_ARRAY+5, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -468,7 +470,7 @@ def test_container_input_function(self): hash=joblib.hash(np.float64(3.5), hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.float64", id=id(avg), - details={'shape': (), 'dtype': np.float64}) + details={'shape': (), 'dtype': np.float64}, value=3.5) _check_function_execution( actual=Provenance.history[0], @@ -497,7 +499,7 @@ def test_varargs_input_function(self): hash=joblib.hash(np.float64(3.5), hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.float64", id=id(avg), - details={'shape': (), 'dtype': np.float64}) + details={'shape': (), 'dtype': np.float64}, value=3.5) _check_function_execution( actual=Provenance.history[0], @@ -525,7 +527,7 @@ def test_multiple_inputs_function(self): hash=joblib.hash(TEST_ARRAY+TEST_ARRAY_2, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, 
value=None) _check_function_execution( actual=Provenance.history[0], @@ -553,13 +555,13 @@ def test_multiple_outputs_function_elements(self): hash=joblib.hash(TEST_ARRAY+3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res1), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) expected_output_2 = DataObject( hash=joblib.hash(TEST_ARRAY+4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res2), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -586,7 +588,7 @@ def test_multiple_outputs_function_tuple(self): hash=joblib.hash((TEST_ARRAY+3, TEST_ARRAY+4), hash_name='sha1'), hash_method="joblib_SHA1", type="builtins.tuple", id=id(res), - details={}) + details={}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -613,13 +615,13 @@ def test_container_output_function(self): hash=joblib.hash(TEST_ARRAY + 3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) expected_output_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], @@ -1057,7 +1059,7 @@ def test_file_input(self): expected_output = DataObject( hash=joblib.hash(expected_list, hash_name='sha1'), hash_method="joblib_SHA1", - type="builtins.list", id=id(res), details={}) + type="builtins.list", id=id(res), details={}, value=None) expected_file = File("96ccc1380e069667069acecea3e2ab559441657807e0a86d14f49028710ddb3a", hash_type="sha256", path=file_name) @@ -1089,14 +1091,14 @@ def test_file_output(self): 
expected_input = DataObject( hash=joblib.hash(input_list, hash_name='sha1'), hash_method="joblib_SHA1", - type="builtins.list", id=id(input_list), details={}) + type="builtins.list", id=id(input_list), details={}, value=None) # As None has its own UUID, let's get what was generated self.assertEqual(len(Provenance.history), 1) output_uuid = Provenance.history[0].output[0].hash expected_none_output = DataObject(hash=output_uuid, hash_method="UUID", - type="builtins.NoneType", id=id(res), details={}) + type="builtins.NoneType", id=id(res), details={}, value=None) expected_file = File("00d20b4831b0dadded2c633bdfc3dde3926fc17baaed51dacdab3e52a3b0d419", hash_type="sha256", path=Path(file_name)) @@ -1224,13 +1226,13 @@ def test_object_method(self): hash_method="joblib_SHA1", type="test_decorator.ObjectWithMethod", id=id(obj), - details={'coefficient': 2}) + details={'coefficient': 2}, value=None) expected_output = DataObject( hash=joblib.hash(TEST_ARRAY+2, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) _check_function_execution( actual=Provenance.history[0], diff --git a/alpaca/test/test_serialization.py b/alpaca/test/test_serialization.py index 79fc12f..fd9ac15 100644 --- a/alpaca/test/test_serialization.py +++ b/alpaca/test/test_serialization.py @@ -23,14 +23,15 @@ TEST_FUNCTION = FunctionInfo("test_function", "test", "0.0.1") # Object without metadata -INPUT = DataObject("12345", "joblib_SHA1", "test.InputObject", 12345, {}) +INPUT = DataObject("12345", "joblib_SHA1", "test.InputObject", 12345, {}, None) # Object with all main types of metadata INPUT_METADATA = DataObject("12345", "joblib_SHA1", "test.InputObject", 12345, details={'metadata_1': "value1", 'metadata_2': 5, 'metadata_3': 5.0, - 'metadata_4': True}) + 'metadata_4': True}, + value=None) OUTPUT_METADATA_NEO = DataObject("54321", "joblib_SHA1", "neo.core.SpikeTrain", 54321, 
@@ -42,7 +43,8 @@ [0, 1, 2, 3]), 'event': np.array( [True, False, False])} - }) + }, + value=None) # Object with special metadata @@ -51,15 +53,20 @@ OUTPUT_FILE = File("98765", "sha256", "/test_file_output") # Simple objects to test multiple inputs/outputs handling -INPUT_2 = DataObject("212345", "joblib_SHA1", "test.InputObject", 212345, {}) -OUTPUT = DataObject("54321", "joblib_SHA1", "test.OutputObject", 54321, {}) -OUTPUT_2 = DataObject("254321", "joblib_SHA1", "test.OutputObject", 254321, {}) +INPUT_2 = DataObject("212345", "joblib_SHA1", "test.InputObject", 212345, {}, + None) +OUTPUT = DataObject("54321", "joblib_SHA1", "test.OutputObject", 54321, {}, + None) +OUTPUT_2 = DataObject("254321", "joblib_SHA1", "test.OutputObject", 254321, {}, + None) # None output -NONE_OUTPUT = DataObject("777777", "UUID", "builtins.NoneType", 777777, {}) +NONE_OUTPUT = DataObject("777777", "UUID", "builtins.NoneType", 777777, {}, + None) # Object collections -COLLECTION = DataObject("888888", "joblib_SHA1", "builtins.list", 888888, {}) +COLLECTION = DataObject("888888", "joblib_SHA1", "builtins.list", 888888, {}, + None) # General information. 
Will be fixed across the tests TIMESTAMP_START = "2022-05-02T12:34:56.123456" @@ -166,7 +173,7 @@ def test_class_method_serialization(self): hash_method="joblib_SHA1", type="test.ObjectWithMethod", id=232323, - details={}) + details={}, value=None) function_execution = FunctionExecution( function=FunctionInfo('ObjectWithMethod.process', @@ -437,16 +444,16 @@ def test_multiple_memberships(self): self.ttl_path = Path(__file__).parent / "res" super_container = DataObject("2333333", "joblib_SHA1", - "test.SuperContainer", 2333333, {}) + "test.SuperContainer", 2333333, {}, None) super_container_list = DataObject("23333332", "joblib_SHA1", - "builtins.list", 23333332, {}) + "builtins.list", 23333332, {}, None) container = DataObject("333333", "joblib_SHA1", "test.Container", 333333, - {}) + {}, None) container_list = DataObject("3333332", "joblib_SHA1", "builtins.list", - 3333332, {}) + 3333332, {}, None) attribute_access_container = FunctionExecution( function=FunctionInfo(name='attribute', module="", version=""), From d1b9ca80b85f13cf773d6381e73a9f5c0deb13d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= Date: Fri, 14 Jul 2023 16:23:52 +0200 Subject: [PATCH 2/9] Added serialization of values, with unit tests --- alpaca/serialization/prov.py | 34 ++++++++++ alpaca/test/res/values.ttl | 104 ++++++++++++++++++++++++++++++ alpaca/test/test_serialization.py | 52 +++++++++++++++ 3 files changed, 190 insertions(+) create mode 100644 alpaca/test/res/values.ttl diff --git a/alpaca/serialization/prov.py b/alpaca/serialization/prov.py index 8aa0036..94d6ea8 100644 --- a/alpaca/serialization/prov.py +++ b/alpaca/serialization/prov.py @@ -10,6 +10,8 @@ """ from itertools import product +import numpy as np +import numbers from rdflib import Graph, URIRef, BNode, Literal from rdflib.namespace import RDF, PROV, XSD @@ -59,6 +61,14 @@ class AlpacaProvDocument(object): control the serialization. 
""" + XSD_TYPES = { + numbers.Integral: XSD.integer, + numbers.Real: XSD.double, + numbers.Complex: XSD.string, + str: XSD.string, + bool: XSD.boolean, + } + def __init__(self): self.graph = Graph() self.graph.namespace_manager.bind('alpaca', ALPACA) @@ -142,6 +152,24 @@ def _add_FunctionExecution(self, script_info, session_id, execution_id, return uri # Entity methods + @classmethod + def _get_entity_value_datatype(cls, info): + value = info.value + if value is None: + return None + + # Check if builtin type or NumPy dtype + value_class = value.__class__ if not isinstance(value, np.number) \ + else value.dtype.type + if value_class in cls.XSD_TYPES: + return cls.XSD_TYPES[value_class] + + for possible_type in (numbers.Integral, numbers.Real, numbers.Complex): + if issubclass(value_class, possible_type): + return cls.XSD_TYPES[possible_type] + + # Type not found + return None def _add_DataObjectEntity(self, info): # Adds a DataObjectEntity from the Alpaca PROV model @@ -152,6 +180,12 @@ def _add_DataObjectEntity(self, info): return uri self.graph.add((uri, RDF.type, ALPACA.DataObjectEntity)) self.graph.add((uri, ALPACA.hashSource, Literal(info.hash_method))) + + value_datatype = self._get_entity_value_datatype(info) + if value_datatype: + self.graph.add((uri, PROV.value, + Literal(info.value, datatype=value_datatype))) + self._add_entity_metadata(uri, info) self._entity_uris.add(uri) return uri diff --git a/alpaca/test/res/values.ttl b/alpaca/test/res/values.ttl new file mode 100644 index 0000000..c065fcc --- /dev/null +++ b/alpaca/test/res/values.ttl @@ -0,0 +1,104 @@ +@prefix alpaca: . +@prefix prov: . +@prefix xsd: . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 1 . 
+ + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 1.1e+00 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value "test"^^xsd:string . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value "(3+5j)"^^xsd:string . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value true . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 1.2e+00 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 1.3e+00 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 2 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value 3 . + + a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value -4 . + + a alpaca:DataObjectEntity ; + prov:wasAttributedTo ; + alpaca:hashSource "joblib_SHA1" . + + + a alpaca:FunctionExecution ; + prov:startedAtTime "2022-05-02T12:34:56.123456"^^xsd:dateTime ; + prov:endedAtTime "2022-05-02T12:35:56.123456"^^xsd:dateTime ; + prov:used ; + prov:wasAssociatedWith ; + alpaca:codeStatement "test_function(input_1, 5)" ; + alpaca:executionOrder 1 ; + alpaca:usedFunction ; + alpaca:hasParameter [ a alpaca:NameValuePair ; + alpaca:pairName "param_1" ; + alpaca:pairValue 5 ] . 
+ + a alpaca:Function ; + alpaca:functionName "test_function" ; + alpaca:implementedIn "test" ; + alpaca:functionVersion "0.0.1" . + + a alpaca:ScriptAgent ; + alpaca:scriptPath "/script.py" . diff --git a/alpaca/test/test_serialization.py b/alpaca/test/test_serialization.py index fd9ac15..f1f08bc 100644 --- a/alpaca/test/test_serialization.py +++ b/alpaca/test/test_serialization.py @@ -90,6 +90,58 @@ def setUpClass(cls): cls.ttl_path = Path(__file__).parent / "res" alpaca_setting('authority', "fz-juelich.de") + def test_value_serialization(self): + # DataObject tuples for each type that should be captured + # They are output of the simulated output + + INT = DataObject("543211", "joblib_SHA1", "builtins.int", 543211, + {}, 1) + FLOAT = DataObject("543212", "joblib_SHA1", "builtins.float", 543212, + {}, 1.1) + STR = DataObject("543213", "joblib_SHA1", "builtins.str", 543213, + {}, "test") + COMPLEX = DataObject("543214", "joblib_SHA1", "builtins.complex", + 543214, {}, 3+5j) + BOOL = DataObject("543215", "joblib_SHA1", "builtins.bool", 543215, + {}, True) + NUMPY_FLOAT32 = DataObject("543216", "joblib_SHA1", "numpy.float32", + 543216, {}, np.float32(1.2)) + NUMPY_FLOAT64 = DataObject("543217", "joblib_SHA1", "numpy.float64", + 543217, {}, np.float64(1.3)) + NUMPY_INT64 = DataObject("543218", "joblib_SHA1", "numpy.int64", + 543218, {}, np.int64(2)) + NUMPY_INT32 = DataObject("543219", "joblib_SHA1", "numpy.int32", + 543219, {}, np.int32(3)) + NUMPY_INT16 = DataObject("5432110", "joblib_SHA1", "numpy.int16", + 5432110, {}, np.int16(-4)) + + function_execution = FunctionExecution( + function=TEST_FUNCTION, + input={'input_1': INPUT}, params={'param_1': 5}, + output={0: OUTPUT, 1: INT, 2: FLOAT, 3: STR, 4: COMPLEX, + 5: BOOL, 6: NUMPY_FLOAT32, 7: NUMPY_FLOAT64, + 8: NUMPY_INT64, 9: NUMPY_INT32, 10: NUMPY_INT16}, + call_ast=None, + arg_map=['input_1', 'param_1'], kwarg_map=[], return_targets=[], + time_stamp_start=TIMESTAMP_START, time_stamp_end=TIMESTAMP_END, + 
execution_id="12345", order=1, + code_statement="test_function(input_1, 5)" + ) + + # Load expected RDF graph + expected_graph_file = self.ttl_path / "values.ttl" + expected_graph = rdflib.Graph() + expected_graph.parse(expected_graph_file, format='turtle') + + # Serialize the history using AlpacaProv document + alpaca_prov = AlpacaProvDocument() + alpaca_prov.add_history(SCRIPT_INFO, SCRIPT_SESSION_ID, + history=[function_execution]) + + # Check if graphs are equal + self.assertTrue(assert_rdf_graphs_equal(alpaca_prov.graph, + expected_graph)) + def test_input_output_serialization(self): function_execution = FunctionExecution( function=TEST_FUNCTION, From c7eeb7d87eb131c7d5b83ae80f7f4b85f44c8039 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= Date: Mon, 17 Jul 2023 16:11:45 +0200 Subject: [PATCH 3/9] Implemented capture of values of user-requested types, using the settings function. They are captured as strings. --- alpaca/data_information.py | 32 ++++++++++--- alpaca/decorator.py | 18 +++++--- alpaca/serialization/prov.py | 6 +++ alpaca/settings.py | 13 +++++- alpaca/test/res/values.ttl | 7 +++ alpaca/test/test_decorator.py | 75 ++++++++++++++++++++++++++++++- alpaca/test/test_serialization.py | 11 ++++- 7 files changed, 147 insertions(+), 15 deletions(-) diff --git a/alpaca/data_information.py b/alpaca/data_information.py index 9cf3bd3..03dc8d3 100644 --- a/alpaca/data_information.py +++ b/alpaca/data_information.py @@ -19,6 +19,7 @@ import joblib import numpy as np +from numbers import Number from dill._dill import save_function from alpaca.alpaca_types import DataObject, File @@ -118,6 +119,11 @@ class _ObjectInformation(object): :func:`hash` function, depending on the `use_builtin_hash` parameter set during initialization. + The values of objects of the builtin types `str`, `bool`, `int`, `complex` + and `float` as well as the NumPy numeric types (e.g., `np.float64`) will + be stored. 
Additional object types to be stored (e.g., `builtins.dict`) + can be defined with the `store_values` parameter. + The method `info` is called to obtain the provenance information associated with the object during tracking, as the `DataObject` named tuple. The relevant metadata attributes are also stored in the tuple. @@ -132,6 +138,13 @@ class _ObjectInformation(object): List of package names whose object hashes will be computed using the Python builtin `hash` function, instead of `joblib.hash` function. Default: None + store_values : list, optional + List of object types whose values will be stored in the provenance + information (e.g., `builtins.dict`). This is in addition to the + builtin types `str`, `bool`, `int`, `complex` and `float` as well as + the NumPy numeric types (e.g., `np.float64`). The values of these are + always stored. + Default: None """ # This is a list of object attributes that provide relevant provenance @@ -140,10 +153,12 @@ class _ObjectInformation(object): 'id', 'nix_name', 'dimensionality', 'pid', 'create_time') - def __init__(self, use_builtin_hash=None): + def __init__(self, use_builtin_hash=None, store_values=None): self._hash_memoizer = dict() self._use_builtin_hash = copy(use_builtin_hash) \ if use_builtin_hash is not None else [] + self._store_values = copy(store_values)\ + if store_values is not None else [] @staticmethod def _get_object_package(obj): @@ -259,9 +274,10 @@ def info(self, obj): * details : dict Extended information (metadata) on the object. * value : object - For builtin objects (`str`, `int`, `float`, `bool`) or - equivalent objects (e.g. `numpy.float64`), the value is - stored. + For builtin objects (`str`, `int`, `float`, `bool`, `complex`) + or equivalent objects (e.g. `numpy.float64`), the value is + stored. Additional object types specified with the + :attr:`store_values` list will also be stored. 
""" type_information = type(obj) obj_type = f"{type_information.__module__}.{type_information.__name__}" @@ -295,8 +311,12 @@ def info(self, obj): obj_id=obj_id, package=package) - obj_value = obj if isinstance(obj, (str, int, bool, complex, float, - np.number)) else None + # Store object value + obj_value = None + if isinstance(obj, (str, bool, Number)): + obj_value = obj + elif obj_type in self._store_values: + obj_value = str(obj) return DataObject(hash=obj_hash, hash_method=hash_method, type=obj_type, id=obj_id, details=details, diff --git a/alpaca/decorator.py b/alpaca/decorator.py index f502176..902b465 100644 --- a/alpaca/decorator.py +++ b/alpaca/decorator.py @@ -365,7 +365,8 @@ def _capture_code_and_function_provenance(self, lineno, function): return source_line, ast_tree, return_targets, function_info def _capture_input_and_parameters_provenance(self, function, args, kwargs, - ast_tree, function_info, time_stamp_start, builtin_object_hash): + ast_tree, function_info, time_stamp_start, builtin_object_hash, + store_values): # 1. Extract the parameters passed to the function and store them in # the `input_data` dictionary. @@ -389,7 +390,8 @@ def _capture_input_and_parameters_provenance(self, function, args, kwargs, # After this step, all hashes and metadata of input parameters/files # are going to be stored in the dictionary `inputs`. 
- data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash) + data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash, + store_values=store_values) # Initialize parameter list with all default arguments that were not # passed to the function @@ -502,11 +504,12 @@ def _capture_container_output(self, function_output, data_info, def _capture_output_provenance(self, function_output, return_targets, input_data, builtin_object_hash, time_stamp_start, execution_id, - constructed_object=None): + store_values, constructed_object=None): # In case in-place operations were performed, lets not use # memoization - data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash) + data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash, + store_values=store_values) # 6. Create hash for the output using `_ObjectInformation` to follow # individual returns. The hashes will be stored in the `outputs` @@ -552,7 +555,8 @@ def wrapped(*args, **kwargs): builtin_object_hash = _ALPACA_SETTINGS[ 'use_builtin_hash_for_module'] - logger.debug(f"Builtin object hash: {builtin_object_hash}") + store_values = _ALPACA_SETTINGS['store_values'] + logging.debug(f"Builtin object hash: {builtin_object_hash}") lineno = None @@ -593,7 +597,8 @@ def wrapped(*args, **kwargs): function=function, args=args, kwargs=kwargs, ast_tree=ast_tree, function_info=function_info, time_stamp_start=time_stamp_start, - builtin_object_hash=builtin_object_hash) + builtin_object_hash=builtin_object_hash, + store_values=store_values) # Call the function function_output = function(*args, **kwargs) @@ -620,6 +625,7 @@ def wrapped(*args, **kwargs): builtin_object_hash=builtin_object_hash, time_stamp_start=time_stamp_start, execution_id=execution_id, + store_values=store_values, constructed_object=constructed_object) # Get the end time stamp diff --git a/alpaca/serialization/prov.py b/alpaca/serialization/prov.py index 94d6ea8..7b4f188 100644 --- a/alpaca/serialization/prov.py +++ 
b/alpaca/serialization/prov.py @@ -164,6 +164,12 @@ def _get_entity_value_datatype(cls, info): if value_class in cls.XSD_TYPES: return cls.XSD_TYPES[value_class] + # Check if object is included in the `store_values` setting. + # In this case, they are always stored as strings + obj_type = info.type + if obj_type in _ALPACA_SETTINGS['store_values']: + return XSD.string + for possible_type in (numbers.Integral, numbers.Real, numbers.Complex): if issubclass(value_class, possible_type): return cls.XSD_TYPES[possible_type] diff --git a/alpaca/settings.py b/alpaca/settings.py index 6860755..0545989 100644 --- a/alpaca/settings.py +++ b/alpaca/settings.py @@ -51,6 +51,16 @@ Default: "my-authority" +* **store_values**: list of str + The values of the objects from the types in the list will be stored + together with the provenance information. Note that objects of the + builtin types `str`, `bool`, `int`, `float` and `complex`, as well as + the NumPy numeric types (e.g. `numpy.float64`) are stored by default. + This option should be used to store values of more complex types, such + as dictionaries. In this case, the list in this setting should have + the `builtins.dict` entry. The strings are the full path to the Python + object, i.e., `[module].[...].[object_class]`. + To set/read a setting, use the function :func:`alpaca_setting`. @@ -61,7 +71,8 @@ # Should be modified only through the `alpaca_setting` function. _ALPACA_SETTINGS = {'use_builtin_hash_for_module': [], - 'authority': "my-authority"} + 'authority': "my-authority", + 'store_values': []} def alpaca_setting(name, value=None): diff --git a/alpaca/test/res/values.ttl b/alpaca/test/res/values.ttl index c065fcc..4483803 100644 --- a/alpaca/test/res/values.ttl +++ b/alpaca/test/res/values.ttl @@ -78,6 +78,13 @@ alpaca:hashSource "joblib_SHA1" ; prov:value -4 . 
+ a alpaca:DataObjectEntity ; + prov:wasDerivedFrom ; + prov:wasAttributedTo ; + prov:wasGeneratedBy ; + alpaca:hashSource "joblib_SHA1" ; + prov:value "{'id': [1, 2, 3], 'value': {4, 5, 6}}"^^xsd:string . + a alpaca:DataObjectEntity ; prov:wasAttributedTo ; alpaca:hashSource "joblib_SHA1" . diff --git a/alpaca/test/test_decorator.py b/alpaca/test/test_decorator.py index 1445931..c9a3cef 100644 --- a/alpaca/test/test_decorator.py +++ b/alpaca/test/test_decorator.py @@ -15,7 +15,7 @@ import neo from alpaca import (Provenance, activate, deactivate, save_provenance, - print_history) + print_history, alpaca_setting) from alpaca.alpaca_types import (FunctionInfo, Container, DataObject, File) # Define some data and expected values test tracking @@ -1303,5 +1303,78 @@ def test_class_constructor_container_output(self): test_case=self) +@Provenance(inputs=['source']) +def use_dict(source): + return 3 + +class ProvenanceDecoratorStoreValuesTestCase(unittest.TestCase): + + def setUp(self): + alpaca_setting('store_values', []) + + def test_capture_dict(self): + # This should have values for both the input dictionary and the + # integer return + alpaca_setting('store_values', ['builtins.dict']) + activate(clear=True) + test_dict = dict(id=[1, 2, 3], value={4, 5, 6}) + res = use_dict(test_dict) + deactivate() + + dict_info = DataObject(hash=joblib.hash(test_dict, hash_name='sha1'), + hash_method="joblib_SHA1", + type="builtins.dict", id=id(test_dict), + details={}, + value="{'id': [1, 2, 3], 'value': {4, 5, 6}}") + + expected_output = DataObject(hash=joblib.hash(3, hash_name='sha1'), + hash_method="joblib_SHA1", + type="builtins.int", id=id(res), + details={}, value=3) + + _check_function_execution( + actual=Provenance.history[0], + exp_function=FunctionInfo('use_dict', 'test_decorator', ''), + exp_input={'source': dict_info}, + exp_params={}, + exp_output={0: expected_output}, + exp_arg_map=['source'], + exp_kwarg_map=[], + exp_code_stmnt="res = use_dict(test_dict)", + 
exp_return_targets=['res'], + exp_order=1, + test_case=self) + + def test_capture_builtins_only(self): + # This should have values only for the integer return + activate(clear=True) + test_dict = dict(id=[1, 2, 3], value={4, 5, 6}) + res = use_dict(test_dict) + deactivate() + + dict_info = DataObject(hash=joblib.hash(test_dict, hash_name='sha1'), + hash_method="joblib_SHA1", + type="builtins.dict", id=id(test_dict), + details={}, value=None) + + expected_output = DataObject(hash=joblib.hash(3, hash_name='sha1'), + hash_method="joblib_SHA1", + type="builtins.int", id=id(res), + details={}, value=3) + + _check_function_execution( + actual=Provenance.history[0], + exp_function=FunctionInfo('use_dict', 'test_decorator', ''), + exp_input={'source': dict_info}, + exp_params={}, + exp_output={0: expected_output}, + exp_arg_map=['source'], + exp_kwarg_map=[], + exp_code_stmnt="res = use_dict(test_dict)", + exp_return_targets=['res'], + exp_order=1, + test_case=self) + + if __name__ == "__main__": unittest.main() diff --git a/alpaca/test/test_serialization.py b/alpaca/test/test_serialization.py index f1f08bc..96304b5 100644 --- a/alpaca/test/test_serialization.py +++ b/alpaca/test/test_serialization.py @@ -90,9 +90,13 @@ def setUpClass(cls): cls.ttl_path = Path(__file__).parent / "res" alpaca_setting('authority', "fz-juelich.de") + def setUp(self): + alpaca_setting('store_values', []) + def test_value_serialization(self): # DataObject tuples for each type that should be captured # They are output of the simulated output + alpaca_setting('store_values', ['builtins.dict']) INT = DataObject("543211", "joblib_SHA1", "builtins.int", 543211, {}, 1) @@ -115,12 +119,17 @@ def test_value_serialization(self): NUMPY_INT16 = DataObject("5432110", "joblib_SHA1", "numpy.int16", 5432110, {}, np.int16(-4)) + DICT = DataObject("5432111", "joblib_SHA1", "builtins.dict", + 5432111, {}, + str(dict(id=[1, 2, 3], value={4, 5, 6}))) + function_execution = FunctionExecution( 
function=TEST_FUNCTION, input={'input_1': INPUT}, params={'param_1': 5}, output={0: OUTPUT, 1: INT, 2: FLOAT, 3: STR, 4: COMPLEX, 5: BOOL, 6: NUMPY_FLOAT32, 7: NUMPY_FLOAT64, - 8: NUMPY_INT64, 9: NUMPY_INT32, 10: NUMPY_INT16}, + 8: NUMPY_INT64, 9: NUMPY_INT32, 10: NUMPY_INT16, + 11: DICT}, call_ast=None, arg_map=['input_1', 'param_1'], kwarg_map=[], return_targets=[], time_stamp_start=TIMESTAMP_START, time_stamp_end=TIMESTAMP_END, From 16f89beed2be69155feecff36a4a4845b38ef466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= Date: Mon, 17 Jul 2023 16:20:31 +0200 Subject: [PATCH 4/9] Unit tests for the capture of values of user-requested types. --- alpaca/test/test_data_information.py | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/alpaca/test/test_data_information.py b/alpaca/test/test_data_information.py index 3d6e738..6b47032 100644 --- a/alpaca/test/test_data_information.py +++ b/alpaca/test/test_data_information.py @@ -106,6 +106,35 @@ def test_none(self): self.assertEqual(info.hash_method, "UUID") self.assertDictEqual(info.details, {}) + def test_store_value_requested(self): + object_info = _ObjectInformation(store_values=['builtins.dict']) + test_dict = dict(key=['3', '4']) + info = object_info.info(test_dict) + self.assertEqual(info.hash, joblib.hash(test_dict, hash_name='sha1')) + self.assertEqual(info.type, "builtins.dict") + self.assertEqual(info.hash_method, "joblib_SHA1") + self.assertDictEqual(info.details, {}) + self.assertEqual(info.value, "{'key': ['3', '4']}") + + def test_store_value_not_requested(self): + object_info = _ObjectInformation() + test_dict = dict(key=['3', '4']) + info = object_info.info(test_dict) + self.assertEqual(info.hash, joblib.hash(test_dict, hash_name='sha1')) + self.assertEqual(info.type, "builtins.dict") + self.assertEqual(info.hash_method, "joblib_SHA1") + self.assertDictEqual(info.details, {}) + self.assertEqual(info.value, None) + + def test_store_value_builtins(self): + 
object_info = _ObjectInformation() + info = object_info.info(5) + self.assertEqual(info.hash, joblib.hash(5, hash_name='sha1')) + self.assertEqual(info.type, "builtins.int") + self.assertEqual(info.hash_method, "joblib_SHA1") + self.assertDictEqual(info.details, {}) + self.assertEqual(info.value, 5) + def test_custom_class(self): custom_object_1 = ObjectClass(param=4) custom_object_2 = ObjectClass(param=3) From da0f04199d7619e154c625264c35c6301d5452a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= Date: Thu, 10 Aug 2023 09:44:28 +0200 Subject: [PATCH 5/9] Updated unit tests --- alpaca/test/test_code_analysis.py | 17 ++++--- alpaca/test/test_decorator.py | 75 ++++++++++++++++++------------- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/alpaca/test/test_code_analysis.py b/alpaca/test/test_code_analysis.py index b0ac144..47c07a1 100644 --- a/alpaca/test/test_code_analysis.py +++ b/alpaca/test/test_code_analysis.py @@ -552,19 +552,22 @@ def test_attribute_method_call(self): hash=joblib_hash(np.sum(TEST_ARRAY)), hash_method="joblib_SHA1", type="numpy.int64", id=id(res), - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=6) object_info = DataObject( hash=joblib_hash(object_with_method), hash_method="joblib_SHA1", type="test_code_analysis.ObjectWithMethod", id=id(object_with_method), - details={}) + details={}, + value=None) expected_container_info = DataObject( hash=joblib_hash(container_of_array), hash_method="joblib_SHA1", type="test_code_analysis.ContainerOfArray", - id=id(container_of_array), details={'array': TEST_ARRAY}) + id=id(container_of_array), details={'array': TEST_ARRAY}, + value=None) _check_function_execution( actual=Provenance.history[0], @@ -604,17 +607,19 @@ def test_subscript_initializer(self): hash=joblib_hash(custom_object), hash_method="joblib_SHA1", type="test_code_analysis.CustomObject", id=id(custom_object), - details={'data': list_1}) + details={'data': list_1}, 
value=None) source_list_info = DataObject( hash=joblib_hash(source_data), hash_method="joblib_SHA1", - type="builtins.list", id=id(source_data), details={}) + type="builtins.list", id=id(source_data), details={}, + value=None) element_info = DataObject( hash=joblib_hash(list_1), hash_method="joblib_SHA1", - type="builtins.list", id=id(list_1), details={}) + type="builtins.list", id=id(list_1), details={}, + value=None) _check_function_execution( actual=Provenance.history[0], diff --git a/alpaca/test/test_decorator.py b/alpaca/test/test_decorator.py index c9a3cef..d1e69c9 100644 --- a/alpaca/test/test_decorator.py +++ b/alpaca/test/test_decorator.py @@ -646,29 +646,30 @@ def test_container_output_function_level(self): elements = [[], []] for idx, container in enumerate(res): - for el_idx, element in enumerate(container): + for element in container: element_info = DataObject( hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=element) elements[idx].append(element_info) expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.list", id=id(res), details={}) + type="builtins.list", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) # Check subscript of each element with respect to the array containers = [expected_container_1, expected_container_2] @@ -741,19 +742,21 @@ def 
test_dict_output_function(self): expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.dict", id=id(res), details={}) + type="builtins.dict", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res['key.0']), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res['key.1']), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) _check_function_execution( actual=Provenance.history[0], @@ -805,29 +808,32 @@ def test_dict_output_function_level(self): elements = {'key.0': [], 'key.1': []} for key, container in res.items(): - for el_idx, element in enumerate(container): + for element in container: element_info = DataObject( hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=element) elements[key].append(element_info) expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.dict", id=id(res), details={}) + type="builtins.dict", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 3, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res['key.0']), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res['key.1']), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': 
(3,), 'dtype': np.int64}, + value=None) # Check subscript of each element with respect to the array containers = { @@ -903,18 +909,19 @@ def test_non_iterable_container_output(self): self.assertEqual(len(Provenance.history), 4) elements = [] - for el_idx, element in enumerate(res): - element_info = DataObject( - hash=joblib.hash(element, hash_name="sha1"), - hash_method="joblib_SHA1", - type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) - elements.append(element_info) + for element in res: + element_info = DataObject( + hash=joblib.hash(element, hash_name="sha1"), + hash_method="joblib_SHA1", + type="numpy.int64", id=None, + details={'shape': (), 'dtype': np.int64}, + value=element) + elements.append(element_info) expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", type="test_decorator.NonIterableContainer", id=id(res), - details={'data': res.data}) + details={'data': res.data}, value=None) # Check subscript of each element with respect to the container for history_index in (0, 1, 2): @@ -947,7 +954,6 @@ def test_non_iterable_container_output(self): exp_order=1, test_case=self) - def test_comprehensions(self): activate(clear=True) num_list = [comprehension_function(i) for i in range(3)] @@ -966,7 +972,8 @@ def test_comprehensions(self): hash=joblib.hash(element, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.float64", id=id(element), - details={'shape': (), 'dtype': np.float64}) + details={'shape': (), 'dtype': np.float64}, + value=element) _check_function_execution( actual=Provenance.history[history], @@ -988,7 +995,8 @@ def test_comprehensions(self): hash=joblib.hash(element, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.float64", id=id(element), - details={'shape': (), 'dtype': np.float64}) + details={'shape': (), 'dtype': np.float64}, + value=element) _check_function_execution( actual=Provenance.history[history], @@ -1010,7 +1018,8 @@ def test_comprehensions(self): 
hash=joblib.hash(element, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.float64", id=id(element), - details={'shape': (), 'dtype': np.float64}) + details={'shape': (), 'dtype': np.float64}, + value=element) _check_function_execution( actual=Provenance.history[history], @@ -1157,7 +1166,8 @@ def test_method_descriptor(self): 'file_origin': None, 'description': None, 'segment': None, 'units': pq.mV.units, 'shape': (3, 1), 'dtype': np.int64, 't_start': 0 * pq.s, 't_stop': 3 * pq.s, - 'dimensionality': pq.mV.dimensionality}) + 'dimensionality': pq.mV.dimensionality}, + value=None) expected_output = DataObject( hash=joblib.hash(reshaped, hash_name='sha1'), @@ -1169,7 +1179,8 @@ def test_method_descriptor(self): 'file_origin': None, 'description': None, 'segment': None, 'units': pq.mV.units, 'shape': (1, 3), 'dtype': np.int64, 't_start': 0 * pq.s, 't_stop': 1 * pq.s, - 'dimensionality': pq.mV.dimensionality}) + 'dimensionality': pq.mV.dimensionality}, + value=None) _check_function_execution( actual=Provenance.history[0], @@ -1197,7 +1208,8 @@ def test_class_constructor(self): hash_method="joblib_SHA1", type="test_decorator.ObjectWithMethod", id=id(obj), - details={'coefficient': 2}) + details={'coefficient': 2}, + value=None) _check_function_execution( actual=Provenance.history[0], @@ -1261,7 +1273,8 @@ def test_class_constructor_container_output(self): hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=element) elements.append(element_info) expected_output = DataObject( @@ -1269,7 +1282,8 @@ def test_class_constructor_container_output(self): hash_method="joblib_SHA1", type="test_decorator.NonIterableContainerOutputObject", id=id(obj), - details={'_data': obj._data}) + details={'_data': obj._data}, + value=None) # Check subscript of each element with respect to the container for history_index in (0, 1, 2): @@ 
-1307,6 +1321,7 @@ def test_class_constructor_container_output(self): def use_dict(source): return 3 + class ProvenanceDecoratorStoreValuesTestCase(unittest.TestCase): def setUp(self): From f19b3da1a512a155a4331f7f7495eabcd5c30819 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= Date: Tue, 21 Nov 2023 15:09:44 +0100 Subject: [PATCH 6/9] Added option to display stored value for the node in the visualization graph --- alpaca/graph.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/alpaca/graph.py b/alpaca/graph.py index 5983273..198e366 100644 --- a/alpaca/graph.py +++ b/alpaca/graph.py @@ -98,7 +98,7 @@ def _get_name_value_pair(graph, bnode): def _get_entity_data(graph, entity, annotations=None, attributes=None, - strip_namespace=True): + strip_namespace=True, value_attribute=None): filter_map = defaultdict(list) filter_map.update( @@ -118,6 +118,12 @@ def _get_entity_data(graph, entity, annotations=None, attributes=None, _add_attribute(data, attr_name, attr_type, attr_value, strip_namespace) + # Get the stored value if requested and present + if value_attribute: + value = graph.value(entity, PROV.value) + if value: + data[value_attribute] = value.toPython() + if data['type'] == NSS_FILE: file_path = str(list(graph.objects(entity, ALPACA.filePath))[0]) data["File_path"] = file_path @@ -223,6 +229,15 @@ class name of the object (e.g., `ndarray`). The `Python_name` node time interval strings in the format supported by the Gephi timeline feature. If False, the attribute is not included. Default: True + value_attribute : str, optional + If provided, an attribute named `value_attribute` will be added to + the node attributes to show the values stored in the provenance + information. Alpaca stores the values of objects of the builtin types + `str`, `bool`, `int`, `float` and `complex`, as well as the NumPy + numeric types (e.g. `numpy.float64`) by default. 
The values of + additional types can be defined using the + :func:`alpaca.settings.alpaca_setting` function. + Default: None Attributes ---------- @@ -235,7 +250,7 @@ class name of the object (e.g., `ndarray`). The `Python_name` node def __init__(self, *prov_file, annotations=None, attributes=None, strip_namespace=True, remove_none=True, use_name_in_parameter=True, use_class_in_method_name=True, - time_intervals=True): + time_intervals=True, value_attribute=None): # Load PROV records from the file(s) doc = AlpacaProvDocument() @@ -250,7 +265,7 @@ def __init__(self, *prov_file, annotations=None, attributes=None, strip_namespace=strip_namespace, remove_none=remove_none, use_name_in_parameter=use_name_in_parameter, use_class_in_method_name=use_class_in_method_name, - time_intervals=time_intervals + time_intervals=time_intervals, value_attribute=value_attribute ) if time_intervals: @@ -319,7 +334,7 @@ def _transform_graph(graph, annotations=None, attributes=None, strip_namespace=True, remove_none=True, use_name_in_parameter=True, use_class_in_method_name=True, - time_intervals=True): + time_intervals=True, value_attribute=None): # Transform an RDFlib graph obtained from the PROV data, so that the # visualization is simplified. A new `nx.DiGraph` object is created # and returned. Annotations and attributes of the entities stored in @@ -341,7 +356,8 @@ def _transform_graph(graph, annotations=None, attributes=None, data = _get_entity_data(graph, entity, annotations=annotations, attributes=attributes, - strip_namespace=strip_namespace) + strip_namespace=strip_namespace, + value_attribute=value_attribute) transformed.add_node(node_id, **data) # Add all the edges. 
From 22141fb79a06904edc70e7a43d8af50173558f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= Date: Tue, 21 Nov 2023 15:10:10 +0100 Subject: [PATCH 7/9] Unit tests for visualization graph --- alpaca/test/test_graph.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/alpaca/test/test_graph.py b/alpaca/test/test_graph.py index 46bb78b..f07314e 100644 --- a/alpaca/test/test_graph.py +++ b/alpaca/test/test_graph.py @@ -440,6 +440,32 @@ def test_remove_multiple_attributes_aggregation(self): self.assertTrue("Time Interval" not in node_attrs) self.assertTrue("sua" not in node_attrs) + def test_value_attribute(self): + input_file = self.ttl_path / "values.ttl" + graph = ProvenanceGraph(input_file, attributes='all', + annotations='all', value_attribute='value') + + node_values_by_id = { + "urn:fz-juelich.de:alpaca:object:Python:builtins.int:543211": 1, + "urn:fz-juelich.de:alpaca:object:Python:builtins.float:543212": 1.1, + "urn:fz-juelich.de:alpaca:object:Python:builtins.str:543213": "test", + "urn:fz-juelich.de:alpaca:object:Python:builtins.complex:543214": "(3+5j)", + "urn:fz-juelich.de:alpaca:object:Python:builtins.bool:543215": True, + "urn:fz-juelich.de:alpaca:object:Python:numpy.float32:543216": 1.2, + "urn:fz-juelich.de:alpaca:object:Python:numpy.float64:543217": 1.3, + "urn:fz-juelich.de:alpaca:object:Python:numpy.int64:543218": 2, + "urn:fz-juelich.de:alpaca:object:Python:numpy.int32:543219": 3, + "urn:fz-juelich.de:alpaca:object:Python:numpy.int16:5432110": -4, + "urn:fz-juelich.de:alpaca:object:Python:builtins.dict:5432111": "{'id': [1, 2, 3], 'value': {4, 5, 6}}", + "urn:fz-juelich.de:alpaca:object:Python:test.InputObject:12345": None, + "urn:fz-juelich.de:alpaca:object:Python:test.OutputObject:54321": None, + } + + for node, node_attrs in graph.graph.nodes(data=True): + if node_attrs['type'] == 'object': + expected_value = node_values_by_id[node] + self.assertEqual(expected_value, node_attrs.get('value', None)) + 
class GraphTimeIntervalTestCase(unittest.TestCase): From 8c66a5d2b6a01bdefe0893841a04d589bf6b8755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= Date: Tue, 21 Nov 2023 15:10:41 +0100 Subject: [PATCH 8/9] Added missing value entries to new unit tests --- alpaca/test/test_decorator.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/alpaca/test/test_decorator.py b/alpaca/test/test_decorator.py index 20253cf..d41ab49 100644 --- a/alpaca/test/test_decorator.py +++ b/alpaca/test/test_decorator.py @@ -707,19 +707,19 @@ def test_container_output_function_level_0(self): expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.list", id=id(res), details={}) + type="builtins.list", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 7, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 8, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) # Check the subscript of each array with respect to the list returned _check_function_execution( @@ -869,19 +869,19 @@ def test_container_output_function_level_range_0_0(self): expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.list", id=id(res), details={}) + type="builtins.list", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 1, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) 
expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 2, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) # Check the subscript of each array with respect to the list returned _check_function_execution( @@ -940,24 +940,24 @@ def test_container_output_function_level_range_0_1(self): hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, value=None) elements[idx].append(element_info) expected_output = DataObject( hash=joblib.hash(res, hash_name="sha1"), hash_method="joblib_SHA1", - type="builtins.list", id=id(res), details={}) + type="builtins.list", id=id(res), details={}, value=None) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 5, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 6, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, value=None) # Check subscript of each element with respect to the array containers = [expected_container_1, expected_container_2] @@ -1035,20 +1035,23 @@ def test_container_output_function_level_range_1_1(self): hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}) + details={'shape': (), 'dtype': np.int64}, + value=None) elements[idx].append(element_info) expected_container_1 = DataObject( hash=joblib.hash(TEST_ARRAY + 4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[0]), - details={'shape': (3,), 
'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) expected_container_2 = DataObject( hash=joblib.hash(TEST_ARRAY + 5, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res[1]), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) # Check subscript of each element with respect to the array containers = [expected_container_1, expected_container_2] @@ -1521,13 +1524,15 @@ def test_static_method(self): hash_method="joblib_SHA1", type="test_decorator.ObjectWithMethod", id=id(obj), - details={'coefficient': 2}) + details={'coefficient': 2}, + value=None) expected_output = DataObject( hash=joblib.hash(TEST_ARRAY+4, hash_name='sha1'), hash_method="joblib_SHA1", type="numpy.ndarray", id=id(res), - details={'shape': (3,), 'dtype': np.int64}) + details={'shape': (3,), 'dtype': np.int64}, + value=None) _check_function_execution( actual=Provenance.history[0], From afde4e197df307dae77cc92707ac36a7dedcd5a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= Date: Tue, 21 Nov 2023 15:16:09 +0100 Subject: [PATCH 9/9] Added missing value entries to new unit tests --- alpaca/test/test_decorator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/alpaca/test/test_decorator.py b/alpaca/test/test_decorator.py index d41ab49..2ea6dd3 100644 --- a/alpaca/test/test_decorator.py +++ b/alpaca/test/test_decorator.py @@ -940,7 +940,7 @@ def test_container_output_function_level_range_0_1(self): hash=joblib.hash(element, hash_name="sha1"), hash_method="joblib_SHA1", type="numpy.int64", id=None, - details={'shape': (), 'dtype': np.int64}, value=None) + details={'shape': (), 'dtype': np.int64}, value=element) elements[idx].append(element_info) expected_output = DataObject( @@ -1036,7 +1036,7 @@ def test_container_output_function_level_range_1_1(self): hash_method="joblib_SHA1", type="numpy.int64", id=None, details={'shape': (), 'dtype': np.int64}, - 
value=None) + value=element) elements[idx].append(element_info) expected_container_1 = DataObject(