INM-6 · kohlerca · Dec 6, 2023 · Jul 14, 2023 · Jul 14, 2023 · Jul 17, 2023
diff --git a/alpaca/alpaca_types.py b/alpaca/alpaca_types.py
@@ -61,6 +61,6 @@
 # the disk.
 
 DataObject = namedtuple('DataObject', ('hash', 'hash_method', 'type', 'id',
-                                       'details'))
+                                       'details', 'value'))
 
 File = namedtuple('File', ('hash', 'hash_type', 'path'))
diff --git a/alpaca/data_information.py b/alpaca/data_information.py
@@ -19,6 +19,7 @@
 
 import joblib
 import numpy as np
+from numbers import Number
 from dill._dill import save_function
 
 from alpaca.alpaca_types import DataObject, File
@@ -118,6 +119,11 @@ class _ObjectInformation(object):
     :func:`hash` function, depending on the `use_builtin_hash` parameter set
     during initialization.
 
+    The values of objects of the builtin types `str`, `bool`, `int`, `complex`
+    and `float` as well as the NumPy numeric types (e.g., `np.float64`) will
+    be stored. Additional object types to be stored (e.g., `builtins.dict`)
+    can be defined with the `store_values` parameter.
+
     The method `info` is called to obtain the provenance information
     associated with the object during tracking, as the `DataObject` named
     tuple. The relevant metadata attributes are also stored in the tuple.
@@ -132,6 +138,13 @@ class _ObjectInformation(object):
         List of package names whose object hashes will be computed using the
         Python builtin `hash` function, instead of `joblib.hash` function.
         Default: None
+    store_values : list, optional
+        List of object types whose values will be stored in the provenance
+        information (e.g., `builtins.dict`). This is in addition to the
+        builtin types `str`, `bool`, `int`, `complex` and `float` as well as
+        the NumPy numeric types (e.g., `np.float64`). The values of these are
+        always stored.
+        Default: None
     """
 
     # This is a list of object attributes that provide relevant provenance
@@ -140,10 +153,12 @@ class _ObjectInformation(object):
                             'id', 'nix_name', 'dimensionality', 'pid',
                             'create_time')
 
-    def __init__(self, use_builtin_hash=None):
+    def __init__(self, use_builtin_hash=None, store_values=None):
         self._hash_memoizer = dict()
         self._use_builtin_hash = copy(use_builtin_hash) \
             if use_builtin_hash is not None else []
+        self._store_values = copy(store_values)\
+            if store_values is not None else []
 
     @staticmethod
     def _get_object_package(obj):
@@ -258,6 +273,11 @@ def info(self, obj):
                 Reference for the object.
             * details : dict
                 Extended information (metadata) on the object.
+            * value : object
+                For builtin objects (`str`, `int`, `float`, `bool`, `complex`)
+                or equivalent objects (e.g. `numpy.float64`), the value is
+                stored. For object types listed in :attr:`store_values`,
+                the string representation (`str(obj)`) is stored instead.
         """
         type_information = type(obj)
         obj_type = f"{type_information.__module__}.{type_information.__name__}"
@@ -267,7 +287,8 @@ def info(self, obj):
         if obj is None:
             unique_id = uuid.uuid4()
             return DataObject(hash=unique_id, hash_method="UUID",
-                              type=obj_type, id=obj_id, details={})
+                              type=obj_type, id=obj_id, details={},
+                              value=None)
 
         # Here we can extract specific metadata to record
         details = {}
@@ -290,5 +311,13 @@ def info(self, obj):
                                                       obj_id=obj_id,
                                                       package=package)
 
+        # Store object value
+        obj_value = None
+        if isinstance(obj, (str, bool, Number)):
+            obj_value = obj
+        elif obj_type in self._store_values:
+            obj_value = str(obj)
+
         return DataObject(hash=obj_hash, hash_method=hash_method,
-                          type=obj_type, id=obj_id, details=details)
+                          type=obj_type, id=obj_id, details=details,
+                          value=obj_value)
diff --git a/alpaca/decorator.py b/alpaca/decorator.py
@@ -422,7 +422,8 @@ def _capture_code_and_function_provenance(self, lineno, function):
         return source_line, ast_tree, return_targets, function_info
 
     def _capture_input_and_parameters_provenance(self, function, args, kwargs,
-        ast_tree, function_info, time_stamp_start, builtin_object_hash):
+        ast_tree, function_info, time_stamp_start, builtin_object_hash,
+        store_values):
 
         # 1. Extract the parameters passed to the function and store them in
         # the `input_data` dictionary.
@@ -446,7 +447,8 @@ def _capture_input_and_parameters_provenance(self, function, args, kwargs,
         # After this step, all hashes and metadata of input parameters/files
         # are going to be stored in the dictionary `inputs`.
 
-        data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash)
+        data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash,
+                                       store_values=store_values)
 
         # Initialize parameter list with all default arguments that were not
         # passed to the function
@@ -576,11 +578,12 @@ def _capture_container_output(self, function_output, data_info,
     def _capture_output_provenance(self, function_output, return_targets,
                                    input_data, builtin_object_hash,
                                    time_stamp_start, execution_id,
-                                   constructed_object=None):
+                                   store_values, constructed_object=None):
 
         # In case in-place operations were performed, lets not use
         # memoization
-        data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash)
+        data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash,
+                                       store_values=store_values)
 
         # 6. Create hash for the output using `_ObjectInformation` to follow
         # individual returns. The hashes will be stored in the `outputs`
@@ -626,7 +629,8 @@ def wrapped(*args, **kwargs):
 
             builtin_object_hash = _ALPACA_SETTINGS[
                 'use_builtin_hash_for_module']
-            logger.debug(f"Builtin object hash: {builtin_object_hash}")
+            store_values = _ALPACA_SETTINGS['store_values']
+            logger.debug(f"Builtin object hash: {builtin_object_hash}")
 
             lineno = None
 
@@ -667,7 +671,8 @@ def wrapped(*args, **kwargs):
                                 function=function, args=args, kwargs=kwargs,
                                 ast_tree=ast_tree, function_info=function_info,
                                 time_stamp_start=time_stamp_start,
-                                builtin_object_hash=builtin_object_hash)
+                                builtin_object_hash=builtin_object_hash,
+                                store_values=store_values)
 
             # Call the function
             function_output = function(*args, **kwargs)
@@ -694,6 +699,7 @@ def wrapped(*args, **kwargs):
                     builtin_object_hash=builtin_object_hash,
                     time_stamp_start=time_stamp_start,
                     execution_id=execution_id,
+                    store_values=store_values,
                     constructed_object=constructed_object)
 
                 # Get the end time stamp

diff --git a/alpaca/graph.py b/alpaca/graph.py
@@ -98,7 +98,7 @@ def _get_name_value_pair(graph, bnode):
 
 
 def _get_entity_data(graph, entity, annotations=None, attributes=None,
-                     strip_namespace=True):
+                     strip_namespace=True, value_attribute=None):
     filter_map = defaultdict(list)
 
     filter_map.update(
@@ -118,6 +118,12 @@ def _get_entity_data(graph, entity, annotations=None, attributes=None,
                     _add_attribute(data, attr_name, attr_type, attr_value,
                                    strip_namespace)
 
+    # Get the stored value, if requested and present (may be falsy, e.g. 0)
+    if value_attribute:
+        value = graph.value(entity, PROV.value)
+        if value is not None:
+            data[value_attribute] = value.toPython()
+
     if data['type'] == NSS_FILE:
         file_path = str(list(graph.objects(entity, ALPACA.filePath))[0])
         data["File_path"] = file_path
@@ -223,6 +229,15 @@ class name of the object (e.g., `ndarray`). The `Python_name` node
         time interval strings in the format supported by the Gephi timeline
         feature. If False, the attribute is not included.
         Default: True
+    value_attribute : str, optional
+        If provided, an attribute named `value_attribute` will be added to
+        the node attributes to show the values stored in the provenance
+        information. Alpaca stores the values of objects of the builtin types
+        `str`, `bool`, `int`, `float` and `complex`, as well as the NumPy
+        numeric types (e.g. `numpy.float64`) by default. Additional types
+        whose values are stored can be defined using the
+        :func:`alpaca.settings.alpaca_setting` function.
+        Default: None
 
     Attributes
     ----------
@@ -235,7 +250,7 @@ class name of the object (e.g., `ndarray`). The `Python_name` node
     def __init__(self, *prov_file, annotations=None, attributes=None,
                  strip_namespace=True, remove_none=True,
                  use_name_in_parameter=True, use_class_in_method_name=True,
-                 time_intervals=True):
+                 time_intervals=True, value_attribute=None):
 
         # Load PROV records from the file(s)
         doc = AlpacaProvDocument()
@@ -250,7 +265,7 @@ def __init__(self, *prov_file, annotations=None, attributes=None,
             strip_namespace=strip_namespace, remove_none=remove_none,
             use_name_in_parameter=use_name_in_parameter,
             use_class_in_method_name=use_class_in_method_name,
-            time_intervals=time_intervals
+            time_intervals=time_intervals, value_attribute=value_attribute
         )
 
         if time_intervals:
@@ -319,7 +334,7 @@ def _transform_graph(graph, annotations=None, attributes=None,
                          strip_namespace=True, remove_none=True,
                          use_name_in_parameter=True,
                          use_class_in_method_name=True,
-                         time_intervals=True):
+                         time_intervals=True, value_attribute=None):
         # Transform an RDFlib graph obtained from the PROV data, so that the
         # visualization is simplified. A new `nx.DiGraph` object is created
         # and returned. Annotations and attributes of the entities stored in
@@ -341,7 +356,8 @@ def _transform_graph(graph, annotations=None, attributes=None,
             data = _get_entity_data(graph, entity,
                                     annotations=annotations,
                                     attributes=attributes,
-                                    strip_namespace=strip_namespace)
+                                    strip_namespace=strip_namespace,
+                                    value_attribute=value_attribute)
             transformed.add_node(node_id, **data)
 
         # Add all the edges.

diff --git a/alpaca/serialization/prov.py b/alpaca/serialization/prov.py
@@ -10,6 +10,8 @@
 """
 
 from itertools import product
+import numpy as np
+import numbers
 
 from rdflib import Graph, URIRef, BNode, Literal
 from rdflib.namespace import RDF, PROV, XSD
@@ -59,6 +61,14 @@ class AlpacaProvDocument(object):
     control the serialization.
     """
 
+    XSD_TYPES = {
+        numbers.Integral: XSD.integer,
+        numbers.Real: XSD.double,
+        numbers.Complex: XSD.string,
+        str: XSD.string,
+        bool: XSD.boolean,
+    }
+
     def __init__(self):
         self.graph = Graph()
         self.graph.namespace_manager.bind('alpaca', ALPACA)
@@ -142,6 +152,30 @@ def _add_FunctionExecution(self, script_info, session_id, execution_id,
         return uri
 
     # Entity methods
+    @classmethod
+    def _get_entity_value_datatype(cls, info):
+        # Returns the XSD datatype used to serialize `info.value`, or `None`
+        # if no value was stored or its type is not supported
+        value = info.value
+        if value is None:
+            return None
+
+        # Exact match on builtin type or NumPy dtype. `bool` is resolved
+        # here, before it can match `numbers.Integral` below
+        value_class = value.__class__ if not isinstance(value, np.number) \
+            else value.dtype.type
+        if value_class in cls.XSD_TYPES:
+            return cls.XSD_TYPES[value_class]
+
+        # Types in the `store_values` setting are always stored as strings
+        if info.type in _ALPACA_SETTINGS['store_values']:
+            return XSD.string
+
+        # Numeric types and `str` subclasses (e.g., `numpy.str_`)
+        for base in (numbers.Integral, numbers.Real, numbers.Complex, str):
+            if issubclass(value_class, base):
+                return cls.XSD_TYPES[base]
+        return None
 
     def _add_DataObjectEntity(self, info):
         # Adds a DataObjectEntity from the Alpaca PROV model
@@ -152,6 +186,12 @@ def _add_DataObjectEntity(self, info):
             return uri
         self.graph.add((uri, RDF.type, ALPACA.DataObjectEntity))
         self.graph.add((uri, ALPACA.hashSource, Literal(info.hash_method)))
+
+        value_datatype = self._get_entity_value_datatype(info)
+        if value_datatype:
+            self.graph.add((uri, PROV.value,
+                            Literal(info.value, datatype=value_datatype)))
+
         self._add_entity_metadata(uri, info)
         self._entity_uris.add(uri)
         return uri

diff --git a/alpaca/settings.py b/alpaca/settings.py
@@ -51,6 +51,16 @@
 
         Default: "my-authority"
 
+* **store_values**: list of str
+        The values of objects whose type is in the list will be stored
+        together with the provenance information. Note that objects of the
+        builtin types `str`, `bool`, `int`, `float` and `complex`, as well as
+        the NumPy numeric types (e.g. `numpy.float64`) are stored by default.
+        This option should be used to store values of more complex types, such
+        as dictionaries. In this case, the list in this setting should have
+        the `builtins.dict` entry. The strings are the full path to the Python
+        object, i.e., `[module].[...].[object_class]`.
+
 
 To set/read a setting, use the function :func:`alpaca_setting`.
 
@@ -61,7 +71,8 @@
 # Should be modified only through the `alpaca_setting` function.
 
 _ALPACA_SETTINGS = {'use_builtin_hash_for_module': [],
-                    'authority': "my-authority"}
+                    'authority': "my-authority",
+                    'store_values': []}
 
 
 def alpaca_setting(name, value=None):