Skip to content

Commit

Permalink
Capture and serialization of object values (#27)
Browse files Browse the repository at this point in the history
* Implemented capture of values of strings, booleans, complex and numeric types.
* Added serialization of values, with unit tests
* Implemented capture of values of user-requested types, using the settings function. They are captured as strings.
* Added option to display stored value for the node in the visualization graph

---------

Co-authored-by: Cristiano Köhler <[email protected]>
  • Loading branch information
kohlerca and Cristiano Köhler authored Dec 6, 2023
1 parent 785fa8f commit e228585
Show file tree
Hide file tree
Showing 12 changed files with 563 additions and 120 deletions.
2 changes: 1 addition & 1 deletion alpaca/alpaca_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,6 @@
# the disk.

DataObject = namedtuple('DataObject', ('hash', 'hash_method', 'type', 'id',
'details'))
'details', 'value'))

File = namedtuple('File', ('hash', 'hash_type', 'path'))
35 changes: 32 additions & 3 deletions alpaca/data_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import joblib
import numpy as np
from numbers import Number
from dill._dill import save_function

from alpaca.alpaca_types import DataObject, File
Expand Down Expand Up @@ -118,6 +119,11 @@ class _ObjectInformation(object):
:func:`hash` function, depending on the `use_builtin_hash` parameter set
during initialization.
The values of objects of the builtin types `str`, `bool`, `int`, `complex`
and `float` as well as the NumPy numeric types (e.g., `np.float64`) will
be stored. Additional object types to be stored (e.g., `builtins.dict`)
can be defined with the `store_values` parameter.
The method `info` is called to obtain the provenance information
associated with the object during tracking, as the `DataObject` named
tuple. The relevant metadata attributes are also stored in the tuple.
Expand All @@ -132,6 +138,13 @@ class _ObjectInformation(object):
List of package names whose object hashes will be computed using the
Python builtin `hash` function, instead of `joblib.hash` function.
Default: None
store_values : list, optional
List of object types whose values will be stored in the provenance
information (e.g., `builtins.dict`). This is in addition to the
builtin types `str`, `bool`, `int`, `complex` and `float` as well as
the NumPy numeric types (e.g., `np.float64`). The values of these are
always stored.
Default: None
"""

# This is a list of object attributes that provide relevant provenance
Expand All @@ -140,10 +153,12 @@ class _ObjectInformation(object):
'id', 'nix_name', 'dimensionality', 'pid',
'create_time')

def __init__(self, use_builtin_hash=None):
def __init__(self, use_builtin_hash=None, store_values=None):
self._hash_memoizer = dict()
self._use_builtin_hash = copy(use_builtin_hash) \
if use_builtin_hash is not None else []
self._store_values = copy(store_values)\
if store_values is not None else []

@staticmethod
def _get_object_package(obj):
Expand Down Expand Up @@ -258,6 +273,11 @@ def info(self, obj):
Reference for the object.
* details : dict
Extended information (metadata) on the object.
* value : object
For builtin objects (`str`, `int`, `float`, `bool`, `complex`)
or equivalent objects (e.g. `numpy.float64`), the value is
stored. The values of additional object types specified with the
:attr:`store_values` list will also be stored, as strings.
"""
type_information = type(obj)
obj_type = f"{type_information.__module__}.{type_information.__name__}"
Expand All @@ -267,7 +287,8 @@ def info(self, obj):
if obj is None:
unique_id = uuid.uuid4()
return DataObject(hash=unique_id, hash_method="UUID",
type=obj_type, id=obj_id, details={})
type=obj_type, id=obj_id, details={},
value=None)

# Here we can extract specific metadata to record
details = {}
Expand All @@ -290,5 +311,13 @@ def info(self, obj):
obj_id=obj_id,
package=package)

# Store object value
obj_value = None
if isinstance(obj, (str, bool, Number)):
obj_value = obj
elif obj_type in self._store_values:
obj_value = str(obj)

return DataObject(hash=obj_hash, hash_method=hash_method,
type=obj_type, id=obj_id, details=details)
type=obj_type, id=obj_id, details=details,
value=obj_value)
18 changes: 12 additions & 6 deletions alpaca/decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,8 @@ def _capture_code_and_function_provenance(self, lineno, function):
return source_line, ast_tree, return_targets, function_info

def _capture_input_and_parameters_provenance(self, function, args, kwargs,
ast_tree, function_info, time_stamp_start, builtin_object_hash):
ast_tree, function_info, time_stamp_start, builtin_object_hash,
store_values):

# 1. Extract the parameters passed to the function and store them in
# the `input_data` dictionary.
Expand All @@ -446,7 +447,8 @@ def _capture_input_and_parameters_provenance(self, function, args, kwargs,
# After this step, all hashes and metadata of input parameters/files
# are going to be stored in the dictionary `inputs`.

data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash)
data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash,
store_values=store_values)

# Initialize parameter list with all default arguments that were not
# passed to the function
Expand Down Expand Up @@ -576,11 +578,12 @@ def _capture_container_output(self, function_output, data_info,
def _capture_output_provenance(self, function_output, return_targets,
input_data, builtin_object_hash,
time_stamp_start, execution_id,
constructed_object=None):
store_values, constructed_object=None):

# In case in-place operations were performed, let's not use
# memoization
data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash)
data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash,
store_values=store_values)

# 6. Create hash for the output using `_ObjectInformation` to follow
# individual returns. The hashes will be stored in the `outputs`
Expand Down Expand Up @@ -626,7 +629,8 @@ def wrapped(*args, **kwargs):

builtin_object_hash = _ALPACA_SETTINGS[
'use_builtin_hash_for_module']
logger.debug(f"Builtin object hash: {builtin_object_hash}")
store_values = _ALPACA_SETTINGS['store_values']
logging.debug(f"Builtin object hash: {builtin_object_hash}")

lineno = None

Expand Down Expand Up @@ -667,7 +671,8 @@ def wrapped(*args, **kwargs):
function=function, args=args, kwargs=kwargs,
ast_tree=ast_tree, function_info=function_info,
time_stamp_start=time_stamp_start,
builtin_object_hash=builtin_object_hash)
builtin_object_hash=builtin_object_hash,
store_values=store_values)

# Call the function
function_output = function(*args, **kwargs)
Expand All @@ -694,6 +699,7 @@ def wrapped(*args, **kwargs):
builtin_object_hash=builtin_object_hash,
time_stamp_start=time_stamp_start,
execution_id=execution_id,
store_values=store_values,
constructed_object=constructed_object)

# Get the end time stamp
Expand Down
26 changes: 21 additions & 5 deletions alpaca/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _get_name_value_pair(graph, bnode):


def _get_entity_data(graph, entity, annotations=None, attributes=None,
strip_namespace=True):
strip_namespace=True, value_attribute=None):
filter_map = defaultdict(list)

filter_map.update(
Expand All @@ -118,6 +118,12 @@ def _get_entity_data(graph, entity, annotations=None, attributes=None,
_add_attribute(data, attr_name, attr_type, attr_value,
strip_namespace)

# Get the stored value if requested and present
if value_attribute:
value = graph.value(entity, PROV.value)
if value:
data[value_attribute] = value.toPython()

if data['type'] == NSS_FILE:
file_path = str(list(graph.objects(entity, ALPACA.filePath))[0])
data["File_path"] = file_path
Expand Down Expand Up @@ -223,6 +229,15 @@ class name of the object (e.g., `ndarray`). The `Python_name` node
time interval strings in the format supported by the Gephi timeline
feature. If False, the attribute is not included.
Default: True
value_attribute : str, optional
If provided, an attribute named `value_attribute` will be added to
the node attributes to show the values stored in the provenance
information. Alpaca stores the values of objects of the builtin types
`str`, `bool`, `int`, `float` and `complex`, as well as the NumPy
numeric types (e.g. `numpy.float64`) by default. The values of
additional types can be stored by adding them to the `store_values`
setting using the :func:`alpaca.settings.alpaca_setting` function.
Default: None
Attributes
----------
Expand All @@ -235,7 +250,7 @@ class name of the object (e.g., `ndarray`). The `Python_name` node
def __init__(self, *prov_file, annotations=None, attributes=None,
strip_namespace=True, remove_none=True,
use_name_in_parameter=True, use_class_in_method_name=True,
time_intervals=True):
time_intervals=True, value_attribute=None):

# Load PROV records from the file(s)
doc = AlpacaProvDocument()
Expand All @@ -250,7 +265,7 @@ def __init__(self, *prov_file, annotations=None, attributes=None,
strip_namespace=strip_namespace, remove_none=remove_none,
use_name_in_parameter=use_name_in_parameter,
use_class_in_method_name=use_class_in_method_name,
time_intervals=time_intervals
time_intervals=time_intervals, value_attribute=value_attribute
)

if time_intervals:
Expand Down Expand Up @@ -319,7 +334,7 @@ def _transform_graph(graph, annotations=None, attributes=None,
strip_namespace=True, remove_none=True,
use_name_in_parameter=True,
use_class_in_method_name=True,
time_intervals=True):
time_intervals=True, value_attribute=None):
# Transform an RDFlib graph obtained from the PROV data, so that the
# visualization is simplified. A new `nx.DiGraph` object is created
# and returned. Annotations and attributes of the entities stored in
Expand All @@ -341,7 +356,8 @@ def _transform_graph(graph, annotations=None, attributes=None,
data = _get_entity_data(graph, entity,
annotations=annotations,
attributes=attributes,
strip_namespace=strip_namespace)
strip_namespace=strip_namespace,
value_attribute=value_attribute)
transformed.add_node(node_id, **data)

# Add all the edges.
Expand Down
40 changes: 40 additions & 0 deletions alpaca/serialization/prov.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
"""

from itertools import product
import numpy as np
import numbers

from rdflib import Graph, URIRef, BNode, Literal
from rdflib.namespace import RDF, PROV, XSD
Expand Down Expand Up @@ -59,6 +61,14 @@ class AlpacaProvDocument(object):
control the serialization.
"""

# Mapping from value classes to the XSD datatypes used when the stored
# value of an object is serialized as a `PROV.value` literal.
# `_get_entity_value_datatype` first looks up the exact class of the value
# in this mapping, and then falls back to the abstract numeric classes
# using `issubclass`.
XSD_TYPES = {
numbers.Integral: XSD.integer,
numbers.Real: XSD.double,
# Complex numbers are serialized as strings
numbers.Complex: XSD.string,
str: XSD.string,
bool: XSD.boolean,
}

def __init__(self):
self.graph = Graph()
self.graph.namespace_manager.bind('alpaca', ALPACA)
Expand Down Expand Up @@ -142,6 +152,30 @@ def _add_FunctionExecution(self, script_info, session_id, execution_id,
return uri

# Entity methods
@classmethod
def _get_entity_value_datatype(cls, info):
    """
    Select the XSD datatype used to serialize the value stored in `info`.

    Parameters
    ----------
    info : object
        Object information with the `value` and `type` attributes (the
        same object that is passed to `_add_DataObjectEntity`).

    Returns
    -------
    object or None
        The XSD datatype to use for the value: an entry of `XSD_TYPES`, or
        `XSD.string` if the object type is listed in the `store_values`
        setting. None is returned if there is no stored value, or if no
        datatype could be determined for it.
    """
    value = info.value
    if value is None:
        return None

    # Look for an exact match of the value class first. This ensures that
    # `bool` values are mapped to booleans, as `bool` is also a subclass
    # of `numbers.Integral`. For NumPy numbers, the class is taken from
    # the dtype.
    if isinstance(value, np.number):
        value_class = value.dtype.type
    else:
        value_class = value.__class__
    if value_class in cls.XSD_TYPES:
        return cls.XSD_TYPES[value_class]

    # Check if the object type is included in the `store_values` setting.
    # In this case, the values are always stored as strings
    if info.type in _ALPACA_SETTINGS['store_values']:
        return XSD.string

    # Fall back to the parent classes. `str` is included so that values of
    # string subclasses (e.g., `numpy.str_`), which are captured together
    # with the builtin strings, are not dropped from the serialization.
    for possible_type in (numbers.Integral, numbers.Real, numbers.Complex,
                          str):
        if issubclass(value_class, possible_type):
            return cls.XSD_TYPES[possible_type]

    # Type not found
    return None

def _add_DataObjectEntity(self, info):
# Adds a DataObjectEntity from the Alpaca PROV model
Expand All @@ -152,6 +186,12 @@ def _add_DataObjectEntity(self, info):
return uri
self.graph.add((uri, RDF.type, ALPACA.DataObjectEntity))
self.graph.add((uri, ALPACA.hashSource, Literal(info.hash_method)))

value_datatype = self._get_entity_value_datatype(info)
if value_datatype:
self.graph.add((uri, PROV.value,
Literal(info.value, datatype=value_datatype)))

self._add_entity_metadata(uri, info)
self._entity_uris.add(uri)
return uri
Expand Down
13 changes: 12 additions & 1 deletion alpaca/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,16 @@
Default: "my-authority"
* **store_values**: list of str
The values of objects whose types are in the list will be stored
together with the provenance information. Note that the values of objects
of the builtin types `str`, `bool`, `int`, `float` and `complex`, as well as
the NumPy numeric types (e.g. `numpy.float64`) are stored by default.
This option should be used to store values of more complex types, such
as dictionaries. In this case, the list in this setting should have
the `builtins.dict` entry. Each string is the full path to the Python
class, i.e., `[module].[...].[object_class]`.
To set/read a setting, use the function :func:`alpaca_setting`.
Expand All @@ -61,7 +71,8 @@
# Should be modified only through the `alpaca_setting` function.

_ALPACA_SETTINGS = {'use_builtin_hash_for_module': [],
'authority': "my-authority"}
'authority': "my-authority",
'store_values': []}


def alpaca_setting(name, value=None):
Expand Down
Loading

0 comments on commit e228585

Please sign in to comment.