Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Capture and serialization of object values #27

Merged
merged 10 commits into from
Dec 6, 2023
2 changes: 1 addition & 1 deletion alpaca/alpaca_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,6 @@
# the disk.

DataObject = namedtuple('DataObject', ('hash', 'hash_method', 'type', 'id',
'details'))
'details', 'value'))

File = namedtuple('File', ('hash', 'hash_type', 'path'))
35 changes: 32 additions & 3 deletions alpaca/data_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import joblib
import numpy as np
from numbers import Number
from dill._dill import save_function

from alpaca.alpaca_types import DataObject, File
Expand Down Expand Up @@ -118,6 +119,11 @@ class _ObjectInformation(object):
:func:`hash` function, depending on the `use_builtin_hash` parameter set
during initialization.

The values of objects of the builtin types `str`, `bool`, `int`, `complex`
and `float` as well as the NumPy numeric types (e.g., `np.float64`) will
be stored. Additional object types to be stored (e.g., `builtins.dict`)
can be defined with the `store_values` parameter.

The method `info` is called to obtain the provenance information
associated with the object during tracking, as the `DataObject` named
tuple. The relevant metadata attributes are also stored in the tuple.
Expand All @@ -132,6 +138,13 @@ class _ObjectInformation(object):
List of package names whose object hashes will be computed using the
Python builtin `hash` function, instead of `joblib.hash` function.
Default: None
store_values : list, optional
List of object types whose values will be stored in the provenance
information (e.g., `builtins.dict`). This is in addition to the
builtin types `str`, `bool`, `int`, `complex` and `float` as well as
the NumPy numeric types (e.g., `np.float64`). The values of these are
always stored.
Default: None
"""

# This is a list of object attributes that provide relevant provenance
Expand All @@ -140,10 +153,12 @@ class _ObjectInformation(object):
'id', 'nix_name', 'dimensionality', 'pid',
'create_time')

def __init__(self, use_builtin_hash=None):
def __init__(self, use_builtin_hash=None, store_values=None):
self._hash_memoizer = dict()
self._use_builtin_hash = copy(use_builtin_hash) \
if use_builtin_hash is not None else []
self._store_values = copy(store_values)\
if store_values is not None else []

@staticmethod
def _get_object_package(obj):
Expand Down Expand Up @@ -258,6 +273,11 @@ def info(self, obj):
Reference for the object.
* details : dict
Extended information (metadata) on the object.
* value : object
For builtin objects (`str`, `int`, `float`, `bool`, `complex`)
or equivalent objects (e.g. `numpy.float64`), the value is
stored. Additional object types specified with the
:attr:`store_values` list will also be stored.
"""
type_information = type(obj)
obj_type = f"{type_information.__module__}.{type_information.__name__}"
Expand All @@ -267,7 +287,8 @@ def info(self, obj):
if obj is None:
unique_id = uuid.uuid4()
return DataObject(hash=unique_id, hash_method="UUID",
type=obj_type, id=obj_id, details={})
type=obj_type, id=obj_id, details={},
value=None)

# Here we can extract specific metadata to record
details = {}
Expand All @@ -290,5 +311,13 @@ def info(self, obj):
obj_id=obj_id,
package=package)

# Store object value
obj_value = None
if isinstance(obj, (str, bool, Number)):
obj_value = obj
elif obj_type in self._store_values:
obj_value = str(obj)

return DataObject(hash=obj_hash, hash_method=hash_method,
type=obj_type, id=obj_id, details=details)
type=obj_type, id=obj_id, details=details,
value=obj_value)
18 changes: 12 additions & 6 deletions alpaca/decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,8 @@ def _capture_code_and_function_provenance(self, lineno, function):
return source_line, ast_tree, return_targets, function_info

def _capture_input_and_parameters_provenance(self, function, args, kwargs,
ast_tree, function_info, time_stamp_start, builtin_object_hash):
ast_tree, function_info, time_stamp_start, builtin_object_hash,
store_values):

# 1. Extract the parameters passed to the function and store them in
# the `input_data` dictionary.
Expand All @@ -446,7 +447,8 @@ def _capture_input_and_parameters_provenance(self, function, args, kwargs,
# After this step, all hashes and metadata of input parameters/files
# are going to be stored in the dictionary `inputs`.

data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash)
data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash,
store_values=store_values)

# Initialize parameter list with all default arguments that were not
# passed to the function
Expand Down Expand Up @@ -576,11 +578,12 @@ def _capture_container_output(self, function_output, data_info,
def _capture_output_provenance(self, function_output, return_targets,
input_data, builtin_object_hash,
time_stamp_start, execution_id,
constructed_object=None):
store_values, constructed_object=None):

# In case in-place operations were performed, lets not use
# memoization
data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash)
data_info = _ObjectInformation(use_builtin_hash=builtin_object_hash,
store_values=store_values)

# 6. Create hash for the output using `_ObjectInformation` to follow
# individual returns. The hashes will be stored in the `outputs`
Expand Down Expand Up @@ -626,7 +629,8 @@ def wrapped(*args, **kwargs):

builtin_object_hash = _ALPACA_SETTINGS[
'use_builtin_hash_for_module']
logger.debug(f"Builtin object hash: {builtin_object_hash}")
store_values = _ALPACA_SETTINGS['store_values']
logging.debug(f"Builtin object hash: {builtin_object_hash}")

lineno = None

Expand Down Expand Up @@ -667,7 +671,8 @@ def wrapped(*args, **kwargs):
function=function, args=args, kwargs=kwargs,
ast_tree=ast_tree, function_info=function_info,
time_stamp_start=time_stamp_start,
builtin_object_hash=builtin_object_hash)
builtin_object_hash=builtin_object_hash,
store_values=store_values)

# Call the function
function_output = function(*args, **kwargs)
Expand All @@ -694,6 +699,7 @@ def wrapped(*args, **kwargs):
builtin_object_hash=builtin_object_hash,
time_stamp_start=time_stamp_start,
execution_id=execution_id,
store_values=store_values,
constructed_object=constructed_object)

# Get the end time stamp
Expand Down
26 changes: 21 additions & 5 deletions alpaca/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _get_name_value_pair(graph, bnode):


def _get_entity_data(graph, entity, annotations=None, attributes=None,
strip_namespace=True):
strip_namespace=True, value_attribute=None):
filter_map = defaultdict(list)

filter_map.update(
Expand All @@ -118,6 +118,12 @@ def _get_entity_data(graph, entity, annotations=None, attributes=None,
_add_attribute(data, attr_name, attr_type, attr_value,
strip_namespace)

# Get the stored value if requested and present. The presence check must
# be an explicit `is not None`: an rdflib `Literal` evaluates as falsy
# when it wraps a falsy Python value (`0`, `0.0`, `False`, `""`), so a
# plain truthiness test would drop exactly those stored values from the
# node attributes.
if value_attribute:
    value = graph.value(entity, PROV.value)
    if value is not None:
        data[value_attribute] = value.toPython()

if data['type'] == NSS_FILE:
file_path = str(list(graph.objects(entity, ALPACA.filePath))[0])
data["File_path"] = file_path
Expand Down Expand Up @@ -223,6 +229,15 @@ class name of the object (e.g., `ndarray`). The `Python_name` node
time interval strings in the format supported by the Gephi timeline
feature. If False, the attribute is not included.
Default: True
value_attribute : str, optional
If provided, an attribute named `value_attribute` will be added to
the node attributes to show the values stored in the provenance
information. Alpaca stores the values of objects of the builtin types
`str`, `bool`, `int`, `float` and `complex`, as well as the NumPy
numeric types (e.g. `numpy.float64`) by default. Values of objects of
additional types are stored when those types are listed in the
`store_values` setting (see :func:`alpaca.settings.alpaca_setting`).
Default: None

Attributes
----------
Expand All @@ -235,7 +250,7 @@ class name of the object (e.g., `ndarray`). The `Python_name` node
def __init__(self, *prov_file, annotations=None, attributes=None,
strip_namespace=True, remove_none=True,
use_name_in_parameter=True, use_class_in_method_name=True,
time_intervals=True):
time_intervals=True, value_attribute=None):

# Load PROV records from the file(s)
doc = AlpacaProvDocument()
Expand All @@ -250,7 +265,7 @@ def __init__(self, *prov_file, annotations=None, attributes=None,
strip_namespace=strip_namespace, remove_none=remove_none,
use_name_in_parameter=use_name_in_parameter,
use_class_in_method_name=use_class_in_method_name,
time_intervals=time_intervals
time_intervals=time_intervals, value_attribute=value_attribute
)

if time_intervals:
Expand Down Expand Up @@ -319,7 +334,7 @@ def _transform_graph(graph, annotations=None, attributes=None,
strip_namespace=True, remove_none=True,
use_name_in_parameter=True,
use_class_in_method_name=True,
time_intervals=True):
time_intervals=True, value_attribute=None):
# Transform an RDFlib graph obtained from the PROV data, so that the
# visualization is simplified. A new `nx.DiGraph` object is created
# and returned. Annotations and attributes of the entities stored in
Expand All @@ -341,7 +356,8 @@ def _transform_graph(graph, annotations=None, attributes=None,
data = _get_entity_data(graph, entity,
annotations=annotations,
attributes=attributes,
strip_namespace=strip_namespace)
strip_namespace=strip_namespace,
value_attribute=value_attribute)
transformed.add_node(node_id, **data)

# Add all the edges.
Expand Down
40 changes: 40 additions & 0 deletions alpaca/serialization/prov.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
"""

from itertools import product
import numpy as np
import numbers

from rdflib import Graph, URIRef, BNode, Literal
from rdflib.namespace import RDF, PROV, XSD
Expand Down Expand Up @@ -59,6 +61,14 @@ class AlpacaProvDocument(object):
control the serialization.
"""

XSD_TYPES = {
numbers.Integral: XSD.integer,
numbers.Real: XSD.double,
numbers.Complex: XSD.string,
str: XSD.string,
bool: XSD.boolean,
}

def __init__(self):
self.graph = Graph()
self.graph.namespace_manager.bind('alpaca', ALPACA)
Expand Down Expand Up @@ -142,6 +152,30 @@ def _add_FunctionExecution(self, script_info, session_id, execution_id,
return uri

# Entity methods
@classmethod
def _get_entity_value_datatype(cls, info):
    """
    Select the XSD datatype used to serialize the value of a data object.

    Parameters
    ----------
    info : DataObject
        Named tuple with the object information. Only the `value` and
        `type` fields are read.

    Returns
    -------
    rdflib.term.URIRef or None
        The XSD datatype taken from `XSD_TYPES` (or `XSD.string` for types
        listed in the `store_values` setting). None if no value was
        captured or if the value type has no known mapping, in which case
        the value should not be serialized.
    """
    value = info.value
    if value is None:
        return None

    # Exact class match first. This resolves `str` and, importantly,
    # `bool` before the numeric checks below (`bool` is an `Integral`).
    value_class = type(value)
    if value_class in cls.XSD_TYPES:
        return cls.XSD_TYPES[value_class]

    # Check if the object type is included in the `store_values` setting.
    # In this case, values are always stored as strings.
    if info.type in _ALPACA_SETTINGS['store_values']:
        return XSD.string

    # Fall back to the abstract type of the value. `str` is included so
    # that string subclasses (e.g., `numpy.str_`), whose values are
    # captured through an `isinstance` check, are not silently dropped.
    # The numeric ABCs cover the builtin numbers and the NumPy scalar
    # types; order matters, from the most to the least specific.
    for possible_type in (str, numbers.Integral, numbers.Real,
                          numbers.Complex):
        if isinstance(value, possible_type):
            return cls.XSD_TYPES[possible_type]

    # Type not found
    return None

def _add_DataObjectEntity(self, info):
# Adds a DataObjectEntity from the Alpaca PROV model
Expand All @@ -152,6 +186,12 @@ def _add_DataObjectEntity(self, info):
return uri
self.graph.add((uri, RDF.type, ALPACA.DataObjectEntity))
self.graph.add((uri, ALPACA.hashSource, Literal(info.hash_method)))

value_datatype = self._get_entity_value_datatype(info)
if value_datatype:
self.graph.add((uri, PROV.value,
Literal(info.value, datatype=value_datatype)))

self._add_entity_metadata(uri, info)
self._entity_uris.add(uri)
return uri
Expand Down
13 changes: 12 additions & 1 deletion alpaca/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,16 @@
Default: "my-authority"
* **store_values**: list of str
The values of the objects from the types in the list will be stored
together with the provenance information. Note that objects of the
builtin types `str`, `bool`, `int`, `float` and `complex`, as well as
the NumPy numeric types (e.g. `numpy.float64`) are stored by default.
This option should be used to store values of more complex types, such
as dictionaries. In this case, the list in this setting should have
the `builtins.dict` entry. The strings are the full path to the Python
object, i.e., `[module].[...].[object_class]`.
To set/read a setting, use the function :func:`alpaca_setting`.
Expand All @@ -61,7 +71,8 @@
# Should be modified only through the `alpaca_setting` function.

_ALPACA_SETTINGS = {'use_builtin_hash_for_module': [],
'authority': "my-authority"}
'authority': "my-authority",
'store_values': []}


def alpaca_setting(name, value=None):
Expand Down
Loading
Loading