Skip to content

Commit

Permalink
[Enh] Selecting ranges for levels in nested container outputs (#25)
Browse files Browse the repository at this point in the history
* Using a tuple to define a range in container output tracking

* Added documentation for the new behavior of container_output option.

---------

Co-authored-by: Cristiano Köhler <[email protected]>
  • Loading branch information
kohlerca and Cristiano Köhler authored Nov 17, 2023
1 parent 12f2e4d commit 785fa8f
Show file tree
Hide file tree
Showing 2 changed files with 437 additions and 22 deletions.
78 changes: 70 additions & 8 deletions alpaca/decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

from functools import wraps
import itertools
from collections.abc import Iterable
from importlib.metadata import version, PackageNotFoundError
import inspect
Expand Down Expand Up @@ -68,18 +69,55 @@ class Provenance(object):
data structures used by the function). Alpaca will track and identify
the elements inside the container, instead of the container itself.
Default: None
container_output : bool or int, optional
container_output : bool or int or tuple, optional
The function outputs data inside a container (e.g., a list).
If True, Alpaca will track and identify the elements inside the
container, instead of the container itself. It will iterate over the
function output object and identify the individual elements. However,
for dictionary outputs, the dictionary object is identified together
with its elements, to retain information on the keys. For other
containers, the container object is not identified.
If an integer, this defines a multiple-level (nested) container. The
number defines the depth for which to identify and serialize the
objects. In this case, the function output object will always be
identified together with the element tree.
identified together with the element tree. For instance, consider
the two-level list `L = [[obj1, obj2], [obj3, obj4]]`. With
`container_output=0`, there will be a single function output node for
list `L`. Starting from `L`, there will be two additional nodes for
each of the inner lists (`L[0]` and `L[1]`, i.e., all elements from
level zero). With `container_output=1`, there will be a single
function output node for list `L`. Starting from `L`, there
will be two additional nodes for each of the inner lists (`L[0]` and
`L[1]`). Finally, starting from each inner list, there will be output
nodes for `obj1` and `obj2` (linked to `L[0]`) and for `obj3` and
`obj4` (linked to `L[1]`). Therefore, all elements from level one are
identified, and linked to the respective elements from level zero.
If a tuple, this defines a range of the levels in a nested container
to consider when identifying the objects output by the function. For
example, taking the same list above, a `container_output=(0, 1)` will
start from level zero and stop at the elements from level
one (similar to `container_output=1`). With `container_output=(1, 1)`,
the first level will be ignored as function output. The function will
have two output nodes (directly for `L[0]` and `L[1]`). Starting from
each inner list, there will be output nodes for `obj1` and `obj2`
(linked to `L[0]`) and for `obj3` and `obj4` (linked to `L[1]`).
Therefore, the first level (zero) of the container is ignored, and only
elements from level one are described. The range feature is useful for
functions where the relevant outputs are containers whose elements
should also be described, but those containers are grouped inside a
single return list instead of the function returning a tuple with the
containers.
It is important to note that all levels identified as integers or
range tuples should point to levels in the nested-container that
contain iterables. For example, in the list `L` above, the level 2
are the objects `objX`. If `container_output=2`, Alpaca will try to
iterate over each `objX` and describe their elements. If they are
not iterable, an error will be raised.
Default: False
Attributes
Expand Down Expand Up @@ -174,10 +212,17 @@ def __init__(self, inputs, file_input=None, file_output=None,
# Store the names of arguments that are inputs
self.inputs = inputs

self.container_output = container_output
self._tracking_container_output = \
((isinstance(container_output, bool) and container_output) or
(not isinstance(container_output, bool) and container_output >= 0))
self.container_output = False
self._tracking_container_output = False
if isinstance(container_output, bool):
self._tracking_container_output = container_output
self.container_output = container_output
elif isinstance(container_output, tuple):
self._tracking_container_output = len(container_output) == 2
self.container_output = container_output
elif isinstance(container_output, int):
self._tracking_container_output = container_output >= 0
self.container_output = (0, container_output)

def _insert_static_information(self, tree, data_info, function,
time_stamp):
Expand Down Expand Up @@ -486,7 +531,7 @@ def _add_container_relationships(self, container, data_info, level,
# This will work whether the main container is a dictionary or
# other iterable.
if (level is not None and
level < self.container_output and
level < max(self.container_output) and
(isinstance(element, Iterable) or
hasattr(container, "__getitem__"))):
self._add_container_relationships(element, data_info,
Expand All @@ -500,11 +545,28 @@ def _capture_container_output(self, function_output, data_info,
time_stamp_start, execution_id):
level = None if isinstance(self.container_output, bool) else 0

if isinstance(function_output, dict) or level is not None:
if isinstance(function_output, dict):
container_info = self._add_container_relationships(
function_output, data_info, level, time_stamp_start,
execution_id)
return {0: container_info}
elif level is not None:
if not self.container_output or min(self.container_output) == 0:
# Starting from zero
container_info = self._add_container_relationships(
function_output, data_info, level, time_stamp_start,
execution_id)
return {0: container_info}
else:
# Process range starting from other level
elements = function_output
start_level = min(self.container_output) - 1
for level in range(start_level):
# Unpack all elements until the requested start level
elements = itertools.chain(*elements)
return {idx: self._add_container_relationships(
element, data_info, start_level + 1, time_stamp_start,
execution_id) for idx, element in enumerate(elements)}

# Process simple container.
# The container object will not be identified.
Expand Down
Loading

0 comments on commit 785fa8f

Please sign in to comment.