From 557c410569caa95f779869af4e80592dc007f7ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= <c.koehler@fz-juelich.de>
Date: Thu, 9 Nov 2023 17:42:52 +0100
Subject: [PATCH] Unit test for using a callable for graph aggregation

---
 alpaca/test/res/multiple_file_output.ttl | 103 +++++++++++++++++++++++
 alpaca/test/test_graph.py                |  82 ++++++++++++++++++
 2 files changed, 185 insertions(+)
 create mode 100644 alpaca/test/res/multiple_file_output.ttl

diff --git a/alpaca/test/res/multiple_file_output.ttl b/alpaca/test/res/multiple_file_output.ttl
new file mode 100644
index 0000000..4e1d0f2
--- /dev/null
+++ b/alpaca/test/res/multiple_file_output.ttl
@@ -0,0 +1,103 @@
+@prefix alpaca: <http://purl.org/alpaca#> .
+@prefix prov: <http://www.w3.org/ns/prov#> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+<urn:my-authority:alpaca:file:sha256:98765> a alpaca:FileEntity ;
+    alpaca:filePath "/outputs/1.png"^^xsd:string ;
+    prov:wasDerivedFrom <urn:my-authority:alpaca:object:Python:test.InputObject:12345> ;
+    prov:wasAttributedTo <urn:my-authority:alpaca:script:Python:script.py:111111#999999> ;
+    prov:wasGeneratedBy <urn:my-authority:alpaca:function_execution:Python:111111:999999:test.plot_function#12345> .
+
+<urn:my-authority:alpaca:file:sha256:987651> a alpaca:FileEntity ;
+    alpaca:filePath "/outputs/2.png"^^xsd:string ;
+    prov:wasDerivedFrom <urn:my-authority:alpaca:object:Python:test.InputObject:123452> ;
+    prov:wasAttributedTo <urn:my-authority:alpaca:script:Python:script.py:111111#999999> ;
+    prov:wasGeneratedBy <urn:my-authority:alpaca:function_execution:Python:111111:999999:test.plot_function#123452> .
+
+<urn:my-authority:alpaca:object:Python:builtins.NoneType:777777> a alpaca:DataObjectEntity ;
+    prov:wasAttributedTo <urn:my-authority:alpaca:script:Python:script.py:111111#999999>  ;
+    prov:wasDerivedFrom <urn:my-authority:alpaca:object:Python:test.InputObject:12345> ;
+    prov:wasGeneratedBy <urn:my-authority:alpaca:function_execution:Python:111111:999999:test.plot_function#12345> ;
+    alpaca:hashSource "UUID" .
+
+<urn:my-authority:alpaca:object:Python:builtins.NoneType:7777772> a alpaca:DataObjectEntity ;
+    prov:wasAttributedTo <urn:my-authority:alpaca:script:Python:script.py:111111#999999>  ;
+    prov:wasDerivedFrom <urn:my-authority:alpaca:object:Python:test.InputObject:123452> ;
+    prov:wasGeneratedBy <urn:my-authority:alpaca:function_execution:Python:111111:999999:test.plot_function#123452> ;
+    alpaca:hashSource "UUID" .
+
+<urn:my-authority:alpaca:object:Python:test.InputObject:12345> a alpaca:DataObjectEntity ;
+    prov:wasAttributedTo <urn:my-authority:alpaca:script:Python:script.py:111111#999999> ;
+    prov:wasDerivedFrom <urn:my-authority:alpaca:object:Python:test.InputObject:22345> ;
+    prov:wasGeneratedBy <urn:my-authority:alpaca:function_execution:Python:111111:999999:test.cut_function#12345> ;
+    alpaca:hashSource "joblib_SHA1" .
+
+<urn:my-authority:alpaca:object:Python:test.InputObject:123452> a alpaca:DataObjectEntity ;
+    prov:wasAttributedTo <urn:my-authority:alpaca:script:Python:script.py:111111#999999> ;
+    prov:wasDerivedFrom <urn:my-authority:alpaca:object:Python:test.InputObject:22345> ;
+    prov:wasGeneratedBy <urn:my-authority:alpaca:function_execution:Python:111111:999999:test.cut_function#12345> ;
+    alpaca:hashSource "joblib_SHA1" .
+
+<urn:my-authority:alpaca:object:Python:test.InputObject:22345> a alpaca:DataObjectEntity ;
+    prov:wasAttributedTo <urn:my-authority:alpaca:script:Python:script.py:111111#999999> ;
+    alpaca:hashSource "joblib_SHA1" .
+
+<urn:my-authority:alpaca:file:sha256:18765> a alpaca:FileEntity ;
+    alpaca:filePath "/full.png"^^xsd:string ;
+    prov:wasDerivedFrom <urn:my-authority:alpaca:object:Python:test.InputObject:22345> ;
+    prov:wasAttributedTo <urn:my-authority:alpaca:script:Python:script.py:111111#999999> ;
+    prov:wasGeneratedBy <urn:my-authority:alpaca:function_execution:Python:111111:999999:test.plot_function#22345> .
+
+<urn:my-authority:alpaca:object:Python:builtins.NoneType:666666> a alpaca:DataObjectEntity ;
+    prov:wasAttributedTo <urn:my-authority:alpaca:script:Python:script.py:111111#999999>  ;
+    prov:wasDerivedFrom <urn:my-authority:alpaca:object:Python:test.InputObject:22345> ;
+    prov:wasGeneratedBy <urn:my-authority:alpaca:function_execution:Python:111111:999999:test.plot_function#22345> ;
+    alpaca:hashSource "UUID" .
+
+<urn:my-authority:alpaca:function_execution:Python:111111:999999:test.plot_function#12345> a alpaca:FunctionExecution ;
+    prov:startedAtTime "2022-05-02T12:34:56.123456"^^xsd:dateTime ;
+    prov:endedAtTime "2022-05-02T12:35:56.123456"^^xsd:dateTime ;
+    prov:used <urn:my-authority:alpaca:object:Python:test.InputObject:12345> ;
+    prov:wasAssociatedWith <urn:my-authority:alpaca:script:Python:script.py:111111#999999> ;
+    alpaca:codeStatement "plot_function(input, out_file)" ;
+    alpaca:executionOrder 3 ;
+    alpaca:usedFunction <urn:my-authority:alpaca:function:Python:test.plot_function> .
+
+<urn:my-authority:alpaca:function_execution:Python:111111:999999:test.plot_function#123452> a alpaca:FunctionExecution ;
+    prov:startedAtTime "2022-05-02T12:34:56.123456"^^xsd:dateTime ;
+    prov:endedAtTime "2022-05-02T12:35:56.123456"^^xsd:dateTime ;
+    prov:used <urn:my-authority:alpaca:object:Python:test.InputObject:123452> ;
+    prov:wasAssociatedWith <urn:my-authority:alpaca:script:Python:script.py:111111#999999> ;
+    alpaca:codeStatement "plot_function(input, out_file)" ;
+    alpaca:executionOrder 4 ;
+    alpaca:usedFunction <urn:my-authority:alpaca:function:Python:test.plot_function> .
+
+<urn:my-authority:alpaca:function_execution:Python:111111:999999:test.plot_function#22345> a alpaca:FunctionExecution ;
+    prov:startedAtTime "2022-05-02T12:34:56.123456"^^xsd:dateTime ;
+    prov:endedAtTime "2022-05-02T12:35:56.123456"^^xsd:dateTime ;
+    prov:used <urn:my-authority:alpaca:object:Python:test.InputObject:22345> ;
+    prov:wasAssociatedWith <urn:my-authority:alpaca:script:Python:script.py:111111#999999> ;
+    alpaca:codeStatement "plot_function(input, out_file)" ;
+    alpaca:executionOrder 1 ;
+    alpaca:usedFunction <urn:my-authority:alpaca:function:Python:test.plot_function> .
+
+<urn:my-authority:alpaca:function_execution:Python:111111:999999:test.cut_function#12345> a alpaca:FunctionExecution ;prov:startedAtTime "2022-05-02T12:34:56.123456"^^xsd:dateTime ;
+    prov:endedAtTime "2022-05-02T12:35:56.123456"^^xsd:dateTime ;
+    prov:used <urn:my-authority:alpaca:object:Python:test.InputObject:22345> ;
+    prov:wasAssociatedWith <urn:my-authority:alpaca:script:Python:script.py:111111#999999> ;
+    alpaca:codeStatement "cut_function(full_data)" ;
+    alpaca:executionOrder 2 ;
+    alpaca:usedFunction <urn:my-authority:alpaca:function:Python:test.cut_function> .
+
+<urn:my-authority:alpaca:function:Python:test.plot_function> a alpaca:Function ;
+    alpaca:functionName "plot_function" ;
+    alpaca:implementedIn "test" ;
+    alpaca:functionVersion "0.0.1" .
+
+<urn:my-authority:alpaca:function:Python:test.cut_function> a alpaca:Function ;
+    alpaca:functionName "cut_function" ;
+    alpaca:implementedIn "test" ;
+    alpaca:functionVersion "0.0.1" .
+
+<urn:my-authority:alpaca:script:Python:script.py:111111#999999> a alpaca:ScriptAgent ;
+    alpaca:scriptPath "/script.py" .
diff --git a/alpaca/test/test_graph.py b/alpaca/test/test_graph.py
index c18945e..46bb78b 100644
--- a/alpaca/test/test_graph.py
+++ b/alpaca/test/test_graph.py
@@ -1,3 +1,4 @@
+import sys
 import unittest
 
 from pathlib import Path
@@ -543,6 +544,87 @@ def test_overall_aggregation(self):
                 for key, value in expected_values_per_node[label].items():
                     self.assertEqual(attrs[key], value)
 
+    def test_aggregation_by_callable(self):
+        # Compare the non-aggregated graph with three aggregations of it:
+        # without criteria, by the `File_path` attribute of File nodes, and
+        # by a callable given as the criterion for File nodes
+        graph_file = self.ttl_path / "multiple_file_output.ttl"
+
+        # Non-aggregated graph
+        graph = ProvenanceGraph(graph_file)
+
+        # Aggregate without attributes
+        aggregated = graph.aggregate({}, output_file=None)
+
+        # Aggregate separating by file path in File nodes
+        aggregated_path = graph.aggregate({'File': ('File_path',)},
+                                          output_file=None)
+
+        # Aggregate using a callable to separate files whose path starts
+        # with "/outputs/"
+        is_cut_plot = lambda g, n, d: d['File_path'].startswith("/outputs/")
+        aggregated_callable = graph.aggregate({'File': (is_cut_plot,)},
+                                              output_file=None)
+
+        # Expected values for each case, used in the subtests below. Merged
+        # File nodes are expected to hold their paths joined by ";"
+        tests = {
+            'non_aggregated': {'graph': graph.graph, 'length': 10,
+                               'counts': {'InputObject': 3,
+                                          'plot_function': 3,
+                                          'cut_function': 1,
+                                          'File': 3},
+                               'paths': ["/full.png",
+                                         "/outputs/1.png",
+                                         "/outputs/2.png"]
+                               },
+
+            'aggregated': {'graph': aggregated, 'length': 5,
+                           'counts': {'InputObject': 2,
+                                      'plot_function': 1,
+                                      'cut_function': 1,
+                                      'File': 1},
+                           'paths': ["/full.png;/outputs/1.png;/outputs/2.png"]
+                           },
+
+            'aggregated_path': {'graph': aggregated_path, 'length': 10,
+                                'counts': {'InputObject': 3,
+                                           'plot_function': 3,
+                                           'cut_function': 1,
+                                           'File': 3},
+                                'paths': ["/full.png",
+                                          "/outputs/1.png",
+                                          "/outputs/2.png"]
+                                },
+            'aggregated_callable': {'graph': aggregated_callable, 'length': 7,
+                                    'counts': {'InputObject': 2,
+                                               'plot_function': 2,
+                                               'cut_function': 1,
+                                               'File': 2},
+                                    'paths': ["/full.png",
+                                              "/outputs/1.png;/outputs/2.png"]
+                                    },
+        }
+
+        for key, expected in tests.items():
+            with self.subTest(f"Graph {key}"):
+                test_graph = expected['graph']
+                nodes = test_graph.nodes
+                self.assertEqual(len(nodes), expected['length'])
+
+                # Check if node counts are as expected
+                all_labels = [nodes[node]['label'] for node in nodes]
+                counts = Counter(all_labels)
+                for label, count in expected['counts'].items():
+                    self.assertEqual(counts[label], count)
+
+                # The File nodes must hold exactly the expected paths: a
+                # membership test alone would not notice a missing or
+                # duplicated path
+                file_paths = [attrs['File_path'] for attrs in nodes.values()
+                              if attrs['label'] == "File"]
+                self.assertCountEqual(file_paths, expected['paths'])
+
     def test_aggregation_by_attribute_with_missing(self):
         aggregated = self.graph.aggregate({'InputObject': ('id',)},
                                           use_function_parameters=False,