instructlab · markmc · Jul 18, 2024 · Jul 16, 2024 · Jul 16, 2024 · Jul 17, 2024
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
@@ -67,17 +67,33 @@ def generate(self, dataset) -> Dataset:
         dataset: the input dataset
         """
         for block_prop in self.chained_blocks:
+            # Parse and instantiate the block
             block_name = block_prop["name"]
             block_type = _lookup_block_type(block_prop["type"])
             block_config = block_prop["config"]
             drop_columns = block_prop.get("drop_columns", [])
             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
             block = block_type(self.ctx, self, block_name, **block_config)
-
             logger.info("Running block: %s", block_name)
             logger.info(dataset)
 
-            dataset = block.generate(dataset)
+            # Execute the block and wrap errors with the block name/type
+            try:
+                dataset = block.generate(dataset)
+
+            except Exception as err:
+                block_exc_err = (
+                    f"BLOCK ERROR [{block_type.__name__}/{block_name}]: {err}"
+                )
+
+                # Try to raise the same exception type. This can fail if the
+                # exception is a non-standard type that has a different init
+                # signature, so fall back to raising a RuntimeError in that case.
+                try:
+                    wrapper_err = type(err)(block_exc_err)
+                except TypeError:
+                    wrapper_err = RuntimeError(block_exc_err)
+                raise wrapper_err from err
 
             # If at any point we end up with an empty data set, the pipeline has failed
             if len(dataset) == 0:

diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
@@ -0,0 +1,64 @@
+"""
+Unit tests for common Pipeline functionality
+"""
+
+# Standard
+from unittest import mock
+
+# Third Party
+import pytest
+
+# First Party
+from instructlab.sdg.pipeline import Pipeline
+
+## Helpers ##
+
+
+class CustomTypeError(TypeError):  # inherits TypeError's constructor unchanged
+    pass
+
+
+class NoArgError(RuntimeError):
+    """Exception with a zero-argument __init__; NoArgError(msg) raises TypeError"""
+
+    def __init__(self):
+        super().__init__("no args")  # message is fixed; callers cannot supply one
+
+
+@pytest.mark.parametrize(
+    ["failure_exc", "exp_err_type"],
+    [
+        (CustomTypeError("Oh no!"), CustomTypeError),  # same type is re-raised
+        (NoArgError(), RuntimeError),  # cannot be rebuilt from a message
+    ],
+)
+def test_pipeline_named_errors_match_type(failure_exc, exp_err_type):
+    """Validate that a failing block's type and name are in the raised error and
+    that its type is kept (or is RuntimeError if it can't be rebuilt from a message).
+    """
+    mock_dataset = ["not empty"]  # non-empty: an empty dataset is a pipeline failure
+    working_block = mock.MagicMock()
+    working_block().generate.return_value = mock_dataset
+    failure_block = mock.MagicMock()
+    failure_block.__name__ = "BadBlock"  # read when the error message is built
+    failure_block().generate = mock.MagicMock(side_effect=failure_exc)
+    pipe_cfg = [
+        {"name": "I work", "type": "working", "config": {}},
+        {"name": "I don't", "type": "failure", "config": {}},
+    ]
+    # Swap the block-type registry so the "type" keys above resolve to the mocks
+    with mock.patch(
+        "instructlab.sdg.pipeline._block_types",
+        {
+            "working": working_block,
+            "failure": failure_block,
+        },
+    ):
+        pipe = Pipeline(None, None, pipe_cfg)
+        with pytest.raises(exp_err_type) as exc_ctx:
+            pipe.generate(None)
+
+        assert exc_ctx.value.__cause__ is failure_exc  # chained via "raise ... from"
+        assert (
+            str(exc_ctx.value)
+            == f"BLOCK ERROR [{failure_block.__name__}/{pipe_cfg[1]['name']}]: {failure_exc}"
+        )