From 1cfc8af69214c023a6d3da7dde59d09301c520a3 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Tue, 16 Jul 2024 11:56:30 -0600
Subject: [PATCH 1/5] BlockNameInErrors: Add exception handling in Pipeline
 with block name/type wrapping

https://github.com/instructlab/sdg/issues/128

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 src/instructlab/sdg/pipeline.py | 64 ++++++++++++++++++++-------------
 tests/test_pipeline.py          | 64 +++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 25 deletions(-)
 create mode 100644 tests/test_pipeline.py

diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 05044dc0..f05cb660 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -66,31 +66,45 @@ def generate(self, dataset) -> Dataset:
         Generate the dataset by running the pipeline steps.
         dataset: the input dataset
         """
-        for block_prop in self.chained_blocks:
-            block_name = block_prop["name"]
-            block_type = _lookup_block_type(block_prop["type"])
-            block_config = block_prop["config"]
-            drop_columns = block_prop.get("drop_columns", [])
-            drop_duplicates_cols = block_prop.get("drop_duplicates", False)
-            block = block_type(self.ctx, self, block_name, **block_config)
-
-            logger.info("Running block: %s", block_name)
-            logger.info(dataset)
-
-            dataset = block.generate(dataset)
-
-            # If at any point we end up with an empty data set, the pipeline has failed
-            if len(dataset) == 0:
-                raise EmptyDatasetError(
-                    f"Pipeline stopped: Empty dataset after running block: {block_name}"
-                )
-
-            drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
-            if drop_columns:
-                dataset = dataset.remove_columns(drop_columns_in_ds)
-
-            if drop_duplicates_cols:
-                dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
+        try:
+            for block_prop in self.chained_blocks:
+                block_name = block_prop["name"]
+                block_type = _lookup_block_type(block_prop["type"])
+                block_config = block_prop["config"]
+                drop_columns = block_prop.get("drop_columns", [])
+                drop_duplicates_cols = block_prop.get("drop_duplicates", False)
+                block = block_type(self.ctx, self, block_name, **block_config)
+
+                logger.info("Running block: %s", block_name)
+                logger.info(dataset)
+
+                dataset = block.generate(dataset)
+
+                # If at any point we end up with an empty data set, the pipeline has failed
+                if len(dataset) == 0:
+                    raise EmptyDatasetError(
+                        f"Pipeline stopped: Empty dataset after running block: {block_name}"
+                    )
+
+                drop_columns_in_ds = [
+                    e for e in drop_columns if e in dataset.column_names
+                ]
+                if drop_columns:
+                    dataset = dataset.remove_columns(drop_columns_in_ds)
+
+                if drop_duplicates_cols:
+                    dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
+        except Exception as err:
+            block_exc_err = f"BLOCK ERROR [{block_type.__name__}/{block_name}]: {err}"
+
+            # Try to raise the same exception type. This can fail if the
+            # exception is a non-standard type that has a different init
+            # signature, so fall back to raising a RuntimeError in that case.
+            try:
+                wrapper_err = type(err)(block_exc_err)
+            except TypeError:
+                wrapper_err = RuntimeError(block_exc_err)
+            raise wrapper_err from err
 
         return dataset
 
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
new file mode 100644
index 00000000..803b2879
--- /dev/null
+++ b/tests/test_pipeline.py
@@ -0,0 +1,64 @@
+"""
+Unit tests for common Pipeline functionality
+"""
+
+# Standard
+from unittest import mock
+
+# Third Party
+import pytest
+
+# First Party
+from instructlab.sdg.pipeline import Pipeline
+
+## Helpers ##
+
+
+class CustomTypeError(TypeError):
+    pass
+
+
+class NoArgError(RuntimeError):
+    """Exception that can't be instantiated with a single argument"""
+
+    def __init__(self):
+        super().__init__("no args")
+
+
+@pytest.mark.parametrize(
+    ["failure_exc", "exp_err_type"],
+    [
+        (CustomTypeError("Oh no!"), CustomTypeError),
+        (NoArgError(), RuntimeError),
+    ],
+)
+def test_pipeline_named_errors_match_type(failure_exc, exp_err_type):
+    """Validate that block types and names appear in the error message from a
+    pipeline exception and that the type of the error is preserved.
+    """
+    mock_dataset = ["not empty"]
+    working_block = mock.MagicMock()
+    working_block().generate.return_value = mock_dataset
+    failure_block = mock.MagicMock()
+    failure_block.__name__ = "BadBlock"
+    failure_block().generate = mock.MagicMock(side_effect=failure_exc)
+    pipe_cfg = [
+        {"name": "I work", "type": "working", "config": {}},
+        {"name": "I don't", "type": "failure", "config": {}},
+    ]
+    with mock.patch(
+        "instructlab.sdg.pipeline._block_types",
+        {
+            "working": working_block,
+            "failure": failure_block,
+        },
+    ):
+        pipe = Pipeline(None, None, pipe_cfg)
+        with pytest.raises(exp_err_type) as exc_ctx:
+            pipe.generate(None)
+
+        assert exc_ctx.value.__cause__ is failure_exc
+        assert (
+            str(exc_ctx.value)
+            == f"BLOCK ERROR [{failure_block.__name__}/{pipe_cfg[1]['name']}]: {failure_exc}"
+        )

From 10955fec1643235088acf30ead25ace78708b82f Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Tue, 16 Jul 2024 13:40:16 -0600
Subject: [PATCH 2/5] BlockNameInErrors: Limit try/except to the generate call

https://github.com/instructlab/sdg/issues/128

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 src/instructlab/sdg/pipeline.py | 76 +++++++++++++++++----------------
 1 file changed, 39 insertions(+), 37 deletions(-)

diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index f05cb660..c74024b8 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -66,45 +66,47 @@ def generate(self, dataset) -> Dataset:
         Generate the dataset by running the pipeline steps.
         dataset: the input dataset
         """
-        try:
-            for block_prop in self.chained_blocks:
-                block_name = block_prop["name"]
-                block_type = _lookup_block_type(block_prop["type"])
-                block_config = block_prop["config"]
-                drop_columns = block_prop.get("drop_columns", [])
-                drop_duplicates_cols = block_prop.get("drop_duplicates", False)
-                block = block_type(self.ctx, self, block_name, **block_config)
-
-                logger.info("Running block: %s", block_name)
-                logger.info(dataset)
-
+        for block_prop in self.chained_blocks:
+            # Parse and instantiate the block
+            block_name = block_prop["name"]
+            block_type = _lookup_block_type(block_prop["type"])
+            block_config = block_prop["config"]
+            drop_columns = block_prop.get("drop_columns", [])
+            drop_duplicates_cols = block_prop.get("drop_duplicates", False)
+            block = block_type(self.ctx, self, block_name, **block_config)
+            logger.info("Running block: %s", block_name)
+            logger.info(dataset)
+
+            # Execute the block and wrap errors with the block name/type
+            try:
                 dataset = block.generate(dataset)
 
-                # If at any point we end up with an empty data set, the pipeline has failed
-                if len(dataset) == 0:
-                    raise EmptyDatasetError(
-                        f"Pipeline stopped: Empty dataset after running block: {block_name}"
-                    )
-
-                drop_columns_in_ds = [
-                    e for e in drop_columns if e in dataset.column_names
-                ]
-                if drop_columns:
-                    dataset = dataset.remove_columns(drop_columns_in_ds)
-
-                if drop_duplicates_cols:
-                    dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
-        except Exception as err:
-            block_exc_err = f"BLOCK ERROR [{block_type.__name__}/{block_name}]: {err}"
-
-            # Try to raise the same exception type. This can fail if the
-            # exception is a non-standard type that has a different init
-            # signature, so fall back to raising a RuntimeError in that case.
-            try:
-                wrapper_err = type(err)(block_exc_err)
-            except TypeError:
-                wrapper_err = RuntimeError(block_exc_err)
-            raise wrapper_err from err
+            except Exception as err:
+                block_exc_err = (
+                    f"BLOCK ERROR [{block_type.__name__}/{block_name}]: {err}"
+                )
+
+                # Try to raise the same exception type. This can fail if the
+                # exception is a non-standard type that has a different init
+                # signature, so fall back to raising a RuntimeError in that case.
+                try:
+                    wrapper_err = type(err)(block_exc_err)
+                except TypeError:
+                    wrapper_err = RuntimeError(block_exc_err)
+                raise wrapper_err from err
+
+            # If at any point we end up with an empty data set, the pipeline has failed
+            if len(dataset) == 0:
+                raise EmptyDatasetError(
+                    f"Pipeline stopped: Empty dataset after running block: {block_name}"
+                )
+
+            drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
+            if drop_columns:
+                dataset = dataset.remove_columns(drop_columns_in_ds)
+
+            if drop_duplicates_cols:
+                dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
 
         return dataset
 

From 95b2ed997a2da0f3af5088ad8619a9cc282e1bdc Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Wed, 17 Jul 2024 06:49:38 -0600
Subject: [PATCH 3/5] BlockNameInErrors: Refactor to use a single
 BlockGenerationError exception

https://github.com/instructlab/sdg/issues/128

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 src/instructlab/sdg/pipeline.py | 41 +++++++++++++++-------
 tests/test_pipeline.py          | 62 ++++++++++++++++++---------------
 2 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index c74024b8..02345107 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -9,6 +9,7 @@
 
 # Local
 from . import filterblock, importblock, llmblock, utilblocks
+from .block import Block
 from .logger_config import setup_logger
 
 logger = setup_logger(__name__)
@@ -32,6 +33,32 @@ def __init__(
         self.num_procs = 8
 
 
+# This is part of the public API.
+class BlockGenerationError(Exception):
+    """A BlockGenerationError occurs when a block generates an exception during
+    generation. It contains information about which block failed and why.
+    """
+
+    def __init__(self, block: Block, exception: Exception):
+        self.block = block
+        self.exception = exception
+
+    def __str__(self) -> str:
+        return f"{self.__class__.__name__}({self.block_type}/{self.block_name}): {self.exception_message}"
+
+    @property
+    def block_name(self) -> str:
+        return self.block.block_name
+
+    @property
+    def block_type(self) -> str:
+        return self.block.__class__.__name__
+
+    @property
+    def exception_message(self) -> str:
+        return str(self.exception)
+
+
 # This is part of the public API.
 class Pipeline:
     def __init__(self, ctx, config_path, chained_blocks: list) -> None:
@@ -80,20 +107,8 @@ def generate(self, dataset) -> Dataset:
             # Execute the block and wrap errors with the block name/type
             try:
                 dataset = block.generate(dataset)
-
             except Exception as err:
-                block_exc_err = (
-                    f"BLOCK ERROR [{block_type.__name__}/{block_name}]: {err}"
-                )
-
-                # Try to raise the same exception type. This can fail if the
-                # exception is a non-standard type that has a different init
-                # signature, so fall back to raising a RuntimeError in that case.
-                try:
-                    wrapper_err = type(err)(block_exc_err)
-                except TypeError:
-                    wrapper_err = RuntimeError(block_exc_err)
-                raise wrapper_err from err
+                raise BlockGenerationError(block=block, exception=err) from err
 
             # If at any point we end up with an empty data set, the pipeline has failed
             if len(dataset) == 0:
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 803b2879..79a93d6c 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -6,41 +6,24 @@
 from unittest import mock
 
 # Third Party
+from datasets import Dataset
 import pytest
 
 # First Party
-from instructlab.sdg.pipeline import Pipeline
+from instructlab.sdg.block import Block
+from instructlab.sdg.pipeline import BlockGenerationError, Pipeline
 
-## Helpers ##
 
-
-class CustomTypeError(TypeError):
-    pass
-
-
-class NoArgError(RuntimeError):
-    """Exception that can't be instantiated with a single argument"""
-
-    def __init__(self):
-        super().__init__("no args")
-
-
-@pytest.mark.parametrize(
-    ["failure_exc", "exp_err_type"],
-    [
-        (CustomTypeError("Oh no!"), CustomTypeError),
-        (NoArgError(), RuntimeError),
-    ],
-)
-def test_pipeline_named_errors_match_type(failure_exc, exp_err_type):
-    """Validate that block types and names appear in the error message from a
-    pipeline exception and that the type of the error is preserved.
+def test_pipeline_named_errors_match_type():
+    """Validate that a BlockGenerationError is raised to wrap exceptions raised
+    in a Block's generate method
     """
     mock_dataset = ["not empty"]
     working_block = mock.MagicMock()
     working_block().generate.return_value = mock_dataset
     failure_block = mock.MagicMock()
     failure_block.__name__ = "BadBlock"
+    failure_exc = RuntimeError("Oh no!")
     failure_block().generate = mock.MagicMock(side_effect=failure_exc)
     pipe_cfg = [
         {"name": "I work", "type": "working", "config": {}},
@@ -54,11 +37,32 @@ def test_pipeline_named_errors_match_type(failure_exc, exp_err_type):
         },
     ):
         pipe = Pipeline(None, None, pipe_cfg)
-        with pytest.raises(exp_err_type) as exc_ctx:
+        with pytest.raises(BlockGenerationError) as exc_ctx:
             pipe.generate(None)
 
         assert exc_ctx.value.__cause__ is failure_exc
-        assert (
-            str(exc_ctx.value)
-            == f"BLOCK ERROR [{failure_block.__name__}/{pipe_cfg[1]['name']}]: {failure_exc}"
-        )
+        assert exc_ctx.value.exception is failure_exc
+        assert exc_ctx.value.block is failure_block()
+
+
+def test_block_generation_error_properties():
+    """Make sure the BlockGenerationError exposes its properties and string form
+    correctly
+    """
+
+    class TestBlock(Block):
+        def generate(self, dataset: Dataset) -> Dataset:
+            return dataset
+
+    block_name = "my-block"
+    block = TestBlock(None, None, block_name)
+    inner_err = TypeError("Not the right type")
+    gen_err = BlockGenerationError(block, inner_err)
+    assert gen_err.block is block
+    assert gen_err.exception is inner_err
+    assert gen_err.block_name is block_name
+    assert gen_err.block_type == TestBlock.__name__
+    assert (
+        str(gen_err)
+        == f"{BlockGenerationError.__name__}({TestBlock.__name__}/{block_name}): {inner_err}"
+    )

From aa9fb2a3c68f13051a600a9061a5a84daaf4d84f Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Wed, 17 Jul 2024 11:29:06 -0600
Subject: [PATCH 4/5] BlockNameInErrors: Capture block instantiation errors as
 well as generation

https://github.com/instructlab/sdg/issues/128

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 src/instructlab/sdg/pipeline.py | 57 +++++++++++++++++++--------------
 tests/test_pipeline.py          | 51 ++++++++++++++++++++++++-----
 2 files changed, 76 insertions(+), 32 deletions(-)

diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 02345107..8b006435 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Standard
 from importlib import resources
+from typing import Optional
 import os.path
 
 # Third Party
@@ -34,26 +35,27 @@ def __init__(
 
 
 # This is part of the public API.
-class BlockGenerationError(Exception):
-    """A BlockGenerationError occurs when a block generates an exception during
+class PipelineBlockError(Exception):
+    """A PipelineBlockError occurs when a block generates an exception during
     generation. It contains information about which block failed and why.
     """
 
-    def __init__(self, block: Block, exception: Exception):
-        self.block = block
+    def __init__(
+        self,
+        exception: Exception,
+        *,
+        block: Optional[Block] = None,
+        block_name: Optional[str] = None,
+        block_type: Optional[str] = None,
+    ):
         self.exception = exception
+        self.block = block
+        self.block_name = block_name or (block.block_name if block else None)
+        self.block_type = block_type or (block.__class__.__name__ if block else None)
 
     def __str__(self) -> str:
         return f"{self.__class__.__name__}({self.block_type}/{self.block_name}): {self.exception_message}"
 
-    @property
-    def block_name(self) -> str:
-        return self.block.block_name
-
-    @property
-    def block_type(self) -> str:
-        return self.block.__class__.__name__
-
     @property
     def exception_message(self) -> str:
         return str(self.exception)
@@ -94,21 +96,28 @@ def generate(self, dataset) -> Dataset:
         dataset: the input dataset
         """
         for block_prop in self.chained_blocks:
-            # Parse and instantiate the block
-            block_name = block_prop["name"]
-            block_type = _lookup_block_type(block_prop["type"])
-            block_config = block_prop["config"]
-            drop_columns = block_prop.get("drop_columns", [])
-            drop_duplicates_cols = block_prop.get("drop_duplicates", False)
-            block = block_type(self.ctx, self, block_name, **block_config)
-            logger.info("Running block: %s", block_name)
-            logger.info(dataset)
-
-            # Execute the block and wrap errors with the block name/type
+            # Initialize arguments for error handling to None
+            block, block_name, block_type = None, None, None
             try:
+                # Parse and instantiate the block
+                block_name = block_prop["name"]
+                block_type = _lookup_block_type(block_prop["type"])
+                block_config = block_prop["config"]
+                drop_columns = block_prop.get("drop_columns", [])
+                drop_duplicates_cols = block_prop.get("drop_duplicates", False)
+                block = block_type(self.ctx, self, block_name, **block_config)
+                logger.info("Running block: %s", block_name)
+                logger.info(dataset)
+
+                # Execute the block and wrap errors with the block name/type
                 dataset = block.generate(dataset)
             except Exception as err:
-                raise BlockGenerationError(block=block, exception=err) from err
+                raise PipelineBlockError(
+                    exception=err,
+                    block=block,
+                    block_name=block_name,
+                    block_type=block_type,
+                ) from err
 
             # If at any point we end up with an empty data set, the pipeline has failed
             if len(dataset) == 0:
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 79a93d6c..443eda26 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -11,11 +11,11 @@
 
 # First Party
 from instructlab.sdg.block import Block
-from instructlab.sdg.pipeline import BlockGenerationError, Pipeline
+from instructlab.sdg.pipeline import Pipeline, PipelineBlockError
 
 
 def test_pipeline_named_errors_match_type():
-    """Validate that a BlockGenerationError is raised to wrap exceptions raised
+    """Validate that a PipelineBlockError is raised to wrap exceptions raised
     in a Block's generate method
     """
     mock_dataset = ["not empty"]
@@ -37,7 +37,7 @@ def test_pipeline_named_errors_match_type():
         },
     ):
         pipe = Pipeline(None, None, pipe_cfg)
-        with pytest.raises(BlockGenerationError) as exc_ctx:
+        with pytest.raises(PipelineBlockError) as exc_ctx:
             pipe.generate(None)
 
         assert exc_ctx.value.__cause__ is failure_exc
@@ -45,9 +45,24 @@ def test_pipeline_named_errors_match_type():
         assert exc_ctx.value.block is failure_block()
 
 
-def test_block_generation_error_properties():
-    """Make sure the BlockGenerationError exposes its properties and string form
-    correctly
+def test_pipeline_config_error_handling():
+    """Validate that a PipelineBlockError is raised when block config is
+    incorrect
+    """
+    pipe_cfg = [
+        {"name_not_there": "I work", "type": "working", "config": {}},
+        {"name": "I don't", "type": "failure", "config": {}},
+    ]
+    pipe = Pipeline(None, None, pipe_cfg)
+    with pytest.raises(PipelineBlockError) as exc_ctx:
+        pipe.generate(None)
+
+    assert isinstance(exc_ctx.value.__cause__, KeyError)
+
+
+def test_block_generation_error_properties_from_block():
+    """Make sure the PipelineBlockError exposes its properties and string form
+    correctly when pulled from a Block instance
     """
 
     class TestBlock(Block):
@@ -57,12 +72,32 @@ def generate(self, dataset: Dataset) -> Dataset:
     block_name = "my-block"
     block = TestBlock(None, None, block_name)
     inner_err = TypeError("Not the right type")
-    gen_err = BlockGenerationError(block, inner_err)
+    gen_err = PipelineBlockError(inner_err, block=block)
     assert gen_err.block is block
     assert gen_err.exception is inner_err
     assert gen_err.block_name is block_name
     assert gen_err.block_type == TestBlock.__name__
     assert (
         str(gen_err)
-        == f"{BlockGenerationError.__name__}({TestBlock.__name__}/{block_name}): {inner_err}"
+        == f"{PipelineBlockError.__name__}({TestBlock.__name__}/{block_name}): {inner_err}"
+    )
+
+
+def test_block_generation_error_properties_from_strings():
+    """Make sure the PipelineBlockError exposes its properties and string form
+    correctly when pulled from strings
+    """
+    inner_err = TypeError("Not the right type")
+    block_name = "my-block"
+    block_type = "TestBlock"
+    gen_err = PipelineBlockError(
+        inner_err, block_name=block_name, block_type=block_type
+    )
+    assert gen_err.block is None
+    assert gen_err.exception is inner_err
+    assert gen_err.block_name is block_name
+    assert gen_err.block_type == block_type
+    assert (
+        str(gen_err)
+        == f"{PipelineBlockError.__name__}({block_type}/{block_name}): {inner_err}"
     )

From ceb5a82211dcd369c52c14be6ca0857363733079 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Wed, 17 Jul 2024 11:29:23 -0600
Subject: [PATCH 5/5] BlockNameInErrors: Expose PipelineBlockError at the top
 of the library

https://github.com/instructlab/sdg/issues/128

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 src/instructlab/sdg/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py
index 348e26a4..06df1200 100644
--- a/src/instructlab/sdg/__init__.py
+++ b/src/instructlab/sdg/__init__.py
@@ -12,6 +12,7 @@
     "ImportBlock",
     "LLMBlock",
     "Pipeline",
+    "PipelineBlockError",
     "PipelineConfigParserError",
     "PipelineContext",
     "SamplePopulatorBlock",
@@ -33,6 +34,7 @@
     SIMPLE_PIPELINES_PACKAGE,
     EmptyDatasetError,
     Pipeline,
+    PipelineBlockError,
     PipelineConfigParserError,
     PipelineContext,
 )