From 1cfc8af69214c023a6d3da7dde59d09301c520a3 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 16 Jul 2024 11:56:30 -0600 Subject: [PATCH 1/5] BlockNameInErrors: Add exception handling in Pipeline with block name/type wrapping https://github.com/instructlab/sdg/issues/128 Signed-off-by: Gabe Goodhart --- src/instructlab/sdg/pipeline.py | 64 ++++++++++++++++++++------------- tests/test_pipeline.py | 64 +++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 25 deletions(-) create mode 100644 tests/test_pipeline.py diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 05044dc0..f05cb660 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -66,31 +66,45 @@ def generate(self, dataset) -> Dataset: Generate the dataset by running the pipeline steps. dataset: the input dataset """ - for block_prop in self.chained_blocks: - block_name = block_prop["name"] - block_type = _lookup_block_type(block_prop["type"]) - block_config = block_prop["config"] - drop_columns = block_prop.get("drop_columns", []) - drop_duplicates_cols = block_prop.get("drop_duplicates", False) - block = block_type(self.ctx, self, block_name, **block_config) - - logger.info("Running block: %s", block_name) - logger.info(dataset) - - dataset = block.generate(dataset) - - # If at any point we end up with an empty data set, the pipeline has failed - if len(dataset) == 0: - raise EmptyDatasetError( - f"Pipeline stopped: Empty dataset after running block: {block_name}" - ) - - drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names] - if drop_columns: - dataset = dataset.remove_columns(drop_columns_in_ds) - - if drop_duplicates_cols: - dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols) + try: + for block_prop in self.chained_blocks: + block_name = block_prop["name"] + block_type = _lookup_block_type(block_prop["type"]) + block_config = block_prop["config"] + drop_columns = block_prop.get("drop_columns", []) + drop_duplicates_cols = block_prop.get("drop_duplicates", False) + block = block_type(self.ctx, self, block_name, **block_config) + + logger.info("Running block: %s", block_name) + logger.info(dataset) + + dataset = block.generate(dataset) + + # If at any point we end up with an empty data set, the pipeline has failed + if len(dataset) == 0: + raise EmptyDatasetError( + f"Pipeline stopped: Empty dataset after running block: {block_name}" + ) + + drop_columns_in_ds = [ + e for e in drop_columns if e in dataset.column_names + ] + if drop_columns: + dataset = dataset.remove_columns(drop_columns_in_ds) + + if drop_duplicates_cols: + dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols) + except Exception as err: + block_exc_err = f"BLOCK ERROR [{block_type.__name__}/{block_name}]: {err}" + + # Try to raise the same exception type. This can fail if the + # exception is a non-standard type that has a different init + # signature, so fall back to raising a RuntimeError in that case. + try: + wrapper_err = type(err)(block_exc_err) + except TypeError: + wrapper_err = RuntimeError(block_exc_err) + raise wrapper_err from err return dataset diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 00000000..803b2879 --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,64 @@ +""" +Unit tests for common Pipeline functionality +""" + +# Standard +from unittest import mock + +# Third Party +import pytest + +# First Party +from instructlab.sdg.pipeline import Pipeline + +## Helpers ## + + +class CustomTypeError(TypeError): + pass + + +class NoArgError(RuntimeError): + """Exception that can't be instantiated with a single argument""" + + def __init__(self): + super().__init__("no args") + + +@pytest.mark.parametrize( + ["failure_exc", "exp_err_type"], + [ + (CustomTypeError("Oh no!"), CustomTypeError), + (NoArgError(), RuntimeError), + ], +) +def test_pipeline_named_errors_match_type(failure_exc, exp_err_type): + """Validate that block types and names appear in the error message from a + pipeline exception and that the type of the error is preserved. + """ + mock_dataset = ["not empty"] + working_block = mock.MagicMock() + working_block().generate.return_value = mock_dataset + failure_block = mock.MagicMock() + failure_block.__name__ = "BadBlock" + failure_block().generate = mock.MagicMock(side_effect=failure_exc) + pipe_cfg = [ + {"name": "I work", "type": "working", "config": {}}, + {"name": "I don't", "type": "failure", "config": {}}, + ] + with mock.patch( + "instructlab.sdg.pipeline._block_types", + { + "working": working_block, + "failure": failure_block, + }, + ): + pipe = Pipeline(None, None, pipe_cfg) + with pytest.raises(exp_err_type) as exc_ctx: + pipe.generate(None) + + assert exc_ctx.value.__cause__ is failure_exc + assert ( + str(exc_ctx.value) + == f"BLOCK ERROR [{failure_block.__name__}/{pipe_cfg[1]['name']}]: {failure_exc}" + ) From 10955fec1643235088acf30ead25ace78708b82f Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 16 Jul 2024 13:40:16 -0600 Subject: [PATCH 2/5] BlockNameInErrors: Limit try/except to the generate call https://github.com/instructlab/sdg/issues/128 Signed-off-by: Gabe Goodhart --- src/instructlab/sdg/pipeline.py | 76 +++++++++++++++++---------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index f05cb660..c74024b8 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -66,45 +66,47 @@ def generate(self, dataset) -> Dataset: Generate the dataset by running the pipeline steps. dataset: the input dataset """ - try: - for block_prop in self.chained_blocks: - block_name = block_prop["name"] - block_type = _lookup_block_type(block_prop["type"]) - block_config = block_prop["config"] - drop_columns = block_prop.get("drop_columns", []) - drop_duplicates_cols = block_prop.get("drop_duplicates", False) - block = block_type(self.ctx, self, block_name, **block_config) - - logger.info("Running block: %s", block_name) - logger.info(dataset) - + for block_prop in self.chained_blocks: + # Parse and instantiate the block + block_name = block_prop["name"] + block_type = _lookup_block_type(block_prop["type"]) + block_config = block_prop["config"] + drop_columns = block_prop.get("drop_columns", []) + drop_duplicates_cols = block_prop.get("drop_duplicates", False) + block = block_type(self.ctx, self, block_name, **block_config) + logger.info("Running block: %s", block_name) + logger.info(dataset) + + # Execute the block and wrap errors with the block name/type + try: dataset = block.generate(dataset) - # If at any point we end up with an empty data set, the pipeline has failed - if len(dataset) == 0: - raise EmptyDatasetError( - f"Pipeline stopped: Empty dataset after running block: {block_name}" - ) - - drop_columns_in_ds = [ - e for e in drop_columns if e in dataset.column_names - ] - if drop_columns: - dataset = dataset.remove_columns(drop_columns_in_ds) - - if drop_duplicates_cols: - dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols) - except Exception as err: - block_exc_err = f"BLOCK ERROR [{block_type.__name__}/{block_name}]: {err}" - - # Try to raise the same exception type. This can fail if the - # exception is a non-standard type that has a different init - # signature, so fall back to raising a RuntimeError in that case. - try: - wrapper_err = type(err)(block_exc_err) - except TypeError: - wrapper_err = RuntimeError(block_exc_err) - raise wrapper_err from err + except Exception as err: + block_exc_err = ( + f"BLOCK ERROR [{block_type.__name__}/{block_name}]: {err}" + ) + + # Try to raise the same exception type. This can fail if the + # exception is a non-standard type that has a different init + # signature, so fall back to raising a RuntimeError in that case. + try: + wrapper_err = type(err)(block_exc_err) + except TypeError: + wrapper_err = RuntimeError(block_exc_err) + raise wrapper_err from err + + # If at any point we end up with an empty data set, the pipeline has failed + if len(dataset) == 0: + raise EmptyDatasetError( + f"Pipeline stopped: Empty dataset after running block: {block_name}" + ) + + drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names] + if drop_columns: + dataset = dataset.remove_columns(drop_columns_in_ds) + + if drop_duplicates_cols: + dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols) return dataset From 95b2ed997a2da0f3af5088ad8619a9cc282e1bdc Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 17 Jul 2024 06:49:38 -0600 Subject: [PATCH 3/5] BlockNameInErrors: Refactor to use a single BlockGenerationError exception https://github.com/instructlab/sdg/issues/128 Signed-off-by: Gabe Goodhart --- src/instructlab/sdg/pipeline.py | 41 +++++++++++++++------- tests/test_pipeline.py | 62 ++++++++++++++++++--------------- 2 files changed, 61 insertions(+), 42 deletions(-) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index c74024b8..02345107 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -9,6 +9,7 @@ # Local from . import filterblock, importblock, llmblock, utilblocks +from .block import Block from .logger_config import setup_logger logger = setup_logger(__name__) @@ -32,6 +33,32 @@ def __init__( self.num_procs = 8 +# This is part of the public API. +class BlockGenerationError(Exception): + """A BlockGenerationError occurs when a block generates an exception during + generation. It contains information about which block failed and why. + """ + + def __init__(self, block: Block, exception: Exception): + self.block = block + self.exception = exception + + def __str__(self) -> str: + return f"{self.__class__.__name__}({self.block_type}/{self.block_name}): {self.exception_message}" + + @property + def block_name(self) -> str: + return self.block.block_name + + @property + def block_type(self) -> str: + return self.block.__class__.__name__ + + @property + def exception_message(self) -> str: + return str(self.exception) + + # This is part of the public API. class Pipeline: def __init__(self, ctx, config_path, chained_blocks: list) -> None: @@ -80,20 +107,8 @@ def generate(self, dataset) -> Dataset: # Execute the block and wrap errors with the block name/type try: dataset = block.generate(dataset) - except Exception as err: - block_exc_err = ( - f"BLOCK ERROR [{block_type.__name__}/{block_name}]: {err}" - ) - - # Try to raise the same exception type. This can fail if the - # exception is a non-standard type that has a different init - # signature, so fall back to raising a RuntimeError in that case. - try: - wrapper_err = type(err)(block_exc_err) - except TypeError: - wrapper_err = RuntimeError(block_exc_err) - raise wrapper_err from err + raise BlockGenerationError(block=block, exception=err) from err # If at any point we end up with an empty data set, the pipeline has failed if len(dataset) == 0: diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 803b2879..79a93d6c 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -6,41 +6,24 @@ from unittest import mock # Third Party +from datasets import Dataset import pytest # First Party -from instructlab.sdg.pipeline import Pipeline +from instructlab.sdg.block import Block +from instructlab.sdg.pipeline import BlockGenerationError, Pipeline -## Helpers ## - -class CustomTypeError(TypeError): - pass - - -class NoArgError(RuntimeError): - """Exception that can't be instantiated with a single argument""" - - def __init__(self): - super().__init__("no args") - - -@pytest.mark.parametrize( - ["failure_exc", "exp_err_type"], - [ - (CustomTypeError("Oh no!"), CustomTypeError), - (NoArgError(), RuntimeError), - ], -) -def test_pipeline_named_errors_match_type(failure_exc, exp_err_type): - """Validate that block types and names appear in the error message from a - pipeline exception and that the type of the error is preserved. +def test_pipeline_named_errors_match_type(): + """Validate that a BlockGenerationError is raised to wrap exceptions raised + in a Block's generate method """ mock_dataset = ["not empty"] working_block = mock.MagicMock() working_block().generate.return_value = mock_dataset failure_block = mock.MagicMock() failure_block.__name__ = "BadBlock" + failure_exc = RuntimeError("Oh no!") failure_block().generate = mock.MagicMock(side_effect=failure_exc) pipe_cfg = [ {"name": "I work", "type": "working", "config": {}}, @@ -54,11 +37,32 @@ def test_pipeline_named_errors_match_type(failure_exc, exp_err_type): }, ): pipe = Pipeline(None, None, pipe_cfg) - with pytest.raises(exp_err_type) as exc_ctx: + with pytest.raises(BlockGenerationError) as exc_ctx: pipe.generate(None) assert exc_ctx.value.__cause__ is failure_exc - assert ( - str(exc_ctx.value) - == f"BLOCK ERROR [{failure_block.__name__}/{pipe_cfg[1]['name']}]: {failure_exc}" - ) + assert exc_ctx.value.exception is failure_exc + assert exc_ctx.value.block is failure_block() + + +def test_block_generation_error_properties(): + """Make sure the BlockGenerationError exposes its properties and string form + correctly + """ + + class TestBlock(Block): + def generate(self, dataset: Dataset) -> Dataset: + return dataset + + block_name = "my-block" + block = TestBlock(None, None, block_name) + inner_err = TypeError("Not the right type") + gen_err = BlockGenerationError(block, inner_err) + assert gen_err.block is block + assert gen_err.exception is inner_err + assert gen_err.block_name is block_name + assert gen_err.block_type == TestBlock.__name__ + assert ( + str(gen_err) + == f"{BlockGenerationError.__name__}({TestBlock.__name__}/{block_name}): {inner_err}" + ) From aa9fb2a3c68f13051a600a9061a5a84daaf4d84f Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 17 Jul 2024 11:29:06 -0600 Subject: [PATCH 4/5] BlockNameInErrors: Capture block instantiation errors as well as generation https://github.com/instructlab/sdg/issues/128 Signed-off-by: Gabe Goodhart --- src/instructlab/sdg/pipeline.py | 57 +++++++++++++++++++-------------- tests/test_pipeline.py | 51 ++++++++++++++++++++++++----- 2 files changed, 76 insertions(+), 32 deletions(-) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 02345107..8b006435 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Standard from importlib import resources +from typing import Optional import os.path # Third Party @@ -34,26 +35,27 @@ def __init__( # This is part of the public API. -class BlockGenerationError(Exception): - """A BlockGenerationError occurs when a block generates an exception during +class PipelineBlockError(Exception): + """A PipelineBlockError occurs when a block generates an exception during generation. It contains information about which block failed and why. """ - def __init__(self, block: Block, exception: Exception): - self.block = block + def __init__( + self, + exception: Exception, + *, + block: Optional[Block] = None, + block_name: Optional[str] = None, + block_type: Optional[str] = None, + ): self.exception = exception + self.block = block + self.block_name = block_name or (block.block_name if block else None) + self.block_type = block_type or (block.__class__.__name__ if block else None) def __str__(self) -> str: return f"{self.__class__.__name__}({self.block_type}/{self.block_name}): {self.exception_message}" - @property - def block_name(self) -> str: - return self.block.block_name - - @property - def block_type(self) -> str: - return self.block.__class__.__name__ - @property def exception_message(self) -> str: return str(self.exception) @@ -94,21 +96,28 @@ def generate(self, dataset) -> Dataset: dataset: the input dataset """ for block_prop in self.chained_blocks: - # Parse and instantiate the block - block_name = block_prop["name"] - block_type = _lookup_block_type(block_prop["type"]) - block_config = block_prop["config"] - drop_columns = block_prop.get("drop_columns", []) - drop_duplicates_cols = block_prop.get("drop_duplicates", False) - block = block_type(self.ctx, self, block_name, **block_config) - logger.info("Running block: %s", block_name) - logger.info(dataset) - - # Execute the block and wrap errors with the block name/type + # Initialize arguments for error handling to None + block, block_name, block_type = None, None, None try: + # Parse and instantiate the block + block_name = block_prop["name"] + block_type = _lookup_block_type(block_prop["type"]) + block_config = block_prop["config"] + drop_columns = block_prop.get("drop_columns", []) + drop_duplicates_cols = block_prop.get("drop_duplicates", False) + block = block_type(self.ctx, self, block_name, **block_config) + logger.info("Running block: %s", block_name) + logger.info(dataset) + + # Execute the block and wrap errors with the block name/type dataset = block.generate(dataset) except Exception as err: - raise BlockGenerationError(block=block, exception=err) from err + raise PipelineBlockError( + exception=err, + block=block, + block_name=block_name, + block_type=block_type, + ) from err # If at any point we end up with an empty data set, the pipeline has failed if len(dataset) == 0: diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 79a93d6c..443eda26 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -11,11 +11,11 @@ # First Party from instructlab.sdg.block import Block -from instructlab.sdg.pipeline import BlockGenerationError, Pipeline +from instructlab.sdg.pipeline import Pipeline, PipelineBlockError def test_pipeline_named_errors_match_type(): - """Validate that a BlockGenerationError is raised to wrap exceptions raised + """Validate that a PipelineBlockError is raised to wrap exceptions raised in a Block's generate method """ mock_dataset = ["not empty"] @@ -37,7 +37,7 @@ def test_pipeline_named_errors_match_type(): }, ): pipe = Pipeline(None, None, pipe_cfg) - with pytest.raises(BlockGenerationError) as exc_ctx: + with pytest.raises(PipelineBlockError) as exc_ctx: pipe.generate(None) assert exc_ctx.value.__cause__ is failure_exc @@ -45,9 +45,24 @@ def test_pipeline_named_errors_match_type(): assert exc_ctx.value.block is failure_block() -def test_block_generation_error_properties(): - """Make sure the BlockGenerationError exposes its properties and string form - correctly +def test_pipeline_config_error_handling(): + """Validate that a PipelineBlockError is raised when block config is + incorrect + """ + pipe_cfg = [ + {"name_not_there": "I work", "type": "working", "config": {}}, + {"name": "I don't", "type": "failure", "config": {}}, + ] + pipe = Pipeline(None, None, pipe_cfg) + with pytest.raises(PipelineBlockError) as exc_ctx: + pipe.generate(None) + + assert isinstance(exc_ctx.value.__cause__, KeyError) + + +def test_block_generation_error_properties_from_block(): + """Make sure the PipelineBlockError exposes its properties and string form + correctly when pulled from a Block instance """ class TestBlock(Block): @@ -57,12 +72,32 @@ def generate(self, dataset: Dataset) -> Dataset: block_name = "my-block" block = TestBlock(None, None, block_name) inner_err = TypeError("Not the right type") - gen_err = BlockGenerationError(block, inner_err) + gen_err = PipelineBlockError(inner_err, block=block) assert gen_err.block is block assert gen_err.exception is inner_err assert gen_err.block_name is block_name assert gen_err.block_type == TestBlock.__name__ assert ( str(gen_err) - == f"{BlockGenerationError.__name__}({TestBlock.__name__}/{block_name}): {inner_err}" + == f"{PipelineBlockError.__name__}({TestBlock.__name__}/{block_name}): {inner_err}" + ) + + +def test_block_generation_error_properties_from_strings(): + """Make sure the PipelineBlockError exposes its properties and string form + correctly when pulled from strings + """ + inner_err = TypeError("Not the right type") + block_name = "my-block" + block_type = "TestBlock" + gen_err = PipelineBlockError( + inner_err, block_name=block_name, block_type=block_type + ) + assert gen_err.block is None + assert gen_err.exception is inner_err + assert gen_err.block_name is block_name + assert gen_err.block_type == block_type + assert ( + str(gen_err) + == f"{PipelineBlockError.__name__}({block_type}/{block_name}): {inner_err}" ) From ceb5a82211dcd369c52c14be6ca0857363733079 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 17 Jul 2024 11:29:23 -0600 Subject: [PATCH 5/5] BlockNameInErrors: Expose PipelineBlockError at the top of the library https://github.com/instructlab/sdg/issues/128 Signed-off-by: Gabe Goodhart --- src/instructlab/sdg/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py index 348e26a4..06df1200 100644 --- a/src/instructlab/sdg/__init__.py +++ b/src/instructlab/sdg/__init__.py @@ -12,6 +12,7 @@ "ImportBlock", "LLMBlock", "Pipeline", + "PipelineBlockError", "PipelineConfigParserError", "PipelineContext", "SamplePopulatorBlock", @@ -33,6 +34,7 @@ SIMPLE_PIPELINES_PACKAGE, EmptyDatasetError, Pipeline, + PipelineBlockError, PipelineConfigParserError, PipelineContext, )