diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py index a28136c4..75b0a4e8 100644 --- a/src/instructlab/sdg/block.py +++ b/src/instructlab/sdg/block.py @@ -15,8 +15,9 @@ class Block(ABC): - def __init__(self, ctx, block_name: str) -> None: + def __init__(self, ctx, pipe, block_name: str) -> None: self.ctx = ctx + self.pipe = pipe self.block_name = block_name @staticmethod @@ -50,6 +51,8 @@ def _load_config(self, config_path: str) -> Union[Dict[str, Any], None]: :return: The loaded configuration. """ if not os.path.isabs(config_path): - config_path = os.path.join(self.ctx.sdg_base, config_path) + config_path = os.path.join( + os.path.dirname(self.pipe.config_path), config_path + ) with open(config_path, "r", encoding="utf-8") as config_file: return yaml.safe_load(config_file) diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index d43a597f..3cc7b427 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -77,6 +77,7 @@ class FilterByValueBlock(Block): def __init__( self, ctx, + pipe, block_name, filter_column, filter_value, @@ -88,6 +89,7 @@ def __init__( Parameters: - ctx (PipelineContext): A PipelineContext object containing runtime parameters. + - pipe (Pipeline): The Pipeline containing this block in its chain. - block_name (str): An identifier for this block. - filter_column (str): The name of the column in the dataset to apply the filter on. - filter_value (any or list of any): The value(s) to filter by. @@ -147,7 +149,7 @@ def __init__( - This block will filter the dataset to only include rows where the "full_name" column contains the substring "John" or "Jane". """ - super().__init__(ctx, block_name) + super().__init__(ctx, pipe, block_name) self.value = filter_value if isinstance(filter_value, list) else [filter_value] self.column_name = filter_column self.operation = _get_operator_func(operation) diff --git a/src/instructlab/sdg/importblock.py b/src/instructlab/sdg/importblock.py index 129311cb..5fa479b8 100644 --- a/src/instructlab/sdg/importblock.py +++ b/src/instructlab/sdg/importblock.py @@ -14,6 +14,7 @@ class ImportBlock(Block): def __init__( self, ctx, + pipe, block_name, path, ) -> None: @@ -22,10 +23,11 @@ def __init__( Parameters: - ctx (PipelineContext): A PipelineContext object containing runtime parameters. + - pipe (Pipeline): The Pipeline containing this block in its chain. - block_name (str): An identifier for this block. - path (str): A path (absolute, or relative to the instructlab.sdg package) to a pipeline config file. """ - super().__init__(ctx, block_name) + super().__init__(ctx, pipe, block_name) self.path = path self.pipeline = pipeline.Pipeline.from_file(self.ctx, self.path) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index d090e2b4..3f4d32f4 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -56,13 +56,14 @@ class LLMBlock(Block): def __init__( self, ctx, + pipe, block_name, config_path, output_cols, parser_kwargs={}, batch_kwargs={}, ) -> None: - super().__init__(ctx, block_name) + super().__init__(ctx, pipe, block_name) self.block_config = self._load_config(config_path) self.prompt_struct = ( """{system}\n{introduction}\n{principles}\n{examples}\n{generation}""" @@ -215,6 +216,7 @@ class ConditionalLLMBlock(LLMBlock): def __init__( self, ctx, + pipe, block_name, config_paths, output_cols, @@ -224,6 +226,7 @@ def __init__( ) -> None: super().__init__( ctx, + pipe, block_name, config_paths[0][0], output_cols, diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index bea672e1..3ee08306 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -22,27 +22,28 @@ def __init__( self.model_family = model_family self.model_id = model_id self.num_instructions_to_generate = num_instructions_to_generate - self.sdg_base = resources.files(__package__) # FIXME: base this on the available number of CPUs self.num_procs = 8 class Pipeline: - def __init__(self, ctx, chained_blocks: list) -> None: + def __init__(self, ctx, config_path, chained_blocks: list) -> None: """ Initialize the Pipeline class with a configuration dictionary. config_dict: the run config py or yaml loaded into a dictionary """ # ctx is a PipelineContext object that supplies context configuration to every block self.ctx = ctx + # config_path is the path of the pipeline config file used to create this pipeline + self.config_path = config_path # pipeline config is the run configuration that consists of the pipeline steps self.chained_blocks = chained_blocks @classmethod def from_file(cls, ctx, pipeline_yaml): if not os.path.isabs(pipeline_yaml): - pipeline_yaml = os.path.join(ctx.sdg_base, pipeline_yaml) - return cls(ctx, _parse_pipeline_config_file(pipeline_yaml)) + pipeline_yaml = os.path.join(resources.files(__package__), pipeline_yaml) + return cls(ctx, pipeline_yaml, _parse_pipeline_config_file(pipeline_yaml)) def _drop_duplicates(self, dataset, cols): """ @@ -64,7 +65,7 @@ def generate(self, dataset) -> Dataset: drop_columns = block_prop.get("drop_columns", []) gen_kwargs = block_prop.get("gen_kwargs", {}) drop_duplicates_cols = block_prop.get("drop_duplicates", False) - block = block_type(self.ctx, block_name, **block_config) + block = block_type(self.ctx, self, block_name, **block_config) logger.info("Running block: %s", block_name) logger.info(dataset) diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml index 7d8d68ca..e14c059a 100644 --- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_questions type: LLMBlock config: - config_path: configs/skills/freeform_questions.yaml + config_path: ../../configs/skills/freeform_questions.yaml output_cols: - question batch_kwargs: @@ -13,7 +13,7 @@ blocks: - name: eval_questions type: LLMBlock config: - config_path: configs/skills/evaluate_freeform_questions.yaml + config_path: ../../configs/skills/evaluate_freeform_questions.yaml output_cols: - evaluation - score @@ -31,13 +31,13 @@ blocks: - name: gen_responses type: LLMBlock config: - config_path: configs/skills/freeform_responses.yaml + config_path: ../../configs/skills/freeform_responses.yaml output_cols: - response - name: evaluate_qa_pair type: LLMBlock config: - config_path: configs/skills/evaluate_freeform_pair.yaml + config_path: ../../configs/skills/evaluate_freeform_pair.yaml output_cols: - evaluation - score diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml index c051433c..8fad3b83 100644 --- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_contexts type: LLMBlock config: - config_path: configs/skills/contexts.yaml + config_path: ../../configs/skills/contexts.yaml output_cols: - context gen_kwargs: @@ -15,7 +15,7 @@ blocks: - name: gen_grounded_questions type: LLMBlock config: - config_path: configs/skills/grounded_questions.yaml + config_path: ../../configs/skills/grounded_questions.yaml output_cols: - question batch_kwargs: @@ -25,7 +25,7 @@ blocks: - name: eval_grounded_questions type: LLMBlock config: - config_path: configs/skills/evaluate_grounded_questions.yaml + config_path: ../../configs/skills/evaluate_grounded_questions.yaml output_cols: - evaluation - score @@ -43,13 +43,13 @@ blocks: - name: gen_grounded_responses type: LLMBlock config: - config_path: configs/skills/grounded_responses.yaml + config_path: ../../configs/skills/grounded_responses.yaml output_cols: - response - name: evaluate_grounded_qa_pair type: LLMBlock config: - config_path: configs/skills/evaluate_grounded_pair.yaml + config_path: ../../configs/skills/evaluate_grounded_pair.yaml output_cols: - evaluation - score diff --git a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml index a1ef7ecb..21802921 100644 --- a/src/instructlab/sdg/pipelines/full/knowledge.yaml +++ b/src/instructlab/sdg/pipelines/full/knowledge.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_mmlu_knowledge type: LLMBlock config: - config_path: configs/knowledge/mcq_generation.yaml + config_path: ../../configs/knowledge/mcq_generation.yaml output_cols: - mmlubench_question - mmlubench_answer @@ -15,7 +15,7 @@ blocks: - name: gen_knowledge type: LLMBlock config: - config_path: configs/knowledge/generate_questions_responses.yaml + config_path: ../../configs/knowledge/generate_questions_responses.yaml output_cols: - question - response @@ -31,7 +31,7 @@ blocks: - name: eval_faithfulness_qa_pair type: LLMBlock config: - config_path: configs/knowledge/evaluate_faithfulness.yaml + config_path: ../../configs/knowledge/evaluate_faithfulness.yaml output_cols: - explanation - judgment @@ -49,7 +49,7 @@ blocks: - name: eval_relevancy_qa_pair type: LLMBlock config: - config_path: configs/knowledge/evaluate_relevancy.yaml + config_path: ../../configs/knowledge/evaluate_relevancy.yaml output_cols: - feedback - score @@ -68,7 +68,7 @@ blocks: - name: eval_verify_question type: LLMBlock config: - config_path: configs/knowledge/evaluate_question.yaml + config_path: ../../configs/knowledge/evaluate_question.yaml output_cols: - explanation - rating diff --git a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml index de3c8f80..be589af8 100644 --- a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_skill_freeform type: LLMBlock config: - config_path: configs/skills/simple_generate_qa_freeform.yaml + config_path: ../../configs/skills/simple_generate_qa_freeform.yaml output_cols: - output gen_kwargs: diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml index ed5b1839..23925034 100644 --- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_skill_grounded type: LLMBlock config: - config_path: configs/skills/simple_generate_qa_grounded.yaml + config_path: ../../configs/skills/simple_generate_qa_grounded.yaml output_cols: - output gen_kwargs: diff --git a/src/instructlab/sdg/pipelines/simple/knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml index bf89c098..7e2cdc4f 100644 --- a/src/instructlab/sdg/pipelines/simple/knowledge.yaml +++ b/src/instructlab/sdg/pipelines/simple/knowledge.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_knowledge type: LLMBlock config: - config_path: configs/knowledge/simple_generate_qa.yaml + config_path: ../../configs/knowledge/simple_generate_qa.yaml output_cols: - output gen_kwargs: diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index b4e39a5b..6c503d28 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -10,8 +10,10 @@ class SamplePopulatorBlock(Block): - def __init__(self, ctx, block_name, config_paths, column_name, post_fix="") -> None: - super().__init__(ctx, block_name) + def __init__( + self, ctx, pipe, block_name, config_paths, column_name, post_fix="" + ) -> None: + super().__init__(ctx, pipe, block_name) self.configs = {} for config in config_paths: if post_fix: @@ -37,8 +39,10 @@ def generate(self, samples) -> Dataset: class SelectorBlock(Block): - def __init__(self, ctx, block_name, choice_map, choice_col, output_col) -> None: - super().__init__(ctx, block_name) + def __init__( + self, ctx, pipe, block_name, choice_map, choice_col, output_col + ) -> None: + super().__init__(ctx, pipe, block_name) self.choice_map = choice_map self.choice_col = choice_col self.output_col = output_col @@ -63,8 +67,10 @@ def generate(self, samples: Dataset) -> Dataset: class CombineColumnsBlock(Block): - def __init__(self, ctx, block_name, columns, output_col, separator="\n\n") -> None: - super().__init__(ctx, block_name) + def __init__( + self, ctx, pipe, block_name, columns, output_col, separator="\n\n" + ) -> None: + super().__init__(ctx, pipe, block_name) self.columns = columns self.output_col = output_col self.separator = separator diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py index cec4eff5..5dcc4d1b 100644 --- a/tests/test_filterblock.py +++ b/tests/test_filterblock.py @@ -1,5 +1,5 @@ # Standard -from unittest.mock import patch +from unittest.mock import MagicMock, patch import operator import unittest @@ -13,8 +13,12 @@ class TestFilterByValueBlock(unittest.TestCase): def setUp(self): + self.ctx = MagicMock() + self.ctx.num_procs = 1 + self.pipe = MagicMock() self.block = FilterByValueBlock( - PipelineContext(None, None, None, None), + self.ctx, + self.pipe, "filter_by_age", filter_column="age", filter_value="30", @@ -22,7 +26,8 @@ def setUp(self): convert_dtype="int", ) self.block_with_list = FilterByValueBlock( - PipelineContext(None, None, None, None), + self.ctx, + self.pipe, "filter_by_age_list", filter_column="age", filter_value=["30", "35"], diff --git a/tests/test_importblock.py b/tests/test_importblock.py index 1bc977de..80baf215 100644 --- a/tests/test_importblock.py +++ b/tests/test_importblock.py @@ -16,10 +16,11 @@ class TestImportBlockWithMockPipeline(unittest.TestCase): @patch("instructlab.sdg.pipeline.Pipeline") def setUp(self, mock_pipeline): self.ctx = MagicMock() + self.pipe = MagicMock() self.block_name = "test_block" self.path = "/path/to/config" self.mock_pipeline = mock_pipeline - self.import_block = ImportBlock(self.ctx, self.block_name, self.path) + self.import_block = ImportBlock(self.ctx, self.pipe, self.block_name, self.path) self.dataset = Dataset.from_dict({}) def test_initialization(self):