diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py
index 25da098c..2821b373 100644
--- a/src/instructlab/sdg/datamixing.py
+++ b/src/instructlab/sdg/datamixing.py
@@ -22,7 +22,7 @@
 # when |knowledge| << |skills|
 MIN_UPSAMPLE_THRESHOLD = 0.03
 ALLOWED_COLS = ["id", "messages", "metadata"]
-LOGGER = logging.getLogger(__name__)
+LOGGER = logging.getLogger()
 
 
 class DatasetListing(TypedDict):
@@ -739,10 +739,7 @@ def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
             self.num_procs,
         )
 
-    def generate(self, logger=None):
-        if logger is not None:
-            global LOGGER  # pylint: disable=global-statement
-            LOGGER = logger
+    def generate(self):
         self._gen_mixed_data(
             self.knowledge_recipe,
             self.output_file_knowledge_recipe,
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index ae5c6582..a640dbd3 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -340,7 +340,7 @@ def generate_data(
     document_output_dir = Path(output_dir) / f"documents-{date_suffix}"
 
     leaf_nodes = read_taxonomy_leaf_nodes(
-        taxonomy, taxonomy_base, yaml_rules, document_output_dir, logger=LOGGER
+        taxonomy, taxonomy_base, yaml_rules, document_output_dir
    )
     if not leaf_nodes:
         raise GenerateException("Error: No new leaf nodes found in the taxonomy.")
@@ -406,7 +406,6 @@ def generate_data(
             document_output_dir,
             model_name,
             docling_model_path=docling_model_path,
-            logger=LOGGER,
         )
 
         if not samples:
@@ -458,7 +457,7 @@ def generate_data(
         system_prompt,
     )
 
-    mixer.generate(logger=LOGGER)
+    mixer.generate()
 
     generate_duration = time.time() - generate_start
     LOGGER.info(f"Generation took {generate_duration:.2f}s")
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 9a6e497c..63866b2b 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -23,7 +23,7 @@
 from .blocks.block import Block
 from .registry import BlockRegistry
 
-LOGGER = logging.getLogger(__name__)
+LOGGER = logging.getLogger()
 
 
 # This is part of the public API.
@@ -134,16 +134,12 @@ def from_file(cls, ctx, pipeline_yaml):
             pipeline_yaml = os.path.join(resources.files(__package__), pipeline_yaml)
         return cls(ctx, pipeline_yaml, *_parse_pipeline_config_file(pipeline_yaml))
 
-    def generate(self, dataset, checkpoint_name=None, logger=None) -> Dataset:
+    def generate(self, dataset, checkpoint_name=None) -> Dataset:
         """
         Generate the dataset by running the pipeline steps.
         dataset: the input dataset
         checkpoint_name: unique subdir name for the checkpoint within checkpoint_dir
         """
-
-        if logger is not None:
-            global LOGGER  # pylint: disable=global-statement
-            LOGGER = logger
         # The checkpointer allows us to resume from where we left off
         # Saving the output of pipe instances along the way
         checkpoint_dir = None
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index 88c575e2..15e69587 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -30,7 +30,7 @@
 # Initialize the pdf parser
 PDFParser = pdf_parser_v1()
 
-LOGGER = logging.getLogger(__name__)
+LOGGER = logging.getLogger()
 
 
 def _is_taxonomy_file(fn: str) -> bool:
@@ -372,11 +372,8 @@ def read_taxonomy(
 
 
 def read_taxonomy_leaf_nodes(
-    taxonomy, taxonomy_base, yaml_rules, document_output_dir=None, logger=None
+    taxonomy, taxonomy_base, yaml_rules, document_output_dir=None
 ):
-    if logger is not None:
-        global LOGGER  # pylint: disable=global-statement
-        LOGGER = logger
     seed_instruction_data = read_taxonomy(
         taxonomy, taxonomy_base, yaml_rules, document_output_dir
     )
@@ -466,11 +463,7 @@ def leaf_node_to_samples(
     document_output_dir,
     model_name,
     docling_model_path=None,
-    logger=None,
 ):
-    if logger is not None:
-        global LOGGER  # pylint: disable=global-statement
-        LOGGER = logger
     if not leaf_node:
         return []
     if leaf_node[0].get("documents"):
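
A note on the calling convention after this patch: with the logger= parameters removed and each module resolving LOGGER = logging.getLogger() at import time, callers configure logging once through the standard library instead of injecting a logger object per call. The sketch below illustrates that pattern; the generate() function and its message are placeholders for illustration, not the actual instructlab.sdg API.

    import logging

    # Module side: mirrors the `LOGGER = logging.getLogger()` lines in this
    # patch. The module logs through the root logger resolved at import time.
    LOGGER = logging.getLogger()


    def generate():
        # No logger= parameter any more; records flow to whatever handlers
        # the application has attached to the root logger.
        LOGGER.info("generation started")


    if __name__ == "__main__":
        # Caller side: configure the root logger once at startup instead of
        # threading a logger object through generate(logger=...).
        logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
        generate()

One trade-off of this design: records are emitted under the root logger's name, so log output no longer carries the per-module path that logging.getLogger(__name__) would have recorded.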