From 76a8624a7d03c12bd84c68938eaf4a836719b81c Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Wed, 24 Jul 2024 11:02:41 +0200 Subject: [PATCH 1/2] Use module level logger `generate_data` now uses a module-level logger like the rest of the code. The `logger` parameter is now optional. Signed-off-by: Christian Heimes --- src/instructlab/sdg/generate_data.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 730c8b39..5bf300f9 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Optional import json +import logging import os import time @@ -35,6 +36,8 @@ read_taxonomy_leaf_nodes, ) +logger = logging.getLogger(__name__) + _SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant." @@ -74,7 +77,7 @@ def _convert_to_messages(sample): def _gen_train_data( - logger, machine_instruction_data, output_file_train, output_file_messages + machine_instruction_data, output_file_train, output_file_messages ): """ Generate training data in the legacy system/user/assistant format @@ -257,9 +260,9 @@ def _mixer_init(ctx, output_dir, date_suffix): # This is part of the public API, and used by instructlab. # TODO - parameter removal needs to be done in sync with a CLI change. -# pylint: disable=unused-argument +# to be removed: logger, prompt_file_path, rouge_threshold, tls_* def generate_data( - logger, + logger: logging.Logger = logger, # pylint: disable=redefined-outer-name api_base: Optional[str] = None, api_key: Optional[str] = None, model_family: Optional[str] = None, @@ -270,9 +273,9 @@ def generate_data( taxonomy_base: Optional[str] = None, output_dir: Optional[str] = None, # TODO - not used and should be removed from the CLI - prompt_file_path: Optional[str] = None, + prompt_file_path: Optional[str] = None, # pylint: disable=unused-argument # TODO - probably should be removed - rouge_threshold: Optional[float] = None, + rouge_threshold: Optional[float] = None, # pylint: disable=unused-argument console_output=True, yaml_rules: Optional[str] = None, chunk_word_count=None, @@ -382,9 +385,9 @@ def generate_data( else: sdg = sdg_freeform_skill - logger.debug("Samples: %s" % samples) + logger.debug("Samples: %s", samples) ds = Dataset.from_list(samples) - logger.debug("Dataset: %s" % ds) + logger.debug("Dataset: %s", ds) new_generated_data = sdg.generate(ds) if len(new_generated_data) == 0: raise EmptyDatasetError( @@ -395,8 +398,8 @@ def generate_data( if generated_data is None else generated_data + [new_generated_data] ) - logger.info("Generated %d samples" % len(generated_data)) - logger.debug("Generated data: %s" % generated_data) + logger.info("Generated %d samples", len(generated_data)) + logger.debug("Generated data: %s", generated_data) if is_knowledge: # generate mmlubench data for the current leaf node @@ -414,7 +417,6 @@ def generate_data( generated_data = [] _gen_train_data( - logger, generated_data, os.path.join(output_dir, output_file_train), os.path.join(output_dir, output_file_messages), From 81a69bb633a0ef362538e06b85e7c0e6b85f7880 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 26 Jul 2024 23:37:32 +0100 Subject: [PATCH 2/2] Remove calls to logging.basicConfig on import Fixes #196 instructlab.sdg calls logging.basicConfig on import. Libraries should not configure logging themselves. It's the job of the application or main script to configure logging and set up log handlers according to their needs. logging.basicConfig is a one-shot API. The second and any following call are a no-op, unless it is called with force=True. Remove setup_logging and just use logging.getLogger in the code. Co-authored-by: Christian Heimes Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/block.py | 6 ++---- src/instructlab/sdg/datamixing.py | 4 ++-- src/instructlab/sdg/eval_data.py | 6 ++---- src/instructlab/sdg/filterblock.py | 4 ++-- src/instructlab/sdg/generate_data.py | 4 +--- src/instructlab/sdg/importblock.py | 6 ++++-- src/instructlab/sdg/llmblock.py | 4 ++-- src/instructlab/sdg/logger_config.py | 18 ------------------ src/instructlab/sdg/pipeline.py | 4 ++-- src/instructlab/sdg/utilblocks.py | 6 ++++-- 10 files changed, 21 insertions(+), 41 deletions(-) delete mode 100644 src/instructlab/sdg/logger_config.py diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py index 686ab783..c072908a 100644 --- a/src/instructlab/sdg/block.py +++ b/src/instructlab/sdg/block.py @@ -2,15 +2,13 @@ # Standard from abc import ABC from typing import Any, Dict, Union +import logging import os.path # Third Party import yaml -# Local -from .logger_config import setup_logger - -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # This is part of the public API. diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index 2b6e6d30..9d5c10f0 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -1,6 +1,7 @@ # Standard from typing import Optional import json +import logging import os.path import random import uuid @@ -10,11 +11,10 @@ import yaml # First Party -from instructlab.sdg.logger_config import setup_logger from instructlab.sdg.utils import GenerateException, pandas ALLOWED_COLS = ["id", "messages", "metadata"] -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) def _adjust_train_sample_size(ds: Dataset, num_samples: int): diff --git a/src/instructlab/sdg/eval_data.py b/src/instructlab/sdg/eval_data.py index a95bd597..509f3a32 100644 --- a/src/instructlab/sdg/eval_data.py +++ b/src/instructlab/sdg/eval_data.py @@ -1,6 +1,7 @@ # Standard from importlib import resources from typing import Any +import logging import re # Third Party @@ -10,10 +11,7 @@ # First Party from instructlab.sdg.pipeline import EVAL_PIPELINES_PKG, Pipeline -# Local -from .logger_config import setup_logger - -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) def _extract_options(text: str) -> list[Any]: diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index 1ae3e5b9..a82c8c7d 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +import logging import operator # Third Party @@ -7,9 +8,8 @@ # Local from .block import Block -from .logger_config import setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # This is part of the public API. diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 5bf300f9..4fa606b6 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -76,9 +76,7 @@ def _convert_to_messages(sample): return sample -def _gen_train_data( - machine_instruction_data, output_file_train, output_file_messages -): +def _gen_train_data(machine_instruction_data, output_file_train, output_file_messages): """ Generate training data in the legacy system/user/assistant format used in train_*.jsonl as well as the legacy messages format used diff --git a/src/instructlab/sdg/importblock.py b/src/instructlab/sdg/importblock.py index c2e98b31..e6aeab7c 100644 --- a/src/instructlab/sdg/importblock.py +++ b/src/instructlab/sdg/importblock.py @@ -1,12 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 +# Standard +import logging + # Third Party from datasets import Dataset # Local from .block import Block -from .logger_config import setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # This is part of the public API. diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 3133f932..69c7708a 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -2,6 +2,7 @@ # Standard from collections import ChainMap from typing import Any, Dict +import logging import re # Third Party @@ -10,9 +11,8 @@ # Local from .block import Block -from .logger_config import setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) MODEL_FAMILY_MIXTRAL = "mixtral" MODEL_FAMILY_MERLINITE = "merlinite" diff --git a/src/instructlab/sdg/logger_config.py b/src/instructlab/sdg/logger_config.py deleted file mode 100644 index 37c958ab..00000000 --- a/src/instructlab/sdg/logger_config.py +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Standard -import logging - -# Third Party -from rich.logging import RichHandler - - -def setup_logger(name): - # Set up the logger - logging.basicConfig( - level=logging.INFO, - format="%(message)s", - datefmt="[%X]", - handlers=[RichHandler()], - ) - logger = logging.getLogger(name) - return logger diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 1263c974..fa78a601 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from importlib import resources from typing import Iterable, Optional +import logging import math import os.path @@ -18,9 +19,8 @@ # Local from . import filterblock, importblock, llmblock, utilblocks from .block import Block -from .logger_config import setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # This is part of the public API. diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index 2eec3a38..c16acb7d 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +# Standard +import logging + # Third Party from datasets import Dataset @@ -7,9 +10,8 @@ # Local from .block import Block -from .logger_config import setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # This is part of the public API.