From fa27d5b66ffd0fc589beadf31082a2d92483c5cc Mon Sep 17 00:00:00 2001
From: Mihir Rana
Date: Mon, 30 Aug 2021 13:10:40 -0400
Subject: [PATCH] Formatting cleanup, Series support in progress_apply, dependency updates

---
 pytorch_common/additional_configs.py |  8 ++---
 pytorch_common/datasets_dl.py        | 19 ++++++++---
 pytorch_common/metrics.py            |  9 +++--
 pytorch_common/models_dl.py          |  2 +-
 pytorch_common/train_utils.py        | 49 +++++++++++++++++++---------
 pytorch_common/types.py              |  3 ++
 pytorch_common/utils.py              | 35 ++++++++++----------
 setup.py                             | 10 +++---
 8 files changed, 83 insertions(+), 52 deletions(-)

diff --git a/pytorch_common/additional_configs.py b/pytorch_common/additional_configs.py
index 77136ea..5e6af7c 100644
--- a/pytorch_common/additional_configs.py
+++ b/pytorch_common/additional_configs.py
@@ -12,8 +12,8 @@ class BaseDatasetConfig(Munch):

     Base configuration class for dataset-related settings.

-    Class attributes can be accessed with
-    both `config["key"]` and `config.key`.
+    Class attributes can be accessed with both
+    `configobj["key"]` and `configobj.key`.
     """

     def __init__(self, dictionary: Optional[_StringDict] = None):
@@ -43,8 +43,8 @@ class BaseModelConfig(Munch):

     Base configuration class for model-related settings.

-    Class attributes can be accessed with
-    both `config["key"]` and `config.key`.
+    Class attributes can be accessed with both
+    `configobj["key"]` and `configobj.key`.
     """

     def __init__(self, dictionary: Optional[_StringDict] = None, model_type: Optional[str] = "classification"):
diff --git a/pytorch_common/datasets_dl.py b/pytorch_common/datasets_dl.py
index 067cd22..37e458d 100644
--- a/pytorch_common/datasets_dl.py
+++ b/pytorch_common/datasets_dl.py
@@ -40,7 +40,7 @@ def __getitem__(self, index):
         raise NotImplementedError

     def __len__(self):
-        raise NotImplementedError
+        return len(self.data)

     def print_dataset(self) -> None:
         """
@@ -48,8 +48,11 @@ def print_dataset(self) -> None:
         """
         logging.info("\n" + "-" * 40)
         print_dataframe(self.data)
-        value_counts = self.data[self.target_col].value_counts()
-        logging.info(f"Target value counts:\n{value_counts}")
+
+        if self.target_col in self.data:
+            value_counts = self.data[self.target_col].value_counts()
+            logging.info(f"Target value counts:\n{value_counts}")
+
         logging.info("\n" + "-" * 40)

     def save(self, *args, **kwargs) -> None:
@@ -85,11 +88,14 @@ def remove(cls, *args, **kwargs) -> None:
         """
         remove_object(*args, **kwargs)

-    def progress_apply(self, data: pd.DataFrame, func: Callable, *args, **kwargs) -> pd.DataFrame:
+    def progress_apply(self, data: Union[pd.DataFrame, pd.Series], func: Callable, *args, **kwargs) -> Union[pd.DataFrame, pd.Series]:
         """
         Generic function to `progress_apply` a given
         row-level function `func` on the given `data` (chunk).
         """
+        if isinstance(data, pd.Series):
+            return data.progress_apply(func, *args, **kwargs)
+        assert isinstance(data, pd.DataFrame)
         return data.progress_apply(func, *args, **kwargs, axis=1)

     def sample_class(
@@ -193,7 +199,10 @@ def undersample_class(
         self.shuffle_and_reindex_data()

     def _get_class_info(
-        self, class_to_sample: Optional[Union[float, str]] = None, column: Optional[str] = None, minority: bool = True,
+        self,
+        class_to_sample: Optional[Union[float, str]] = None,
+        column: Optional[str] = None,
+        minority: bool = True,
     ) -> Tuple[Union[float, str], int, List[int]]:
         """
         Get the label, counts, and indices of each class.
diff --git a/pytorch_common/metrics.py b/pytorch_common/metrics.py
index 01d15c6..e0860e7 100644
--- a/pytorch_common/metrics.py
+++ b/pytorch_common/metrics.py
@@ -99,7 +99,7 @@ def get_loss_criterion_function(config: _Config, criterion: Optional[str] = "cro
             agg_func = torch.mean
         else:
             raise ValueError(
-                f"Param 'multilabel_reduction' ('{multilabel_reduction}') " f"must be one of ['sum', 'mean']."
+                f"Param 'multilabel_reduction' ('{multilabel_reduction}') must be one of ['sum', 'mean']."
             )

     # Get per-label loss
@@ -124,7 +124,10 @@ def get_loss_criterion_function(config: _Config, criterion: Optional[str] = "cro
     # Multilabel classification
     else:
         return lambda output_hist, y_hist: agg_func(
-            torch.stack([loss_criterion(output_hist, y_hist[..., i]) for i in range(y_hist.shape[-1])], dim=0,)
+            torch.stack(
+                [loss_criterion(output_hist, y_hist[..., i]) for i in range(y_hist.shape[-1])],
+                dim=0,
+            )
         )


@@ -153,7 +156,7 @@ def get_eval_criterion_function(
             agg_func = np.mean
         else:
             raise ValueError(
-                f"Param 'multilabel_reduction' ('{multilabel_reduction}') " f"must be one of ['mean', 'none']."
+                f"Param 'multilabel_reduction' ('{multilabel_reduction}') must be one of ['mean', 'none']."
             )

     # Get per-label eval criterion
diff --git a/pytorch_common/models_dl.py b/pytorch_common/models_dl.py
index 843842f..3f4d808 100644
--- a/pytorch_common/models_dl.py
+++ b/pytorch_common/models_dl.py
@@ -108,7 +108,7 @@ def predict_proba(
         :return probs: Predicted probabilities of each class
         """
         if self.model_type != "classification" and threshold is not None:
-            raise ValueError(f"Param 'threshold' ('{threshold}') can only " f"be provided for classification models.")
+            raise ValueError(f"Param 'threshold' ('{threshold}') can only be provided for classification models.")

         probs = F.softmax(outputs, dim=-1)  # Get probabilities of each class
         num_classes = probs.shape[-1]
diff --git a/pytorch_common/train_utils.py b/pytorch_common/train_utils.py
index d2f0e7a..0b10ab6 100644
--- a/pytorch_common/train_utils.py
+++ b/pytorch_common/train_utils.py
@@ -155,7 +155,14 @@ def train_model(
                 if not config.disable_checkpointing:
                     logging.info("Replacing current best model checkpoint...")
                     best_checkpoint_file = save_model(
-                        model, config, epoch, train_logger, val_logger, optimizer, scheduler, config_info_dict,
+                        model,
+                        config,
+                        epoch,
+                        train_logger,
+                        val_logger,
+                        optimizer,
+                        scheduler,
+                        config_info_dict,
                     )
                     remove_model(config, best_epoch, config_info_dict)
                     best_epoch = epoch
@@ -177,7 +184,14 @@ def train_model(
         if not config.disable_checkpointing:
             logging.info("Dumping model and results...")
             save_model(
-                model, config, stop_epoch, train_logger, val_logger, optimizer, scheduler, config_info_dict,
+                model,
+                config,
+                stop_epoch,
+                train_logger,
+                val_logger,
+                optimizer,
+                scheduler,
+                config_info_dict,
             )

     # Save current and best models
@@ -370,10 +384,10 @@ def perform_one_epoch(

     # Store all required items to be returned
     loss_hist: List[float] = []
-    targets_hist: List[torch.Tensor] = []
-    outputs_hist: List[torch.Tensor] = []
-    preds_hist: List[torch.Tensor] = []
-    probs_hist: List[torch.Tensor] = []
+    targets_hist: _TensorOrTensors = []
+    outputs_hist: _TensorOrTensors = []
+    preds_hist: _TensorOrTensors = []
+    probs_hist: _TensorOrTensors = []

     # Enable gradient computation if training is to be performed, else disable it.
     # Technically not required if this function is called from other supported
@@ -411,9 +425,7 @@ def perform_one_epoch(

             # Print progress
             if batch_idx in batches_to_print:
-                logging.info(
-                    f"{num_examples_complete}/{num_examples} " f"({percent_batches_complete:.0f}%) complete."
-                )
+                logging.info(f"{num_examples_complete}/{num_examples} ({percent_batches_complete:.0f}%) complete.")

         else:  # Perform training / evaluation
             # Compute and store loss
@@ -514,7 +526,7 @@ def take_scheduler_step(scheduler: object, val_metric: Optional[float] = None) -

     scheduler_name = scheduler.__class__.__name__
     if scheduler_name in REQUIRE_VAL_METRIC:
-        assert val_metric is not None, f"Param 'val_metric' must be provided " f"for '{scheduler_name}' scheduler."
+        assert val_metric is not None, f"Param 'val_metric' must be provided for '{scheduler_name}' scheduler."
         scheduler.step(val_metric)
     else:
         scheduler.step()
@@ -604,7 +616,8 @@ def generate_checkpoint_dict(

     # Save items if provided
     for name, obj in zip(
-        ("train_logger", "val_logger", "optimizer", "scheduler"), (train_logger, val_logger, optimizer, scheduler),
+        ("train_logger", "val_logger", "optimizer", "scheduler"),
+        (train_logger, val_logger, optimizer, scheduler),
     ):
         if obj is not None:
             checkpoint[name] = obj if name in ["train_logger", "val_logger"] else obj.state_dict()
@@ -742,7 +755,7 @@ def load_state_dict(
         if state_dict is not None:
             obj.load_state_dict(state_dict)
         else:
-            raise KeyError(f"{key} argument expected its state dict in " f"the loaded checkpoint but none was found.")
+            raise KeyError(f"{key} argument expected its state dict in the loaded checkpoint but none was found.")
         return obj

     # Load optimizer
@@ -794,9 +807,9 @@ def validate_checkpoint_type(checkpoint_type: str, checkpoint_file: Optional[str
     `checkpoint_file`, if provided.
     """
     ALLOWED_CHECKPOINT_TYPES = ["state", "model"]
-    assert checkpoint_type in ALLOWED_CHECKPOINT_TYPES, (
-        f"Param 'checkpoint_type' ('{checkpoint_type}') " f"must be one of {ALLOWED_CHECKPOINT_TYPES}."
-    )
+    assert (
+        checkpoint_type in ALLOWED_CHECKPOINT_TYPES
+    ), f"Param 'checkpoint_type' ('{checkpoint_type}') must be one of {ALLOWED_CHECKPOINT_TYPES}."

     # Check that provided checkpoint_type matches that of checkpoint_file
     if checkpoint_file is not None:
@@ -853,7 +866,11 @@ def __init__(
         """
         self.criterion = criterion
         self._init_params(
-            mode=mode, min_delta=min_delta, patience=patience, best_val=best_val, best_val_tol=best_val_tol,
+            mode=mode,
+            min_delta=min_delta,
+            patience=patience,
+            best_val=best_val,
+            best_val_tol=best_val_tol,
         )
         self._validate_params()
         self.best: Optional[float] = None
diff --git a/pytorch_common/types.py b/pytorch_common/types.py
index ae2657b..b2e0485 100644
--- a/pytorch_common/types.py
+++ b/pytorch_common/types.py
@@ -1,5 +1,6 @@
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

+import numpy as np
 import torch
 import torch.nn as nn
 from matplotlib.figure import Figure
@@ -17,6 +18,7 @@
     "Union",
     "Munch",
     "_StringDict",
+    "_StringArrayDict",
     "_Config",
     "_Device",
     "_Batch",
@@ -35,6 +37,7 @@


 _StringDict = Dict[str, Any]
+_StringArrayDict = Dict[str, np.ndarray]
 _Config = Union[_StringDict, Munch]
 _Device = Union[str, torch.device]
 _Batch = Iterable
diff --git a/pytorch_common/utils.py b/pytorch_common/utils.py
index 057d584..88d0b09 100644
--- a/pytorch_common/utils.py
+++ b/pytorch_common/utils.py
@@ -35,7 +35,7 @@ def create_dir_if_not_exists(dir_path: str) -> None:
     if it doesn't exist already.
""" if not os.path.isdir(dir_path): - os.makedirs(dir_path, exist_ok=True) # exist_ok=True to avoid concurrent dir creation + os.makedirs(dir_path, exist_ok=True) # `exist_ok=True` to avoid concurrent dir creation # Create parent dir create_dir_if_not_exists(parent_dir_path) @@ -162,9 +162,7 @@ def save_plot( fig.savefig(get_file_path(config.plot_dir, f"{file_name}.{ext}"), dpi=300) -def save_object( - obj: Any, primary_path: str, file_name: Optional[str] = None, module: Optional[str] = "pickle" -) -> None: +def save_object(obj: Any, primary_path: str, file_name: Optional[str] = None, module: Optional[str] = "pickle") -> None: """ This is a generic function to save any given object using different `module`s, e.g. pickle, @@ -293,11 +291,9 @@ def get_pickle_module(pickle_module: Optional[str] = "pickle") -> Union[pickle, Return the correct module for pickling. :param pickle_module: must be one of ["pickle", "dill"] """ - if pickle_module == "pickle": - return pickle - elif pickle_module == "dill": - return dill - raise ValueError(f"Param 'pickle_module' ('{pickle_module}') must be one of ['pickle', 'dill'].") + if not pickle_module in ["pickle", "dill"]: + raise ValueError(f"Param 'pickle_module' ('{pickle_module}') must be one of ['pickle', 'dill'].") + return eval(pickle_module) def delete_model(model: nn.Module) -> None: @@ -358,7 +354,10 @@ def get_unique_config_name(primary_name: str, config_info_dict: Optional[_String def get_checkpoint_name( - checkpoint_type: str, model_name: str, epoch: int, config_info_dict: Optional[_StringDict] = None, + checkpoint_type: str, + model_name: str, + epoch: int, + config_info_dict: Optional[_StringDict] = None, ) -> str: """ Returns the appropriate name of checkpoint file @@ -383,7 +382,7 @@ def get_trainable_params(model: nn.Module) -> Dict[str, int]: num_params = sum(p.numel() for p in model.parameters()) num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) model_name = getattr(model, "__name__", model.__class__.__name__) - logging.info(f"Number of trainable/total parameters in {model_name}: " f"{num_trainable_params}/{num_params}") + logging.info(f"Number of trainable/total parameters in {model_name}: {num_trainable_params}/{num_params}") return {"trainable": num_trainable_params, "total": num_params} @@ -714,7 +713,10 @@ def add_eval_metrics(self, eval_metrics: Dict[str, float], epoch: Optional[int] self.eval_metrics_hist[eval_criterion][epoch] = eval_metrics[eval_criterion] def get_eval_metrics( - self, eval_criterion: Optional[str] = None, epoch: Optional[int] = None, flatten: Optional[bool] = False, + self, + eval_criterion: Optional[str] = None, + epoch: Optional[int] = None, + flatten: Optional[bool] = False, ) -> Union[float, List[float], OrderedDict[str, Union[float, List[float]]]]: """ Get the evaluation metrics history. 
@@ -742,10 +744,7 @@ def get_eval_metrics(
             return self.eval_metrics_hist[eval_criterion]  # Return ordered dict
         elif epoch is not None:
             return OrderedDict(
-                {
-                    eval_criterion: self.eval_metrics_hist[eval_criterion][epoch]
-                    for eval_criterion in self.eval_criteria
-                }
+                {eval_criterion: self.eval_metrics_hist[eval_criterion][epoch] for eval_criterion in self.eval_criteria}
             )
         return self.eval_metrics_hist

@@ -779,7 +778,7 @@ def log_epoch_metrics(self, epoch: Optional[int] = -1) -> str:
         assert epoch_loss == epoch_eval_metrics
         dataset_type = "TRAIN" if self.is_train else "VAL "
         mean_loss_epoch = np.mean(self.get_losses(epoch=epoch_loss))
-        result_str = f"\n\033[1m{dataset_type} Epoch: {epoch_loss}" f"\tAverage loss: {mean_loss_epoch:.4f}, "
+        result_str = f"\n\033[1m{dataset_type} Epoch: {epoch_loss}\tAverage loss: {mean_loss_epoch:.4f}, "
         result_str += ", ".join(
             [
                 f"{eval_criterion}: {self.get_eval_metrics(eval_criterion, epoch_loss):.4f}"
@@ -957,7 +956,7 @@ def _set_pooler(self, model_type: str) -> None:
             self.pooler = self.POOLER_MAPPING[self.model_type]
         else:
             logging.warning(
-                f"No supported sequence pooler was found for model of " f"type '{model_type}'. Using the default one."
+                f"No supported sequence pooler was found for model of type '{model_type}'. Using the default one."
             )
             self.model_type = self.DEFAULT_POOLER_TYPE
             self.pooler = self._default_pooler
diff --git a/setup.py b/setup.py
index 15ad233..156a407 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
     # Application info
     name="pytorch_common",
-    version="1.4",
+    version="1.5",
     author="Mihir Rana",
     author_email="ranamihir@gmail.com",
     description="Repo for common PyTorch code",
@@ -15,16 +15,16 @@
     install_requires=[
         "numpy>=1.17.2",
         "pandas>=0.24.0",
-        "matplotlib>=3.2.1",
-        "dask[dataframe]==2.21.0",
+        "matplotlib>=3.3.2",
+        "dask[dataframe]>=2.30.0",
         "toolz==0.10.0",
         "scikit-learn>=0.22.1",
-        "dill==0.3.2",
+        "dill>=0.3.3",
         "munch>=2.5.0",
         "locket==0.2.0",
     ],
     # Optional dependencies
-    extras_require={"nlp": ["transformers>=3.0.2"]},  # for NLP related projects
+    extras_require={"nlp": ["transformers==4.9.2"]},  # for NLP related projects
     # Add config and sql files to the package
     # https://python-packaging.readthedocs.io/en/latest/non-code-files.html
     include_package_data=True,
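
Note on the `progress_apply` change above: below is a minimal standalone sketch of the two dispatch paths the updated method takes, assuming only pandas and tqdm are installed (the sample frame and the functions passed in are hypothetical, for illustration only):

    import pandas as pd
    from tqdm import tqdm

    tqdm.pandas()  # registers .progress_apply on both DataFrame and Series

    df = pd.DataFrame({"text": ["foo", "bar", "baz"], "label": [0, 1, 0]})

    # DataFrame chunk: row-level `func`, applied with axis=1 (existing behavior)
    row_lengths = df.progress_apply(lambda row: len(row["text"]), axis=1)

    # Series chunk: element-level `func`, applied without the axis kwarg (new branch)
    upper = df["text"].progress_apply(str.upper)

Both calls mirror what `BasePyTorchDataset.progress_apply` now does internally after its isinstance check on `data`.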