From 5be6e40b00a02006d932cb0d8f7f118e837535c2 Mon Sep 17 00:00:00 2001 From: Allen Date: Mon, 11 Nov 2024 18:01:17 +0000 Subject: [PATCH 01/29] move rc logging to warnings --- .../lowcode/anomaly/model/anomaly_merlion.py | 11 +- .../operator/lowcode/anomaly/model/automlx.py | 20 ++-- .../operator/lowcode/anomaly/model/autots.py | 9 +- .../lowcode/anomaly/model/base_model.py | 26 +++-- .../lowcode/anomaly/model/isolationforest.py | 19 ++-- .../lowcode/anomaly/model/oneclasssvm.py | 21 ++-- .../lowcode/anomaly/model/randomcutforest.py | 8 +- .../operator/lowcode/forecast/model/arima.py | 26 +++-- .../lowcode/forecast/model/automlx.py | 52 ++++----- .../operator/lowcode/forecast/model/autots.py | 34 +++--- .../lowcode/forecast/model/base_model.py | 81 ++++++++------ .../lowcode/forecast/model/ml_forecast.py | 3 + .../lowcode/forecast/model/neuralprophet.py | 76 +++++-------- .../lowcode/forecast/model/prophet.py | 35 +++--- ads/opctl/operator/lowcode/forecast/utils.py | 62 +++++------ .../operator/lowcode/pii/model/report.py | 18 +-- .../lowcode/recommender/model/base_model.py | 103 ++++++++++-------- .../operator/lowcode/recommender/model/svd.py | 70 +++++++----- pyproject.toml | 8 +- 19 files changed, 361 insertions(+), 321 deletions(-) diff --git a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py index cc1e80b52..8999b2674 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +++ b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py @@ -4,9 +4,11 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import importlib +import logging import numpy as np import pandas as pd +import report_creator as rc from merlion.post_process.threshold import AggregateAlarms from merlion.utils import TimeSeries @@ -21,6 +23,8 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel +logging.getLogger("report_creator").setLevel(logging.WARNING) + class AnomalyMerlionOperatorModel(AnomalyOperatorBaseModel): """Class representing Merlion Anomaly Detection operator model.""" @@ -84,7 +88,7 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): data = df.set_index(date_column) data = TimeSeries.from_pd(data) - for model_name, (model_config, model) in model_config_map.items(): + for _, (model_config, model) in model_config_map.items(): if self.spec.model == SupportedModels.BOCPD: model_config = model_config(**self.spec.model_kwargs) else: @@ -115,7 +119,7 @@ def _build_model(self) -> AnomalyOutput: y_pred = (y_pred.to_pd().reset_index()["anom_score"] > 0).astype( int ) - except Exception as e: + except Exception: y_pred = ( scores["anom_score"] > np.percentile( @@ -135,15 +139,12 @@ def _build_model(self) -> AnomalyOutput: OutputColumns.SCORE_COL: scores["anom_score"], } ).reset_index(drop=True) - # model_objects[model_name].append(model) anomaly_output.add_output(target, anomaly, score) return anomaly_output def _generate_report(self): """Genreates a report for the model.""" - import report_creator as rc - other_sections = [ rc.Heading("Selected Models Overview", level=2), rc.Text( diff --git a/ads/opctl/operator/lowcode/anomaly/model/automlx.py b/ads/opctl/operator/lowcode/anomaly/model/automlx.py index a6deef1fa..059545cf8 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/automlx.py +++ b/ads/opctl/operator/lowcode/anomaly/model/automlx.py @@ -1,16 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency -from .anomaly_dataset import AnomalyOutput +from ads.opctl import logger +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns +from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns + +logging.getLogger("report_creator").setLevel(logging.WARNING) class AutoMLXOperatorModel(AnomalyOperatorBaseModel): @@ -25,16 +30,17 @@ class AutoMLXOperatorModel(AnomalyOperatorBaseModel): ), ) def _build_model(self) -> pd.DataFrame: - from automlx import init import logging + import automlx + try: - init( + automlx.init( engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL, ) - except Exception as e: + except Exception: logger.info("Ray already initialized") date_column = self.spec.datetime_column.name anomaly_output = AnomalyOutput(date_column=date_column) @@ -73,8 +79,6 @@ def _build_model(self) -> pd.DataFrame: return anomaly_output def _generate_report(self): - import report_creator as rc - """The method that needs to be implemented on the particular model level.""" other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/autots.py b/ads/opctl/operator/lowcode/anomaly/model/autots.py index c795440de..550833a67 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/autots.py +++ b/ads/opctl/operator/lowcode/anomaly/model/autots.py @@ -1,9 +1,12 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + +import report_creator as rc + from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger from ads.opctl.operator.lowcode.anomaly.const import OutputColumns @@ -12,6 +15,8 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel +logging.getLogger("report_creator").setLevel(logging.WARNING) + class AutoTSOperatorModel(AnomalyOperatorBaseModel): """Class representing AutoTS Anomaly Detection operator model.""" @@ -91,8 +96,6 @@ def _build_model(self) -> AnomalyOutput: return anomaly_output def _generate_report(self): - import report_creator as rc - """The method that needs to be implemented on the particular model level.""" other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py index e8de5213e..c9ca984be 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/base_model.py +++ b/ads/opctl/operator/lowcode/anomaly/model/base_model.py @@ -3,6 +3,7 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os import tempfile import time @@ -12,6 +13,7 @@ import fsspec import numpy as np import pandas as pd +import report_creator as rc from sklearn import linear_model from ads.common.object_storage_details import ObjectStorageDetails @@ -33,6 +35,8 @@ from ..operator_config import AnomalyOperatorConfig, AnomalyOperatorSpec from .anomaly_dataset import AnomalyDatasets, AnomalyOutput, TestData +logging.getLogger("report_creator").setLevel(logging.WARNING) + class AnomalyOperatorBaseModel(ABC): """The base class for the anomaly detection operator models.""" @@ -59,8 +63,8 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets): def generate_report(self): """Generates the report.""" import matplotlib.pyplot as plt - plt.rcParams.update({'figure.max_open_warning': 0}) - import report_creator as rc + + plt.rcParams.update({"figure.max_open_warning": 0}) start_time = time.time() # fallback using sklearn oneclasssvm when the sub model _build_model fails @@ -84,7 +88,13 @@ def generate_report(self): anomaly_output, test_data, elapsed_time ) table_blocks = [ - rc.DataTable(df.head(SUBSAMPLE_THRESHOLD) if self.spec.subsample_report_data and len(df) > SUBSAMPLE_THRESHOLD else df, label=col, index=True) + rc.DataTable( + df.head(SUBSAMPLE_THRESHOLD) + if self.spec.subsample_report_data and len(df) > SUBSAMPLE_THRESHOLD + else df, + label=col, + index=True, + ) for col, df in self.datasets.full_data_dict.items() ] data_table = rc.Select(blocks=table_blocks) @@ -144,7 +154,9 @@ def generate_report(self): else: figure_blocks = None - blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None + blocks.append( + rc.Group(*figure_blocks, label=target) + ) if figure_blocks else None plots = rc.Select(blocks) report_sections = [] @@ -154,7 +166,9 @@ def generate_report(self): yaml_appendix = rc.Yaml(self.config.to_dict()) summary = rc.Block( rc.Group( - rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"), + rc.Text( + f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n" + ), rc.Text( "Based on your dataset, you could have also selected " f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`." @@ -285,8 +299,6 @@ def _save_report( test_metrics: pd.DataFrame, ): """Saves resulting reports to the given folder.""" - import report_creator as rc - unique_output_dir = self.spec.output_directory.url if ObjectStorageDetails.is_oci_path(unique_output_dir): diff --git a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py index 0083ad0fd..ef7715653 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py @@ -1,17 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import numpy as np import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns -from .base_model import AnomalyOperatorBaseModel from .anomaly_dataset import AnomalyOutput -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns +from .base_model import AnomalyOperatorBaseModel + +logging.getLogger("report_creator").setLevel(logging.WARNING) class IsolationForestOperatorModel(AnomalyOperatorBaseModel): @@ -36,13 +40,9 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): model = IsolationForest(**model_kwargs) model.fit(df) - y_pred = np.vectorize(self.outlier_map.get)( - model.predict(df) - ) + y_pred = np.vectorize(self.outlier_map.get)(model.predict(df)) - scores = model.score_samples( - df - ) + scores = model.score_samples(df) index_col = df.columns[0] @@ -59,7 +59,6 @@ def _build_model(self) -> AnomalyOutput: def _generate_report(self): """Generates the report.""" - import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py index 157f7eb60..f6177e63d 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +++ b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py @@ -1,17 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import numpy as np import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns -from .base_model import AnomalyOperatorBaseModel from .anomaly_dataset import AnomalyOutput -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns +from .base_model import AnomalyOperatorBaseModel + +logging.getLogger("report_creator").setLevel(logging.WARNING) class OneClassSVMOperatorModel(AnomalyOperatorBaseModel): @@ -36,13 +40,9 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): model = OneClassSVM(**model_kwargs) model.fit(df) - y_pred = np.vectorize(self.outlier_map.get)( - model.predict(df) - ) + y_pred = np.vectorize(self.outlier_map.get)(model.predict(df)) - scores = model.score_samples( - df - ) + scores = model.score_samples(df) index_col = df.columns[0] @@ -54,12 +54,11 @@ def _build_model(self) -> AnomalyOutput: ).reset_index(drop=True) anomaly_output.add_output(target, anomaly, score) - + return anomaly_output def _generate_report(self): """Generates the report.""" - import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py index 17f19351d..ad34159ab 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py @@ -3,8 +3,11 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import numpy as np import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger @@ -13,6 +16,8 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel +logging.getLogger("report_creator").setLevel(logging.WARNING) + class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): """ @@ -27,7 +32,7 @@ class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): ), ) def _build_model(self) -> AnomalyOutput: - from rrcf import RCTree + import rrcf model_kwargs = self.spec.model_kwargs @@ -96,7 +101,6 @@ def _build_model(self) -> AnomalyOutput: def _generate_report(self): """Generates the report.""" - import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/forecast/model/arima.py b/ads/opctl/operator/lowcode/forecast/model/arima.py index 6bbd58d34..17817257f 100644 --- a/ads/opctl/operator/lowcode/forecast/model/arima.py +++ b/ads/opctl/operator/lowcode/forecast/model/arima.py @@ -1,23 +1,26 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback + import pandas as pd -import numpy as np import pmdarima as pm +import report_creator as rc from joblib import Parallel, delayed from ads.opctl import logger - -from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe from ads.opctl.operator.lowcode.common.utils import seconds_to_datetime -from .base_model import ForecastOperatorBaseModel +from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe + +from ..const import ForecastOutputColumns, SupportedModels from ..operator_config import ForecastOperatorConfig -import traceback +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -from ..const import ForecastOutputColumns, SupportedModels + +logging.getLogger("report_creator").setLevel(logging.WARNING) class ArimaOperatorModel(ForecastOperatorBaseModel): @@ -39,7 +42,7 @@ def set_kwargs(self): ) model_kwargs = self.spec.model_kwargs model_kwargs["alpha"] = 1 - self.spec.confidence_interval_width - if "error_action" not in model_kwargs.keys(): + if "error_action" not in model_kwargs: model_kwargs["error_action"] = "ignore" return model_kwargs @@ -129,13 +132,14 @@ def _train_model(self, i, s_id, df, model_kwargs): self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc()} + "error_trace": traceback.format_exc(), + } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() + self.models = {} self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -154,8 +158,6 @@ def _build_model(self) -> pd.DataFrame: def _generate_report(self): """The method that needs to be implemented on the particular model level.""" - import report_creator as rc - all_sections = [] if len(self.models) > 0: sec5_text = rc.Heading("ARIMA Model Parameters", level=2) diff --git a/ads/opctl/operator/lowcode/forecast/model/automlx.py b/ads/opctl/operator/lowcode/forecast/model/automlx.py index eda6112b4..b36c383ad 100644 --- a/ads/opctl/operator/lowcode/forecast/model/automlx.py +++ b/ads/opctl/operator/lowcode/forecast/model/automlx.py @@ -1,29 +1,30 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -import traceback - # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback -import pandas as pd import numpy as np +import pandas as pd +import report_creator as rc + from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl import logger +from ads.opctl.operator.lowcode.common.utils import ( + seconds_to_datetime, +) from ads.opctl.operator.lowcode.forecast.const import ( AUTOMLX_METRIC_MAP, ForecastOutputColumns, SupportedModels, ) -from ads.opctl import logger +from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe -from .base_model import ForecastOperatorBaseModel from ..operator_config import ForecastOperatorConfig +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -from ads.opctl.operator.lowcode.common.utils import ( - seconds_to_datetime, - datetime_to_seconds, -) -from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe +logging.getLogger("report_creator").setLevel(logging.WARNING) AUTOMLX_N_ALGOS_TUNED = 4 AUTOMLX_DEFAULT_SCORE_METRIC = "neg_sym_mean_abs_percent_error" @@ -47,12 +48,13 @@ def set_kwargs(self): ) model_kwargs_cleaned.pop("task", None) time_budget = model_kwargs_cleaned.pop("time_budget", -1) - model_kwargs_cleaned[ - "preprocessing" - ] = self.spec.preprocessing.enabled or model_kwargs_cleaned.get("preprocessing", True) + model_kwargs_cleaned["preprocessing"] = ( + self.spec.preprocessing.enabled + or model_kwargs_cleaned.get("preprocessing", True) + ) return model_kwargs_cleaned, time_budget - def preprocess(self, data, series_id=None): # TODO: re-use self.le for explanations + def preprocess(self, data): # TODO: re-use self.le for explanations _, df_encoded = _label_encode_dataframe( data, no_encode={self.spec.datetime_column.name, self.original_target_column}, @@ -74,11 +76,12 @@ def preprocess(self, data, series_id=None): # TODO: re-use self.le for explanat ), ) def _build_model(self) -> pd.DataFrame: - from automlx import init import logging + import automlx + try: - init( + automlx.init( engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL, @@ -88,7 +91,7 @@ def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() + self.models = {} horizon = self.spec.horizon self.spec.confidence_interval_width = self.spec.confidence_interval_width or 0.8 self.forecast_output = ForecastOutput( @@ -101,7 +104,7 @@ def _build_model(self) -> pd.DataFrame: # Clean up kwargs for pass through model_kwargs_cleaned, time_budget = self.set_kwargs() - for i, (s_id, df) in enumerate(full_data_dict.items()): + for s_id, df in full_data_dict.items(): try: logger.debug(f"Running automlx on series {s_id}") model_kwargs = model_kwargs_cleaned.copy() @@ -170,7 +173,7 @@ def _build_model(self) -> pd.DataFrame: self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) @@ -197,15 +200,12 @@ def _generate_report(self): - ds_forecast_col (pd.Series): The pd.Series object representing the forecasted column. - ci_col_names (List[str]): A list of column names for the confidence interval in the report. """ - import report_creator as rc - - """The method that needs to be implemented on the particular model level.""" - selected_models = dict() + selected_models = {} models = self.models other_sections = [] if len(self.models) > 0: - for i, (s_id, m) in enumerate(models.items()): + for s_id, m in models.items(): selected_models[s_id] = { "series_id": s_id, "selected_model": m.selected_model_, @@ -352,7 +352,7 @@ def _custom_predict_automlx(self, data): """ data_temp = pd.DataFrame( data, - columns=[col for col in self.dataset_cols], + columns=list(self.dataset_cols), ) return self.models.get(self.series_id).forecast( diff --git a/ads/opctl/operator/lowcode/forecast/model/autots.py b/ads/opctl/operator/lowcode/forecast/model/autots.py index 37b57ca75..ca3310bab 100644 --- a/ads/opctl/operator/lowcode/forecast/model/autots.py +++ b/ads/opctl/operator/lowcode/forecast/model/autots.py @@ -1,24 +1,26 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import copy +import logging import traceback + import pandas as pd -import numpy as np +import report_creator as rc import yaml +from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger -from ads.opctl.operator.lowcode.common.utils import seconds_to_datetime -from .base_model import ForecastOperatorBaseModel +from ads.opctl.operator.lowcode.forecast.utils import _select_plot_list + +from ..const import ForecastOutputColumns, SupportedModels from ..operator_config import ForecastOperatorConfig -from ads.common.decorator.runtime_dependency import runtime_dependency +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -from ..const import ForecastOutputColumns, SupportedModels -from ads.opctl.operator.lowcode.forecast.utils import _select_plot_list +logging.getLogger("report_creator").setLevel(logging.WARNING) AUTOTS_MAX_GENERATION = 10 AUTOTS_MODELS_TO_VALIDATE = 0.15 @@ -43,10 +45,9 @@ def _build_model(self) -> pd.DataFrame: """ # Import necessary libraries - from autots import AutoTS, create_regressor + from autots import AutoTS self.outputs = None - models = dict() # Get the name of the datetime column self.forecast_output = ForecastOutput( confidence_interval_width=self.spec.confidence_interval_width, @@ -208,7 +209,7 @@ def _build_model(self) -> pd.DataFrame: self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) @@ -231,7 +232,6 @@ def _generate_report(self) -> tuple: - ds_forecast_col (pd.Index): A pandas Index containing the forecast column values. - ci_col_names (list): A list of column names for confidence intervals. """ - import report_creator as rc all_sections = [] if self.models: @@ -258,18 +258,16 @@ def _generate_report(self) -> tuple: yaml.dump(list(self.models.best_model.T.to_dict().values())[0]), ) - except KeyError as ke: - logger.warn( - f"Issue generating Model Parameters Table Section. Skipping" - ) + except KeyError: + logger.warn("Issue generating Model Parameters Table Section. Skipping") sec2 = rc.Text("Error generating model parameters.") section_2 = rc.Block(sec2_text, sec2) - all_sections = [sec_1_plots, section_2] + all_sections = [section_1, section_2] if self.spec.generate_explanations: - logger.warn(f"Explanations not yet supported for the AutoTS Module") + logger.warn("Explanations not yet supported for the AutoTS Module") # Model Description model_description = rc.Text( @@ -305,7 +303,7 @@ def generate_train_metrics(self) -> pd.DataFrame: ).T df = pd.concat([mapes, scores]) except Exception as e: - logger.debug(f"Failed to generate training metrics") + logger.debug("Failed to generate training metrics") logger.debug(f"Received Error Statement: {e}") return df diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py index 6045826f1..357426a79 100644 --- a/ads/opctl/operator/lowcode/forecast/model/base_model.py +++ b/ads/opctl/operator/lowcode/forecast/model/base_model.py @@ -1,52 +1,57 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import fsspec -import numpy as np +import logging import os -import pandas as pd import tempfile import time import traceback from abc import ABC, abstractmethod from typing import Tuple +import fsspec +import numpy as np +import pandas as pd +import report_creator as rc + from ads.common.decorator.runtime_dependency import runtime_dependency from ads.common.object_storage_details import ObjectStorageDetails from ads.opctl import logger from ads.opctl.operator.lowcode.common.utils import ( - human_time_friendly, - enable_print, + datetime_to_seconds, disable_print, - write_data, + enable_print, + human_time_friendly, merged_category_column_name, - datetime_to_seconds, seconds_to_datetime, + write_data, ) from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import TestData from ads.opctl.operator.lowcode.forecast.utils import ( + _build_metrics_df, + _build_metrics_per_horizon, + _label_encode_dataframe, default_signer, evaluate_train_metrics, - get_forecast_plots, get_auto_select_plot, - _build_metrics_df, - _build_metrics_per_horizon, + get_forecast_plots, load_pkl, write_pkl, - _label_encode_dataframe, ) -from .forecast_datasets import ForecastDatasets + from ..const import ( + AUTO_SELECT, SUMMARY_METRICS_HORIZON_LIMIT, + SpeedAccuracyMode, SupportedMetrics, SupportedModels, - SpeedAccuracyMode, - AUTO_SELECT ) from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec +from .forecast_datasets import ForecastDatasets + +logging.getLogger("report_creator").setLevel(logging.WARNING) class ForecastOperatorBaseModel(ABC): @@ -70,7 +75,7 @@ def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): self.original_target_column = self.spec.target_column self.dt_column_name = self.spec.datetime_column.name - self.model_parameters = dict() + self.model_parameters = {} self.loaded_models = None # these fields are populated in the _build_model() method @@ -79,20 +84,21 @@ def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): # "outputs" is a list of outputs generated by the models. These should only be generated when the framework requires the original output for plotting self.outputs = None self.forecast_output = None - self.errors_dict = dict() - self.le = dict() + self.errors_dict = {} + self.le = {} self.formatted_global_explanation = None self.formatted_local_explanation = None self.forecast_col_name = "yhat" - self.perform_tuning = (self.spec.tuning != None) and ( - self.spec.tuning.n_trials != None + self.perform_tuning = (self.spec.tuning is not None) and ( + self.spec.tuning.n_trials is not None ) def generate_report(self): """Generates the forecasting report.""" import warnings + from sklearn.exceptions import ConvergenceWarning with warnings.catch_warnings(): @@ -100,7 +106,6 @@ def generate_report(self): warnings.simplefilter(action="ignore", category=UserWarning) warnings.simplefilter(action="ignore", category=RuntimeWarning) warnings.simplefilter(action="ignore", category=ConvergenceWarning) - import report_creator as rc # load models if given if self.spec.previous_output_dir is not None: @@ -128,7 +133,7 @@ def generate_report(self): ) = self._test_evaluate_metrics( elapsed_time=elapsed_time, ) - except Exception as e: + except Exception: logger.warn("Unable to generate Test Metrics.") logger.debug(f"Full Traceback: {traceback.format_exc()}") report_sections = [] @@ -253,25 +258,30 @@ def generate_report(self): backtest_report_name = "backtest_stats.csv" file_path = f"{output_dir}/{backtest_report_name}" if self.spec.model == AUTO_SELECT: - backtest_sections.append(rc.Heading("Auto-select statistics", level=2)) + backtest_sections.append( + rc.Heading("Auto-select statistics", level=2) + ) if not os.path.exists(file_path): - failure_msg = rc.Text("auto-select could not be executed. Please check the " - "logs for more details.") + failure_msg = rc.Text( + "auto-select could not be executed. Please check the " + "logs for more details." + ) backtest_sections.append(failure_msg) else: backtest_stats = pd.read_csv(file_path) average_dict = backtest_stats.mean().to_dict() - del average_dict['backtest'] + del average_dict["backtest"] best_model = min(average_dict, key=average_dict.get) backtest_text = rc.Heading("Back Testing Metrics", level=3) summary_text = rc.Text( f"Overall, the average scores for the models are {average_dict}, with {best_model}" - f" being identified as the top-performing model during backtesting.") + f" being identified as the top-performing model during backtesting." + ) backtest_table = rc.DataTable(backtest_stats, index=True) liner_plot = get_auto_select_plot(backtest_stats) - backtest_sections.extend([backtest_text, backtest_table, summary_text, - liner_plot]) - + backtest_sections.extend( + [backtest_text, backtest_table, summary_text, liner_plot] + ) forecast_plots = [] if len(self.forecast_output.list_series_ids()) > 0: @@ -431,14 +441,13 @@ def _save_report( test_metrics_df: pd.DataFrame, ): """Saves resulting reports to the given folder.""" - import report_creator as rc unique_output_dir = self.spec.output_directory.url if ObjectStorageDetails.is_oci_path(unique_output_dir): storage_options = default_signer() else: - storage_options = dict() + storage_options = {} # report-creator html report if self.spec.generate_report: @@ -580,7 +589,7 @@ def _save_report( indent=4, ) else: - logger.info(f"All modeling completed successfully.") + logger.info("All modeling completed successfully.") def preprocess(self, df, series_id): """The method that needs to be implemented on the particular model level.""" @@ -622,8 +631,8 @@ def generate_train_metrics(self) -> pd.DataFrame: def _load_model(self): try: self.loaded_models = load_pkl(self.spec.previous_output_dir + "/model.pkl") - except: - logger.info("model.pkl is not present") + except Exception as e: + logger.info(f"model.pkl is not present. Error: {e}") def _save_model(self, output_dir, storage_options): write_pkl( @@ -693,7 +702,7 @@ def explain_model(self): if not len(kernel_explnr_vals): logger.warn( - f"No explanations generated. Ensure that additional data has been provided." + "No explanations generated. Ensure that additional data has been provided." ) else: self.global_explanation[s_id] = dict( diff --git a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py index 5af3e304b..1911ebf0c 100644 --- a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +++ b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py @@ -2,6 +2,7 @@ # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import traceback import pandas as pd @@ -192,6 +193,8 @@ def _generate_report(self): import report_creator as rc from utilsforecast.plotting import plot_series + logging.getLogger("report_creator").setLevel(logging.WARNING) + # Section 1: Forecast Overview sec1_text = rc.Block( rc.Heading("Forecast Overview", level=2), diff --git a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py index 769b3948a..040f05748 100644 --- a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py @@ -1,45 +1,35 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback + import numpy as np import optuna import pandas as pd -from joblib import Parallel, delayed from torch import Tensor -from torchmetrics.regression import ( - MeanAbsoluteError, - MeanAbsolutePercentageError, - MeanSquaredError, - R2Score, - SymmetricMeanAbsolutePercentageError, -) from ads.common.decorator.runtime_dependency import ( OptionalDependency, runtime_dependency, ) from ads.opctl import logger - -from ..const import DEFAULT_TRIALS, ForecastOutputColumns, SupportedModels -from ads.opctl.operator.lowcode.forecast.utils import ( - load_pkl, - write_pkl, - _select_plot_list, - _label_encode_dataframe, -) from ads.opctl.operator.lowcode.common.utils import ( disable_print, enable_print, - seconds_to_datetime, ) -from .base_model import ForecastOperatorBaseModel +from ads.opctl.operator.lowcode.forecast.utils import ( + _select_plot_list, + load_pkl, + write_pkl, +) + +from ..const import DEFAULT_TRIALS, SupportedModels from ..operator_config import ForecastOperatorConfig +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -import traceback - # def _get_np_metrics_dict(selected_metric): # metric_translation = { @@ -62,7 +52,7 @@ object="NeuralProphet", install_from=OptionalDependency.FORECAST, ) -def _fit_model(data, params, additional_regressors, select_metric): +def _fit_model(data, params, additional_regressors): from neuralprophet import NeuralProphet, set_log_level if logger.level > 10: @@ -70,13 +60,12 @@ def _fit_model(data, params, additional_regressors, select_metric): disable_print() m = NeuralProphet(**params) - # m.metrics = _get_np_metrics_dict(select_metric) for add_reg in additional_regressors: m = m.add_future_regressor(name=add_reg) m.fit(df=data) - accepted_regressors_config = m.config_regressors or dict() + accepted_regressors_config = m.config_regressors or {} if hasattr(accepted_regressors_config, "regressors"): - accepted_regressors_config = accepted_regressors_config.regressors or dict() + accepted_regressors_config = accepted_regressors_config.regressors or {} enable_print() return m, list(accepted_regressors_config.keys()) @@ -97,11 +86,12 @@ def _load_model(self): self.loaded_trainers = load_pkl( self.spec.previous_output_dir + "/trainer.pkl" ) - except: - logger.debug("model.pkl/trainer.pkl is not present") + except Exception as e: + logger.debug(f"model.pkl/trainer.pkl is not present. Error message: {e}") def set_kwargs(self): # Extract the Confidence Interval Width and convert to prophet's equivalent - interval_width + model_kwargs = self.spec.model_kwargs if self.spec.confidence_interval_width is None: quantiles = model_kwargs.get("quantiles", [0.05, 0.95]) self.spec.confidence_interval_width = float(quantiles[1]) - float( @@ -110,8 +100,6 @@ def set_kwargs(self): else: boundaries = round((1 - self.spec.confidence_interval_width) / 2, 2) quantiles = [boundaries, self.spec.confidence_interval_width + boundaries] - - model_kwargs = self.spec.model_kwargs model_kwargs["quantiles"] = quantiles return model_kwargs @@ -124,12 +112,10 @@ def _train_model(self, i, s_id, df, model_kwargs): if self.loaded_models is not None and s_id in self.loaded_models: model = self.loaded_models[s_id] - accepted_regressors_config = ( - model.config_regressors.regressors or dict() - ) + accepted_regressors_config = model.config_regressors.regressors or {} if hasattr(accepted_regressors_config, "regressors"): accepted_regressors_config = ( - accepted_regressors_config.regressors or dict() + accepted_regressors_config.regressors or {} ) self.accepted_regressors[s_id] = list(accepted_regressors_config.keys()) if self.loaded_trainers is not None and s_id in self.loaded_trainers: @@ -143,8 +129,6 @@ def _train_model(self, i, s_id, df, model_kwargs): data=data_i, params=model_kwargs, additional_regressors=self.additional_regressors, - select_metric=None, - # select_metric=self.spec.metric, ) logger.debug( @@ -205,7 +189,6 @@ def _train_model(self, i, s_id, df, model_kwargs): "config_normalization": model.config_normalization, "config_missing": model.config_missing, "config_model": model.config_model, - "config_normalization": model.config_normalization, "data_freq": model.data_freq, "fitted": model.fitted, "data_params": model.data_params, @@ -220,19 +203,19 @@ def _train_model(self, i, s_id, df, model_kwargs): self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(traceback.format_exc()) raise e def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() - self.trainers = dict() - self.outputs = dict() - self.errors_dict = dict() - self.explanations_info = dict() - self.accepted_regressors = dict() + self.models = {} + self.trainers = {} + self.outputs = {} + self.errors_dict = {} + self.explanations_info = {} + self.accepted_regressors = {} self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -282,7 +265,6 @@ def objective(trial): data=df_train, params=params, additional_regressors=self.additional_regressors, - select_metric=self.spec.metric, ) df_test = df_test[["y", "ds"] + accepted_regressors] @@ -326,6 +308,8 @@ def objective(trial): def _generate_report(self): import report_creator as rc + logging.getLogger("report_creator").setLevel(logging.WARNING) + series_ids = self.models.keys() all_sections = [] if len(series_ids) > 0: @@ -371,7 +355,7 @@ def _generate_report(self): sec5_text = rc.Heading("Neural Prophet Model Parameters", level=2) model_states = [] - for i, (s_id, m) in enumerate(self.models.items()): + for s_id, m in self.models.items(): model_states.append( pd.Series( m.state_dict(), @@ -449,7 +433,7 @@ def _save_model(self, output_dir, storage_options): ) def explain_model(self): - self.local_explanation = dict() + self.local_explanation = {} global_expl = [] rename_cols = { f"future_regressor_{col}": col diff --git a/ads/opctl/operator/lowcode/forecast/model/prophet.py b/ads/opctl/operator/lowcode/forecast/model/prophet.py index 40c842911..24121b531 100644 --- a/ads/opctl/operator/lowcode/forecast/model/prophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/prophet.py @@ -1,17 +1,23 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback + +import matplotlib as mpl import numpy as np import optuna import pandas as pd -import logging from joblib import Parallel, delayed -from ads.common.decorator.runtime_dependency import runtime_dependency + from ads.opctl import logger +from ads.opctl.operator.lowcode.common.utils import set_log_level from ads.opctl.operator.lowcode.forecast.operator_config import ForecastOperatorConfig +from ads.opctl.operator.lowcode.forecast.utils import ( + _select_plot_list, +) from ..const import ( DEFAULT_TRIALS, @@ -19,23 +25,14 @@ ForecastOutputColumns, SupportedModels, ) -from ads.opctl.operator.lowcode.forecast.utils import ( - _select_plot_list, - _label_encode_dataframe, -) -from ads.opctl.operator.lowcode.common.utils import set_log_level from .base_model import ForecastOperatorBaseModel -from ..operator_config import ForecastOperatorConfig from .forecast_datasets import ForecastDatasets, ForecastOutput -import traceback -import matplotlib as mpl - try: set_log_level("prophet", logger.level) set_log_level("cmdstanpy", logger.level) mpl.rcParams["figure.max_open_warning"] = 100 -except: +except Exception: pass @@ -73,9 +70,6 @@ def set_kwargs(self): def _train_model(self, i, series_id, df, model_kwargs): try: - from prophet import Prophet - from prophet.diagnostics import cross_validation, performance_metrics - self.forecast_output.init_series_output( series_id=series_id, data_at_series=df ) @@ -130,15 +124,15 @@ def _train_model(self, i, series_id, df, model_kwargs): self.errors_dict[series_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() - self.outputs = dict() + self.models = {} + self.outputs = {} self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -249,6 +243,8 @@ def _generate_report(self): import report_creator as rc from prophet.plot import add_changepoints_to_plot + logging.getLogger("report_creator").setLevel(logging.WARNING) + series_ids = self.models.keys() all_sections = [] if len(series_ids) > 0: @@ -351,7 +347,6 @@ def _generate_report(self): # Append the global explanation text and section to the "all_sections" list all_sections = all_sections + [ global_explanation_section, - local_explanation_text, local_explanation_section, ] except Exception as e: diff --git a/ads/opctl/operator/lowcode/forecast/utils.py b/ads/opctl/operator/lowcode/forecast/utils.py index 76f554ff8..72c7b727a 100644 --- a/ads/opctl/operator/lowcode/forecast/utils.py +++ b/ads/opctl/operator/lowcode/forecast/utils.py @@ -1,41 +1,41 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os -import sys -from typing import List +from typing import Set +import cloudpickle import fsspec import numpy as np import pandas as pd -import cloudpickle -import plotly.express as px +import report_creator as rc from plotly import graph_objects as go +from scipy.stats import linregress from sklearn.metrics import ( explained_variance_score, mean_absolute_percentage_error, mean_squared_error, + r2_score, ) -from scipy.stats import linregress -from sklearn.metrics import r2_score - from ads.common.object_storage_details import ObjectStorageDetails from ads.dataset.label_encoder import DataFrameLabelEncoder from ads.opctl import logger - -from .const import SupportedMetrics, SupportedModels, RENDER_LIMIT -from .errors import ForecastInputDataError, ForecastSchemaYamlError -from .operator_config import ForecastOperatorSpec, ForecastOperatorConfig -from ads.opctl.operator.lowcode.common.utils import merge_category_columns from ads.opctl.operator.lowcode.forecast.const import ForecastOutputColumns -import report_creator as rc +from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import ( + ForecastOutput, + TestData, +) + +from .const import RENDER_LIMIT, SupportedMetrics +logging.getLogger("report_creator").setLevel(logging.WARNING) -def _label_encode_dataframe(df, no_encode=set()): + +def _label_encode_dataframe(df, no_encode: Set = None): df_to_encode = df[list(set(df.columns) - no_encode)] le = DataFrameLabelEncoder().fit(df_to_encode) return le, le.transform(df) @@ -54,15 +54,14 @@ def smape(actual, predicted) -> float: denominator[zero_mask] = 1 numerator = np.abs(actual - predicted) - default_output = np.ones_like(numerator) * np.inf abs_error = np.divide(numerator, denominator) return round(np.mean(abs_error) * 100, 2) def _build_metrics_per_horizon( - test_data: "TestData", - output: "ForecastOutput", + test_data: TestData, + output: ForecastOutput, ) -> pd.DataFrame: """ Calculates Mean sMAPE, Median sMAPE, Mean MAPE, Median MAPE, Mean wMAPE, Median wMAPE for each horizon @@ -172,7 +171,7 @@ def _build_metrics_per_horizon( def load_pkl(filepath): - storage_options = dict() + storage_options = {} if ObjectStorageDetails.is_oci_path(filepath): storage_options = default_signer() @@ -194,13 +193,13 @@ def write_pkl(obj, filename, output_dir, storage_options): def _build_metrics_df(y_true, y_pred, series_id): if len(y_true) == 0 or len(y_pred) == 0: return pd.DataFrame() - metrics = dict() + metrics = {} metrics["sMAPE"] = smape(actual=y_true, predicted=y_pred) metrics["MAPE"] = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred) metrics["RMSE"] = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred)) try: metrics["r2"] = linregress(y_true, y_pred).rvalue ** 2 - except: + except Exception: metrics["r2"] = r2_score(y_true=y_true, y_pred=y_pred) metrics["Explained Variance"] = explained_variance_score( y_true=y_true, y_pred=y_pred @@ -208,16 +207,13 @@ def _build_metrics_df(y_true, y_pred, series_id): return pd.DataFrame.from_dict(metrics, orient="index", columns=[series_id]) -def evaluate_train_metrics(output, metrics_col_name=None): +def evaluate_train_metrics(output): """ Training metrics Parameters: output: ForecastOutputs - metrics_col_name: str - Only passed in if the series column was created artifically. - When passed in, replaces s_id as the column name in the metrics table """ total_metrics = pd.DataFrame() for s_id in output.list_series_ids(): @@ -262,20 +258,21 @@ def _select_plot_list(fn, series_ids): def _add_unit(num, unit): return f"{num} {unit}" + def get_auto_select_plot(backtest_results): fig = go.Figure() columns = backtest_results.columns.tolist() back_test_column = "backtest" columns.remove(back_test_column) - for i, column in enumerate(columns): - color = 0 #int(i * 255 / len(columns)) + for column in columns: fig.add_trace( go.Scatter( - x=backtest_results[back_test_column], - y=backtest_results[column], - mode="lines", - name=column, - )) + x=backtest_results[back_test_column], + y=backtest_results[column], + mode="lines", + name=column, + ) + ) return rc.Widget(fig) @@ -383,6 +380,7 @@ def plot_forecast_plotly(s_id): return _select_plot_list(plot_forecast_plotly, forecast_output.list_series_ids()) + def convert_target(target: str, target_col: str): """ Removes the target_column that got appended to target. diff --git a/ads/opctl/operator/lowcode/pii/model/report.py b/ads/opctl/operator/lowcode/pii/model/report.py index 50e0fe579..d4fca2d9b 100644 --- a/ads/opctl/operator/lowcode/pii/model/report.py +++ b/ads/opctl/operator/lowcode/pii/model/report.py @@ -1,10 +1,10 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os import random import tempfile @@ -40,11 +40,13 @@ try: import report_creator as rc -except ImportError: +except ImportError as e: raise ModuleNotFoundError( f"`report-creator` module was not found. Please run " f"`pip install {OptionalDependency.PII}`." - ) + ) from e + +logging.getLogger("report_creator").setLevel(logging.WARNING) @dataclass(repr=True) @@ -139,13 +141,13 @@ def make_model_card(model_name="", readme_path=""): fig = go.Figure( data=[ go.Table( - header=dict(values=list(df.columns)), - cells=dict(values=[df.Metrics, df.Values]), + header={"Columns": df.columns}, + cells={"Metrics": df.Metrics, "Values": df.Values}, ) ] ) eval_res_tb = rc.Widget(data=fig, caption="Evaluation Results") - except: + except Exception: eval_res_tb = rc.Text("-") logger.warning( "The given readme.md doesn't have correct template for Evaluation Results." @@ -321,7 +323,9 @@ def make_view(self): self.report_sections = [title_text, report_description, time_proceed, structure] return self - def save_report(self, report_sections=None, report_uri=None, storage_options={}): + def save_report( + self, report_sections=None, report_uri=None, storage_options: Dict = None + ): with tempfile.TemporaryDirectory() as temp_dir: report_local_path = os.path.join(temp_dir, "___report.html") disable_print() diff --git a/ads/opctl/operator/lowcode/recommender/model/base_model.py b/ads/opctl/operator/lowcode/recommender/model/base_model.py index f317b19b0..bd4ab9f3c 100644 --- a/ads/opctl/operator/lowcode/recommender/model/base_model.py +++ b/ads/opctl/operator/lowcode/recommender/model/base_model.py @@ -1,39 +1,43 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os import tempfile import time from abc import ABC, abstractmethod -from typing import Tuple, Dict +from typing import Dict, Tuple import fsspec import pandas as pd import report_creator as rc +from plotly import graph_objects as go from ads.common.object_storage_details import ObjectStorageDetails from ads.opctl import logger -from ads.opctl.operator.lowcode.common.utils import default_signer from ads.opctl.operator.lowcode.common.utils import ( - human_time_friendly, - enable_print, + default_signer, disable_print, + enable_print, + human_time_friendly, write_data, ) + +from ..operator_config import RecommenderOperatorConfig from .factory import SupportedModels from .recommender_dataset import RecommenderDatasets -from ..operator_config import RecommenderOperatorConfig -from plotly import graph_objects as go -import matplotlib.pyplot as plt + +logging.getLogger("report_creator").setLevel(logging.WARNING) class RecommenderOperatorBaseModel(ABC): """The base class for the recommender detection operator models.""" - def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets): + def __init__( + self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets + ): self.config = config self.spec = self.config.spec self.datasets = datasets @@ -71,7 +75,7 @@ def generate_report(self): rc.Metric( heading="Num items", value=len(self.datasets.items), - ) + ), ), ) @@ -83,62 +87,67 @@ def generate_report(self): user_rating_counts = self.datasets.interactions[user_col].value_counts() fig_user = go.Figure(data=[go.Histogram(x=user_rating_counts, nbinsx=100)]) fig_user.update_layout( - title=f'Distribution of the number of interactions by {user_col}', - xaxis_title=f'Number of {interaction_col}', - yaxis_title=f'Number of {user_col}', - bargap=0.2 + title=f"Distribution of the number of interactions by {user_col}", + xaxis_title=f"Number of {interaction_col}", + yaxis_title=f"Number of {user_col}", + bargap=0.2, ) item_title = rc.Heading("Item Statistics", level=2) item_rating_counts = self.datasets.interactions[item_col].value_counts() fig_item = go.Figure(data=[go.Histogram(x=item_rating_counts, nbinsx=100)]) fig_item.update_layout( - title=f'Distribution of the number of interactions by {item_col}', - xaxis_title=f'Number of {interaction_col}', - yaxis_title=f'Number of {item_col}', - bargap=0.2 + title=f"Distribution of the number of interactions by {item_col}", + xaxis_title=f"Number of {interaction_col}", + yaxis_title=f"Number of {item_col}", + bargap=0.2, ) result_heatmap_title = rc.Heading("Sample Recommendations", level=2) sample_items = result_df[item_col].head(100).index filtered_df = result_df[result_df[item_col].isin(sample_items)] - data = filtered_df.pivot(index=user_col, columns=item_col, values=interaction_col) - fig = go.Figure(data=go.Heatmap( - z=data.values, - x=data.columns, - y=data.index, - colorscale='Viridis' - )) + data = filtered_df.pivot( + index=user_col, columns=item_col, values=interaction_col + ) + fig = go.Figure( + data=go.Heatmap( + z=data.values, x=data.columns, y=data.index, colorscale="Viridis" + ) + ) fig.update_layout( - title='Recommendation heatmap of User-Item Interactions (sample)', + title="Recommendation heatmap of User-Item Interactions (sample)", width=1500, height=800, xaxis_title=item_col, yaxis_title=user_col, - coloraxis_colorbar=dict(title=interaction_col) + coloraxis_colorbar={"title": interaction_col}, ) - plots = [user_title, rc.Widget(fig_user), - item_title, rc.Widget(fig_item), - result_heatmap_title, rc.Widget(fig)] + plots = [ + user_title, + rc.Widget(fig_user), + item_title, + rc.Widget(fig_item), + result_heatmap_title, + rc.Widget(fig), + ] test_metrics_sections = [rc.DataTable(pd.DataFrame(metrics, index=[0]))] yaml_appendix_title = rc.Heading("Reference: YAML File", level=2) yaml_appendix = rc.Yaml(self.config.to_dict()) report_sections = ( - [summary] - + plots - + test_metrics_sections - + other_sections - + [yaml_appendix_title, yaml_appendix] + [summary] + + plots + + test_metrics_sections + + other_sections + + [yaml_appendix_title, yaml_appendix] ) # save the report and result CSV - self._save_report( - report_sections=report_sections, - result_df=result_df - ) + self._save_report(report_sections=report_sections, result_df=result_df) + @abstractmethod def _evaluation_metrics(self): pass + @abstractmethod def _test_data_evaluate_metrics(self): pass @@ -150,7 +159,7 @@ def _save_report(self, report_sections: Tuple, result_df: pd.DataFrame): if ObjectStorageDetails.is_oci_path(unique_output_dir): storage_options = default_signer() else: - storage_options = dict() + storage_options = {} # report-creator html report if self.spec.generate_report: @@ -161,19 +170,23 @@ def _save_report(self, report_sections: Tuple, result_df: pd.DataFrame): report.save(rc.Block(*report_sections), report_local_path) enable_print() - report_path = os.path.join(unique_output_dir, self.spec.report_filename) + report_path = os.path.join( + unique_output_dir, self.spec.report_filename + ) with open(report_local_path) as f1: with fsspec.open( - report_path, - "w", - **storage_options, + report_path, + "w", + **storage_options, ) as f2: f2.write(f1.read()) # recommender csv report write_data( data=result_df, - filename=os.path.join(unique_output_dir, self.spec.recommendations_filename), + filename=os.path.join( + unique_output_dir, self.spec.recommendations_filename + ), format="csv", storage_options=storage_options, ) diff --git a/ads/opctl/operator/lowcode/recommender/model/svd.py b/ads/opctl/operator/lowcode/recommender/model/svd.py index 968170986..8411c6967 100644 --- a/ads/opctl/operator/lowcode/recommender/model/svd.py +++ b/ads/opctl/operator/lowcode/recommender/model/svd.py @@ -1,28 +1,30 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -from typing import Tuple, Dict, Any - # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +from typing import Dict, Tuple import pandas as pd +import report_creator as rc from pandas import DataFrame +from surprise import SVD, Dataset, Reader +from surprise.accuracy import mae, rmse +from surprise.model_selection import train_test_split -from .recommender_dataset import RecommenderDatasets +from ..constant import SupportedMetrics from ..operator_config import RecommenderOperatorConfig from .factory import RecommenderOperatorBaseModel -from surprise import Dataset, Reader -from surprise.model_selection import train_test_split -from surprise import SVD -from surprise.accuracy import rmse, mae -import report_creator as rc -from ..constant import SupportedMetrics +from .recommender_dataset import RecommenderDatasets + +logging.getLogger("report_creator").setLevel(logging.WARNING) class SVDOperatorModel(RecommenderOperatorBaseModel): """Class representing scikit surprise SVD operator model.""" - def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets): + def __init__( + self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets + ): super().__init__(config, datasets) self.interactions = datasets.interactions self.users = datasets.users @@ -35,8 +37,12 @@ def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatas def _get_recommendations(self, user_id, n): all_item_ids = self.items[self.item_id].unique() - rated_items = self.interactions[self.interactions[self.user_id] == user_id][self.item_id] - unrated_items = [item_id for item_id in all_item_ids if item_id not in rated_items.values] + rated_items = self.interactions[self.interactions[self.user_id] == user_id][ + self.item_id + ] + unrated_items = [ + item_id for item_id in all_item_ids if item_id not in rated_items.values + ] predictions = [self.algo.predict(user_id, item_id) for item_id in unrated_items] predictions.sort(key=lambda x: x.est, reverse=True) top_n_recommendations = predictions[:n] @@ -46,7 +52,10 @@ def _build_model(self) -> Tuple[DataFrame, Dict]: min_rating = self.interactions[self.interaction_column].min() max_rating = self.interactions[self.interaction_column].max() reader = Reader(rating_scale=(min_rating, max_rating)) - data = Dataset.load_from_df(self.interactions[[self.user_id, self.item_id, self.interaction_column]], reader) + data = Dataset.load_from_df( + self.interactions[[self.user_id, self.item_id, self.interaction_column]], + reader, + ) trainset, testset = train_test_split(data, test_size=self.test_size) self.algo.fit(trainset) predictions = self.algo.test(testset) @@ -58,11 +67,13 @@ def _build_model(self) -> Tuple[DataFrame, Dict]: for user_id in self.users[self.user_id]: recommendations = self._get_recommendations(user_id, n=self.spec.top_k) for item_id, est_rating in recommendations: - all_recommendations.append({ - self.user_id: user_id, - self.item_id: item_id, - self.interaction_column: est_rating - }) + all_recommendations.append( + { + self.user_id: user_id, + self.item_id: item_id, + self.interaction_column: est_rating, + } + ) recommendations_df = pd.DataFrame(all_recommendations) return recommendations_df, metric @@ -72,17 +83,18 @@ def _generate_report(self): decompose a user-item interaction matrix into three constituent matrices. These matrices capture the latent factors that explain the observed interactions. """ - new_user_recommendations = self._get_recommendations("__new_user__", self.spec.top_k) + new_user_recommendations = self._get_recommendations( + "__new_user__", self.spec.top_k + ) new_recommendations = [] for item_id, est_rating in new_user_recommendations: - new_recommendations.append({ - self.user_id: "__new_user__", - self.item_id: item_id, - self.interaction_column: est_rating - }) + new_recommendations.append( + { + self.user_id: "__new_user__", + self.item_id: item_id, + self.interaction_column: est_rating, + } + ) title = rc.Heading("Recommendations for new users", level=2) other_sections = [title, rc.DataTable(new_recommendations)] - return ( - model_description, - other_sections - ) + return (model_description, other_sections) diff --git a/pyproject.toml b/pyproject.toml index 833fefed6..269333122 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,13 +171,13 @@ forecast = [ "statsmodels", "plotly", "oracledb", - "report-creator==1.0.9", + "report-creator==1.0.26", ] anomaly = [ "oracle_ads[opctl]", "autots", "oracledb", - "report-creator==1.0.9", + "report-creator==1.0.26", "rrcf==0.4.4", "scikit-learn", "salesforce-merlion[all]==2.0.4" @@ -186,7 +186,7 @@ recommender = [ "oracle_ads[opctl]", "scikit-surprise", "plotly", - "report-creator==1.0.9", + "report-creator==1.0.26", ] feature-store-marketplace = [ "oracle-ads[opctl]", @@ -202,7 +202,7 @@ pii = [ "scrubadub_spacy", "spacy-transformers==1.2.5", "spacy==3.6.1", - "report-creator==1.0.9", + "report-creator==1.0.26", ] llm = ["langchain>=0.2", "langchain-community", "langchain_openai", "pydantic>=2,<3", "evaluate>=0.4.0"] aqua = ["jupyter_server"] From c98445a32a334149a07462ca036309bc6d3177c4 Mon Sep 17 00:00:00 2001 From: Allen Date: Mon, 11 Nov 2024 18:12:46 +0000 Subject: [PATCH 02/29] change report-creator logging to 'root' --- ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py | 2 +- ads/opctl/operator/lowcode/anomaly/model/automlx.py | 2 +- ads/opctl/operator/lowcode/anomaly/model/autots.py | 2 +- ads/opctl/operator/lowcode/anomaly/model/base_model.py | 2 +- ads/opctl/operator/lowcode/anomaly/model/isolationforest.py | 2 +- ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py | 2 +- ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py | 2 +- ads/opctl/operator/lowcode/forecast/model/arima.py | 2 +- ads/opctl/operator/lowcode/forecast/model/automlx.py | 2 +- ads/opctl/operator/lowcode/forecast/model/autots.py | 2 +- ads/opctl/operator/lowcode/forecast/model/base_model.py | 2 +- ads/opctl/operator/lowcode/forecast/model/ml_forecast.py | 2 +- ads/opctl/operator/lowcode/forecast/model/neuralprophet.py | 2 +- ads/opctl/operator/lowcode/forecast/model/prophet.py | 2 +- ads/opctl/operator/lowcode/forecast/utils.py | 2 +- ads/opctl/operator/lowcode/pii/model/report.py | 2 +- ads/opctl/operator/lowcode/recommender/model/base_model.py | 2 +- ads/opctl/operator/lowcode/recommender/model/svd.py | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py index 8999b2674..308d97370 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +++ b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py @@ -23,7 +23,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class AnomalyMerlionOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/anomaly/model/automlx.py b/ads/opctl/operator/lowcode/anomaly/model/automlx.py index 059545cf8..6e665c125 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/automlx.py +++ b/ads/opctl/operator/lowcode/anomaly/model/automlx.py @@ -15,7 +15,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class AutoMLXOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/anomaly/model/autots.py b/ads/opctl/operator/lowcode/anomaly/model/autots.py index 550833a67..32702596c 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/autots.py +++ b/ads/opctl/operator/lowcode/anomaly/model/autots.py @@ -15,7 +15,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class AutoTSOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py index c9ca984be..c24068ccb 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/base_model.py +++ b/ads/opctl/operator/lowcode/anomaly/model/base_model.py @@ -35,7 +35,7 @@ from ..operator_config import AnomalyOperatorConfig, AnomalyOperatorSpec from .anomaly_dataset import AnomalyDatasets, AnomalyOutput, TestData -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class AnomalyOperatorBaseModel(ABC): diff --git a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py index ef7715653..b5adfd6cc 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py @@ -15,7 +15,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class IsolationForestOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py index f6177e63d..c6d3269ad 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +++ b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py @@ -15,7 +15,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class OneClassSVMOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py index ad34159ab..0ea344228 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py @@ -16,7 +16,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/forecast/model/arima.py b/ads/opctl/operator/lowcode/forecast/model/arima.py index 17817257f..87edccdfa 100644 --- a/ads/opctl/operator/lowcode/forecast/model/arima.py +++ b/ads/opctl/operator/lowcode/forecast/model/arima.py @@ -20,7 +20,7 @@ from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class ArimaOperatorModel(ForecastOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/forecast/model/automlx.py b/ads/opctl/operator/lowcode/forecast/model/automlx.py index b36c383ad..41846a5d3 100644 --- a/ads/opctl/operator/lowcode/forecast/model/automlx.py +++ b/ads/opctl/operator/lowcode/forecast/model/automlx.py @@ -24,7 +24,7 @@ from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) AUTOMLX_N_ALGOS_TUNED = 4 AUTOMLX_DEFAULT_SCORE_METRIC = "neg_sym_mean_abs_percent_error" diff --git a/ads/opctl/operator/lowcode/forecast/model/autots.py b/ads/opctl/operator/lowcode/forecast/model/autots.py index ca3310bab..fac04a898 100644 --- a/ads/opctl/operator/lowcode/forecast/model/autots.py +++ b/ads/opctl/operator/lowcode/forecast/model/autots.py @@ -20,7 +20,7 @@ from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) AUTOTS_MAX_GENERATION = 10 AUTOTS_MODELS_TO_VALIDATE = 0.15 diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py index 357426a79..84aa53208 100644 --- a/ads/opctl/operator/lowcode/forecast/model/base_model.py +++ b/ads/opctl/operator/lowcode/forecast/model/base_model.py @@ -51,7 +51,7 @@ from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec from .forecast_datasets import ForecastDatasets -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class ForecastOperatorBaseModel(ABC): diff --git a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py index 1911ebf0c..9907a26e7 100644 --- a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +++ b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py @@ -193,7 +193,7 @@ def _generate_report(self): import report_creator as rc from utilsforecast.plotting import plot_series - logging.getLogger("report_creator").setLevel(logging.WARNING) + logging.getLogger("root").setLevel(logging.WARNING) # Section 1: Forecast Overview sec1_text = rc.Block( diff --git a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py index 040f05748..08afa092a 100644 --- a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py @@ -308,7 +308,7 @@ def objective(trial): def _generate_report(self): import report_creator as rc - logging.getLogger("report_creator").setLevel(logging.WARNING) + logging.getLogger("root").setLevel(logging.WARNING) series_ids = self.models.keys() all_sections = [] diff --git a/ads/opctl/operator/lowcode/forecast/model/prophet.py b/ads/opctl/operator/lowcode/forecast/model/prophet.py index 24121b531..fc70b6c11 100644 --- a/ads/opctl/operator/lowcode/forecast/model/prophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/prophet.py @@ -243,7 +243,7 @@ def _generate_report(self): import report_creator as rc from prophet.plot import add_changepoints_to_plot - logging.getLogger("report_creator").setLevel(logging.WARNING) + logging.getLogger("root").setLevel(logging.WARNING) series_ids = self.models.keys() all_sections = [] diff --git a/ads/opctl/operator/lowcode/forecast/utils.py b/ads/opctl/operator/lowcode/forecast/utils.py index 72c7b727a..e3a88d7b7 100644 --- a/ads/opctl/operator/lowcode/forecast/utils.py +++ b/ads/opctl/operator/lowcode/forecast/utils.py @@ -32,7 +32,7 @@ from .const import RENDER_LIMIT, SupportedMetrics -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) def _label_encode_dataframe(df, no_encode: Set = None): diff --git a/ads/opctl/operator/lowcode/pii/model/report.py b/ads/opctl/operator/lowcode/pii/model/report.py index d4fca2d9b..70ef098d8 100644 --- a/ads/opctl/operator/lowcode/pii/model/report.py +++ b/ads/opctl/operator/lowcode/pii/model/report.py @@ -46,7 +46,7 @@ f"`pip install {OptionalDependency.PII}`." ) from e -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) @dataclass(repr=True) diff --git a/ads/opctl/operator/lowcode/recommender/model/base_model.py b/ads/opctl/operator/lowcode/recommender/model/base_model.py index bd4ab9f3c..c345f84a7 100644 --- a/ads/opctl/operator/lowcode/recommender/model/base_model.py +++ b/ads/opctl/operator/lowcode/recommender/model/base_model.py @@ -29,7 +29,7 @@ from .factory import SupportedModels from .recommender_dataset import RecommenderDatasets -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class RecommenderOperatorBaseModel(ABC): diff --git a/ads/opctl/operator/lowcode/recommender/model/svd.py b/ads/opctl/operator/lowcode/recommender/model/svd.py index 8411c6967..a92a51fda 100644 --- a/ads/opctl/operator/lowcode/recommender/model/svd.py +++ b/ads/opctl/operator/lowcode/recommender/model/svd.py @@ -16,7 +16,7 @@ from .factory import RecommenderOperatorBaseModel from .recommender_dataset import RecommenderDatasets -logging.getLogger("report_creator").setLevel(logging.WARNING) +logging.getLogger("root").setLevel(logging.WARNING) class SVDOperatorModel(RecommenderOperatorBaseModel): From 47cb1f87e03fde8746425618da89a2cf94a0a51e Mon Sep 17 00:00:00 2001 From: Allen Date: Mon, 11 Nov 2024 18:24:33 +0000 Subject: [PATCH 03/29] clean up formatting --- ads/opctl/operator/lowcode/common/data.py | 35 ++++---- .../forecast/model/forecast_datasets.py | 84 ++++++++----------- 2 files changed, 55 insertions(+), 64 deletions(-) diff --git a/ads/opctl/operator/lowcode/common/data.py b/ads/opctl/operator/lowcode/common/data.py index 530a1d392..9426bd284 100644 --- a/ads/opctl/operator/lowcode/common/data.py +++ b/ads/opctl/operator/lowcode/common/data.py @@ -1,29 +1,28 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import time -from .transformations import Transformations +from abc import ABC, abstractmethod + +import pandas as pd + from ads.opctl import logger from ads.opctl.operator.lowcode.common.const import DataColumns -from ads.opctl.operator.lowcode.common.utils import load_data from ads.opctl.operator.lowcode.common.errors import ( - InputDataError, InvalidParameterError, - PermissionsError, - DataMismatchError, ) -from abc import ABC -import pandas as pd +from ads.opctl.operator.lowcode.common.utils import load_data + +from .transformations import Transformations class AbstractData(ABC): def __init__(self, spec: dict, name="input_data"): self.Transformations = Transformations self.data = None - self._data_dict = dict() + self._data_dict = {} self.name = name self.spec = spec self.load_transform_ingest_data(spec) @@ -35,12 +34,15 @@ def get_raw_data_by_cat(self, category): condition = pd.Series(True, index=self.raw_data.index) if category in mapping: for col, val in mapping[category].items(): - condition &= (self.raw_data[col] == val) + condition &= self.raw_data[col] == val data_by_cat = self.raw_data[condition].reset_index(drop=True) - data_by_cat = self._data_transformer._format_datetime_col(data_by_cat) if self.spec.datetime_column else data_by_cat + data_by_cat = ( + self._data_transformer._format_datetime_col(data_by_cat) + if self.spec.datetime_column + else data_by_cat + ) return data_by_cat - def get_dict_by_series(self): if not self._data_dict: for s_id in self.list_series_ids(): @@ -59,12 +61,12 @@ def get_data_for_series(self, series_id): data_dict = self.get_dict_by_series() try: return data_dict[series_id] - except: + except Exception as e: raise InvalidParameterError( f"Unable to retrieve series {series_id} from {self.name}. Available series ids are: {self.list_series_ids()}" - ) + ) from e - def _load_data(self, data_spec, **kwargs): + def _load_data(self, data_spec): loading_start_time = time.time() try: raw_data = load_data(data_spec) @@ -77,7 +79,7 @@ def _load_data(self, data_spec, **kwargs): ) return raw_data - def _transform_data(self, spec, raw_data, **kwargs): + def _transform_data(self, spec, raw_data): transformation_start_time = time.time() self._data_transformer = self.Transformations(spec, name=self.name) data = self._data_transformer.run(raw_data) @@ -92,6 +94,7 @@ def load_transform_ingest_data(self, spec): self.data = self._transform_data(spec, self.raw_data) self._ingest_data(spec) + @abstractmethod def _ingest_data(self, spec): pass diff --git a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py index c3804f88d..96807d7a1 100644 --- a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +++ b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py @@ -1,33 +1,23 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2023 Oracle and/or its affiliates. +# Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import time import pandas as pd -from pandas.api.types import is_datetime64_any_dtype, is_string_dtype, is_numeric_dtype -from ..operator_config import ForecastOperatorConfig from ads.opctl import logger -from ..const import ForecastOutputColumns, PROPHET_INTERNAL_DATE_COL -from ads.common.object_storage_details import ObjectStorageDetails -from ads.opctl.operator.lowcode.common.utils import ( - get_frequency_in_seconds, - get_frequency_of_datetime, -) from ads.opctl.operator.lowcode.common.data import AbstractData -from ads.opctl.operator.lowcode.forecast.utils import ( - default_signer, -) from ads.opctl.operator.lowcode.common.errors import ( - InputDataError, - InvalidParameterError, - PermissionsError, DataMismatchError, + InvalidParameterError, +) +from ads.opctl.operator.lowcode.common.utils import ( + get_frequency_in_seconds, + get_frequency_of_datetime, ) -from ..const import SupportedModels -from abc import ABC, abstractmethod + +from ..const import ForecastOutputColumns, SupportedModels +from ..operator_config import ForecastOperatorConfig class HistoricalData(AbstractData): @@ -51,13 +41,12 @@ def _verify_dt_col(self, spec): self.freq_in_secs = get_frequency_in_seconds( self.data.index.get_level_values(0) ) - if spec.model == SupportedModels.AutoMLX: - if abs(self.freq_in_secs) < 3600: - message = ( - "{} requires data with a frequency of at least one hour. Please try using a different model," - " or select the 'auto' option.".format(SupportedModels.AutoMLX) - ) - raise InvalidParameterError(message) + if spec.model == SupportedModels.AutoMLX and abs(self.freq_in_secs) < 3600: + message = ( + f"{SupportedModels.AutoMLX} requires data with a frequency of at least one hour. Please try using a different model," + " or select the 'auto' option." + ) + raise InvalidParameterError(message) class AdditionalData(AbstractData): @@ -77,11 +66,11 @@ def __init__(self, spec, historical_data): else: self.name = "additional_data" self.data = None - self._data_dict = dict() + self._data_dict = {} self.create_horizon(spec, historical_data) def create_horizon(self, spec, historical_data): - logger.debug(f"No additional data provided. Constructing horizon.") + logger.debug("No additional data provided. Constructing horizon.") future_dates = pd.Series( pd.date_range( start=historical_data.get_max_time(), @@ -108,7 +97,7 @@ def create_horizon(self, spec, historical_data): ) self.additional_regressors = [] - def _ingest_data(self, spec): + def _ingest_data(self): self.additional_regressors = list(self.data.columns) if not self.additional_regressors: logger.warn( @@ -146,12 +135,11 @@ def _load_data(self, spec): self.historical_data = HistoricalData(spec) self.additional_data = AdditionalData(spec, self.historical_data) - if spec.generate_explanations: - if spec.additional_data is None: - logger.warn( - f"Unable to generate explanations as there is no additional data passed in. Either set generate_explanations to False, or pass in additional data." - ) - spec.generate_explanations = False + if spec.generate_explanations and spec.additional_data is None: + logger.warn( + "Unable to generate explanations as there is no additional data passed in. Either set generate_explanations to False, or pass in additional data." + ) + spec.generate_explanations = False def get_all_data_long(self, include_horizon=True): how = "outer" if include_horizon else "left" @@ -182,7 +170,7 @@ def get_data_multi_indexed(self): ) def get_data_by_series(self, include_horizon=True): - total_dict = dict() + total_dict = {} hist_data = self.historical_data.get_dict_by_series() add_data = self.additional_data.get_dict_by_series() how = "outer" if include_horizon else "left" @@ -200,10 +188,10 @@ def get_data_at_series(self, s_id, include_horizon=True): all_data = self.get_data_by_series(include_horizon=include_horizon) try: return all_data[s_id] - except: + except Exception as e: raise InvalidParameterError( f"Unable to retrieve series id: {s_id} from data. Available series ids are: {self.list_series_ids()}" - ) + ) from e def get_horizon_at_series(self, s_id): return self.get_data_at_series(s_id)[-self._horizon :] @@ -234,7 +222,7 @@ def list_series_ids(self, sorted=True): if sorted: try: series_ids.sort() - except: + except Exception: pass return series_ids @@ -269,7 +257,7 @@ def __init__( target_column: str the name of the original target column dt_column: the name of the original datetime column """ - self.series_id_map = dict() + self.series_id_map = {} self._set_ci_column_names(confidence_interval_width) self.horizon = horizon self.target_column_name = target_column @@ -281,7 +269,7 @@ def add_series_id( forecast: pd.DataFrame, overwrite: bool = False, ): - if not overwrite and series_id in self.series_id_map.keys(): + if not overwrite and series_id in self.series_id_map: raise ValueError( f"Attempting to update ForecastOutput for series_id {series_id} when this already exists. Set overwrite to True." ) @@ -321,15 +309,15 @@ def populate_series_output( """ try: output_i = self.series_id_map[series_id] - except KeyError: + except KeyError as e: raise ValueError( f"Attempting to update output for series: {series_id}, however no series output has been initialized." - ) + ) from e if (output_i.shape[0] - self.horizon) == len(fit_val): - output_i["fitted_value"].iloc[ - : -self.horizon - ] = fit_val # Note: may need to do len(output_i) - (len(fit_val) + horizon) : -horizon + output_i["fitted_value"].iloc[: -self.horizon] = ( + fit_val # Note: may need to do len(output_i) - (len(fit_val) + horizon) : -horizon + ) elif (output_i.shape[0] - self.horizon) > len(fit_val): logger.debug( f"Fitted Values were only generated on a subset ({len(fit_val)}/{(output_i.shape[0] - self.horizon)}) of the data for Series: {series_id}." @@ -378,7 +366,7 @@ def get_horizon_long(self): def get_forecast(self, series_id): try: return self.series_id_map[series_id] - except KeyError as ke: + except KeyError: logger.debug( f"No Forecast found for series_id: {series_id}. Returning empty DataFrame." ) @@ -389,7 +377,7 @@ def list_series_ids(self, sorted=True): if sorted: try: series_ids.sort() - except: + except Exception: pass return series_ids From caa75bac0cef44f51345366d875ce5f95834c553 Mon Sep 17 00:00:00 2001 From: Allen Date: Mon, 11 Nov 2024 18:35:48 +0000 Subject: [PATCH 04/29] add back param --- ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py index 96807d7a1..73a81ac0b 100644 --- a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +++ b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py @@ -97,7 +97,8 @@ def create_horizon(self, spec, historical_data): ) self.additional_regressors = [] - def _ingest_data(self): + def _ingest_data(self, spec): + _spec = spec self.additional_regressors = list(self.data.columns) if not self.additional_regressors: logger.warn( From d1d2c215cc69cc704fa04736645ec024697ccda2 Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 08:10:36 +0000 Subject: [PATCH 05/29] drop rep-cr version --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 269333122..833fefed6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,13 +171,13 @@ forecast = [ "statsmodels", "plotly", "oracledb", - "report-creator==1.0.26", + "report-creator==1.0.9", ] anomaly = [ "oracle_ads[opctl]", "autots", "oracledb", - "report-creator==1.0.26", + "report-creator==1.0.9", "rrcf==0.4.4", "scikit-learn", "salesforce-merlion[all]==2.0.4" @@ -186,7 +186,7 @@ recommender = [ "oracle_ads[opctl]", "scikit-surprise", "plotly", - "report-creator==1.0.26", + "report-creator==1.0.9", ] feature-store-marketplace = [ "oracle-ads[opctl]", @@ -202,7 +202,7 @@ pii = [ "scrubadub_spacy", "spacy-transformers==1.2.5", "spacy==3.6.1", - "report-creator==1.0.26", + "report-creator==1.0.9", ] llm = ["langchain>=0.2", "langchain-community", "langchain_openai", "pydantic>=2,<3", "evaluate>=0.4.0"] aqua = ["jupyter_server"] From d4c2e14494d19683978066d4d700887f5b91ed10 Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 08:37:04 +0000 Subject: [PATCH 06/29] revert to main --- .../lowcode/anomaly/model/anomaly_merlion.py | 11 +- .../operator/lowcode/anomaly/model/automlx.py | 20 ++-- .../operator/lowcode/anomaly/model/autots.py | 9 +- .../lowcode/anomaly/model/base_model.py | 26 ++--- .../lowcode/anomaly/model/isolationforest.py | 19 ++-- .../lowcode/anomaly/model/oneclasssvm.py | 21 ++-- .../lowcode/anomaly/model/randomcutforest.py | 8 +- ads/opctl/operator/lowcode/common/data.py | 35 +++--- .../operator/lowcode/forecast/model/arima.py | 26 ++--- .../lowcode/forecast/model/automlx.py | 52 ++++----- .../operator/lowcode/forecast/model/autots.py | 34 +++--- .../lowcode/forecast/model/base_model.py | 81 ++++++-------- .../forecast/model/forecast_datasets.py | 83 ++++++++------ .../lowcode/forecast/model/ml_forecast.py | 3 - .../lowcode/forecast/model/neuralprophet.py | 76 ++++++++----- .../lowcode/forecast/model/prophet.py | 35 +++--- ads/opctl/operator/lowcode/forecast/utils.py | 62 ++++++----- .../operator/lowcode/pii/model/report.py | 18 ++- .../lowcode/recommender/model/base_model.py | 103 ++++++++---------- .../operator/lowcode/recommender/model/svd.py | 70 +++++------- 20 files changed, 380 insertions(+), 412 deletions(-) diff --git a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py index 308d97370..cc1e80b52 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +++ b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py @@ -4,11 +4,9 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import importlib -import logging import numpy as np import pandas as pd -import report_creator as rc from merlion.post_process.threshold import AggregateAlarms from merlion.utils import TimeSeries @@ -23,8 +21,6 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("root").setLevel(logging.WARNING) - class AnomalyMerlionOperatorModel(AnomalyOperatorBaseModel): """Class representing Merlion Anomaly Detection operator model.""" @@ -88,7 +84,7 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): data = df.set_index(date_column) data = TimeSeries.from_pd(data) - for _, (model_config, model) in model_config_map.items(): + for model_name, (model_config, model) in model_config_map.items(): if self.spec.model == SupportedModels.BOCPD: model_config = model_config(**self.spec.model_kwargs) else: @@ -119,7 +115,7 @@ def _build_model(self) -> AnomalyOutput: y_pred = (y_pred.to_pd().reset_index()["anom_score"] > 0).astype( int ) - except Exception: + except Exception as e: y_pred = ( scores["anom_score"] > np.percentile( @@ -139,12 +135,15 @@ def _build_model(self) -> AnomalyOutput: OutputColumns.SCORE_COL: scores["anom_score"], } ).reset_index(drop=True) + # model_objects[model_name].append(model) anomaly_output.add_output(target, anomaly, score) return anomaly_output def _generate_report(self): """Genreates a report for the model.""" + import report_creator as rc + other_sections = [ rc.Heading("Selected Models Overview", level=2), rc.Text( diff --git a/ads/opctl/operator/lowcode/anomaly/model/automlx.py b/ads/opctl/operator/lowcode/anomaly/model/automlx.py index 6e665c125..a6deef1fa 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/automlx.py +++ b/ads/opctl/operator/lowcode/anomaly/model/automlx.py @@ -1,21 +1,16 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging - import pandas as pd -import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency -from ads.opctl import logger -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns - from .anomaly_dataset import AnomalyOutput -from .base_model import AnomalyOperatorBaseModel -logging.getLogger("root").setLevel(logging.WARNING) +from .base_model import AnomalyOperatorBaseModel +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns class AutoMLXOperatorModel(AnomalyOperatorBaseModel): @@ -30,17 +25,16 @@ class AutoMLXOperatorModel(AnomalyOperatorBaseModel): ), ) def _build_model(self) -> pd.DataFrame: + from automlx import init import logging - import automlx - try: - automlx.init( + init( engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL, ) - except Exception: + except Exception as e: logger.info("Ray already initialized") date_column = self.spec.datetime_column.name anomaly_output = AnomalyOutput(date_column=date_column) @@ -79,6 +73,8 @@ def _build_model(self) -> pd.DataFrame: return anomaly_output def _generate_report(self): + import report_creator as rc + """The method that needs to be implemented on the particular model level.""" other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/autots.py b/ads/opctl/operator/lowcode/anomaly/model/autots.py index 32702596c..c795440de 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/autots.py +++ b/ads/opctl/operator/lowcode/anomaly/model/autots.py @@ -1,12 +1,9 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging - -import report_creator as rc - from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger from ads.opctl.operator.lowcode.anomaly.const import OutputColumns @@ -15,8 +12,6 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("root").setLevel(logging.WARNING) - class AutoTSOperatorModel(AnomalyOperatorBaseModel): """Class representing AutoTS Anomaly Detection operator model.""" @@ -96,6 +91,8 @@ def _build_model(self) -> AnomalyOutput: return anomaly_output def _generate_report(self): + import report_creator as rc + """The method that needs to be implemented on the particular model level.""" other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py index c24068ccb..e8de5213e 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/base_model.py +++ b/ads/opctl/operator/lowcode/anomaly/model/base_model.py @@ -3,7 +3,6 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging import os import tempfile import time @@ -13,7 +12,6 @@ import fsspec import numpy as np import pandas as pd -import report_creator as rc from sklearn import linear_model from ads.common.object_storage_details import ObjectStorageDetails @@ -35,8 +33,6 @@ from ..operator_config import AnomalyOperatorConfig, AnomalyOperatorSpec from .anomaly_dataset import AnomalyDatasets, AnomalyOutput, TestData -logging.getLogger("root").setLevel(logging.WARNING) - class AnomalyOperatorBaseModel(ABC): """The base class for the anomaly detection operator models.""" @@ -63,8 +59,8 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets): def generate_report(self): """Generates the report.""" import matplotlib.pyplot as plt - - plt.rcParams.update({"figure.max_open_warning": 0}) + plt.rcParams.update({'figure.max_open_warning': 0}) + import report_creator as rc start_time = time.time() # fallback using sklearn oneclasssvm when the sub model _build_model fails @@ -88,13 +84,7 @@ def generate_report(self): anomaly_output, test_data, elapsed_time ) table_blocks = [ - rc.DataTable( - df.head(SUBSAMPLE_THRESHOLD) - if self.spec.subsample_report_data and len(df) > SUBSAMPLE_THRESHOLD - else df, - label=col, - index=True, - ) + rc.DataTable(df.head(SUBSAMPLE_THRESHOLD) if self.spec.subsample_report_data and len(df) > SUBSAMPLE_THRESHOLD else df, label=col, index=True) for col, df in self.datasets.full_data_dict.items() ] data_table = rc.Select(blocks=table_blocks) @@ -154,9 +144,7 @@ def generate_report(self): else: figure_blocks = None - blocks.append( - rc.Group(*figure_blocks, label=target) - ) if figure_blocks else None + blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None plots = rc.Select(blocks) report_sections = [] @@ -166,9 +154,7 @@ def generate_report(self): yaml_appendix = rc.Yaml(self.config.to_dict()) summary = rc.Block( rc.Group( - rc.Text( - f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n" - ), + rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"), rc.Text( "Based on your dataset, you could have also selected " f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`." @@ -299,6 +285,8 @@ def _save_report( test_metrics: pd.DataFrame, ): """Saves resulting reports to the given folder.""" + import report_creator as rc + unique_output_dir = self.spec.output_directory.url if ObjectStorageDetails.is_oci_path(unique_output_dir): diff --git a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py index b5adfd6cc..0083ad0fd 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py @@ -1,21 +1,17 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging - import numpy as np import pandas as pd -import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns -from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel - -logging.getLogger("root").setLevel(logging.WARNING) +from .anomaly_dataset import AnomalyOutput +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns class IsolationForestOperatorModel(AnomalyOperatorBaseModel): @@ -40,9 +36,13 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): model = IsolationForest(**model_kwargs) model.fit(df) - y_pred = np.vectorize(self.outlier_map.get)(model.predict(df)) + y_pred = np.vectorize(self.outlier_map.get)( + model.predict(df) + ) - scores = model.score_samples(df) + scores = model.score_samples( + df + ) index_col = df.columns[0] @@ -59,6 +59,7 @@ def _build_model(self) -> AnomalyOutput: def _generate_report(self): """Generates the report.""" + import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py index c6d3269ad..157f7eb60 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +++ b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py @@ -1,21 +1,17 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging - import numpy as np import pandas as pd -import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns -from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel - -logging.getLogger("root").setLevel(logging.WARNING) +from .anomaly_dataset import AnomalyOutput +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns class OneClassSVMOperatorModel(AnomalyOperatorBaseModel): @@ -40,9 +36,13 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): model = OneClassSVM(**model_kwargs) model.fit(df) - y_pred = np.vectorize(self.outlier_map.get)(model.predict(df)) + y_pred = np.vectorize(self.outlier_map.get)( + model.predict(df) + ) - scores = model.score_samples(df) + scores = model.score_samples( + df + ) index_col = df.columns[0] @@ -54,11 +54,12 @@ def _build_model(self) -> AnomalyOutput: ).reset_index(drop=True) anomaly_output.add_output(target, anomaly, score) - + return anomaly_output def _generate_report(self): """Generates the report.""" + import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py index 0ea344228..17f19351d 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py @@ -3,11 +3,8 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging - import numpy as np import pandas as pd -import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger @@ -16,8 +13,6 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("root").setLevel(logging.WARNING) - class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): """ @@ -32,7 +27,7 @@ class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): ), ) def _build_model(self) -> AnomalyOutput: - import rrcf + from rrcf import RCTree model_kwargs = self.spec.model_kwargs @@ -101,6 +96,7 @@ def _build_model(self) -> AnomalyOutput: def _generate_report(self): """Generates the report.""" + import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/common/data.py b/ads/opctl/operator/lowcode/common/data.py index 9426bd284..530a1d392 100644 --- a/ads/opctl/operator/lowcode/common/data.py +++ b/ads/opctl/operator/lowcode/common/data.py @@ -1,28 +1,29 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import time -from abc import ABC, abstractmethod - -import pandas as pd - +from .transformations import Transformations from ads.opctl import logger from ads.opctl.operator.lowcode.common.const import DataColumns +from ads.opctl.operator.lowcode.common.utils import load_data from ads.opctl.operator.lowcode.common.errors import ( + InputDataError, InvalidParameterError, + PermissionsError, + DataMismatchError, ) -from ads.opctl.operator.lowcode.common.utils import load_data - -from .transformations import Transformations +from abc import ABC +import pandas as pd class AbstractData(ABC): def __init__(self, spec: dict, name="input_data"): self.Transformations = Transformations self.data = None - self._data_dict = {} + self._data_dict = dict() self.name = name self.spec = spec self.load_transform_ingest_data(spec) @@ -34,15 +35,12 @@ def get_raw_data_by_cat(self, category): condition = pd.Series(True, index=self.raw_data.index) if category in mapping: for col, val in mapping[category].items(): - condition &= self.raw_data[col] == val + condition &= (self.raw_data[col] == val) data_by_cat = self.raw_data[condition].reset_index(drop=True) - data_by_cat = ( - self._data_transformer._format_datetime_col(data_by_cat) - if self.spec.datetime_column - else data_by_cat - ) + data_by_cat = self._data_transformer._format_datetime_col(data_by_cat) if self.spec.datetime_column else data_by_cat return data_by_cat + def get_dict_by_series(self): if not self._data_dict: for s_id in self.list_series_ids(): @@ -61,12 +59,12 @@ def get_data_for_series(self, series_id): data_dict = self.get_dict_by_series() try: return data_dict[series_id] - except Exception as e: + except: raise InvalidParameterError( f"Unable to retrieve series {series_id} from {self.name}. Available series ids are: {self.list_series_ids()}" - ) from e + ) - def _load_data(self, data_spec): + def _load_data(self, data_spec, **kwargs): loading_start_time = time.time() try: raw_data = load_data(data_spec) @@ -79,7 +77,7 @@ def _load_data(self, data_spec): ) return raw_data - def _transform_data(self, spec, raw_data): + def _transform_data(self, spec, raw_data, **kwargs): transformation_start_time = time.time() self._data_transformer = self.Transformations(spec, name=self.name) data = self._data_transformer.run(raw_data) @@ -94,7 +92,6 @@ def load_transform_ingest_data(self, spec): self.data = self._transform_data(spec, self.raw_data) self._ingest_data(spec) - @abstractmethod def _ingest_data(self, spec): pass diff --git a/ads/opctl/operator/lowcode/forecast/model/arima.py b/ads/opctl/operator/lowcode/forecast/model/arima.py index 87edccdfa..6bbd58d34 100644 --- a/ads/opctl/operator/lowcode/forecast/model/arima.py +++ b/ads/opctl/operator/lowcode/forecast/model/arima.py @@ -1,26 +1,23 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging -import traceback - import pandas as pd +import numpy as np import pmdarima as pm -import report_creator as rc from joblib import Parallel, delayed from ads.opctl import logger -from ads.opctl.operator.lowcode.common.utils import seconds_to_datetime -from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe -from ..const import ForecastOutputColumns, SupportedModels -from ..operator_config import ForecastOperatorConfig +from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe +from ads.opctl.operator.lowcode.common.utils import seconds_to_datetime from .base_model import ForecastOperatorBaseModel +from ..operator_config import ForecastOperatorConfig +import traceback from .forecast_datasets import ForecastDatasets, ForecastOutput - -logging.getLogger("root").setLevel(logging.WARNING) +from ..const import ForecastOutputColumns, SupportedModels class ArimaOperatorModel(ForecastOperatorBaseModel): @@ -42,7 +39,7 @@ def set_kwargs(self): ) model_kwargs = self.spec.model_kwargs model_kwargs["alpha"] = 1 - self.spec.confidence_interval_width - if "error_action" not in model_kwargs: + if "error_action" not in model_kwargs.keys(): model_kwargs["error_action"] = "ignore" return model_kwargs @@ -132,14 +129,13 @@ def _train_model(self, i, s_id, df, model_kwargs): self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc(), - } + "error_trace": traceback.format_exc()} logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = {} + self.models = dict() self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -158,6 +154,8 @@ def _build_model(self) -> pd.DataFrame: def _generate_report(self): """The method that needs to be implemented on the particular model level.""" + import report_creator as rc + all_sections = [] if len(self.models) > 0: sec5_text = rc.Heading("ARIMA Model Parameters", level=2) diff --git a/ads/opctl/operator/lowcode/forecast/model/automlx.py b/ads/opctl/operator/lowcode/forecast/model/automlx.py index 41846a5d3..eda6112b4 100644 --- a/ads/opctl/operator/lowcode/forecast/model/automlx.py +++ b/ads/opctl/operator/lowcode/forecast/model/automlx.py @@ -1,30 +1,29 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- +import traceback + # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging -import traceback -import numpy as np import pandas as pd -import report_creator as rc - +import numpy as np from ads.common.decorator.runtime_dependency import runtime_dependency -from ads.opctl import logger -from ads.opctl.operator.lowcode.common.utils import ( - seconds_to_datetime, -) from ads.opctl.operator.lowcode.forecast.const import ( AUTOMLX_METRIC_MAP, ForecastOutputColumns, SupportedModels, ) -from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe +from ads.opctl import logger -from ..operator_config import ForecastOperatorConfig from .base_model import ForecastOperatorBaseModel +from ..operator_config import ForecastOperatorConfig from .forecast_datasets import ForecastDatasets, ForecastOutput +from ads.opctl.operator.lowcode.common.utils import ( + seconds_to_datetime, + datetime_to_seconds, +) +from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe -logging.getLogger("root").setLevel(logging.WARNING) AUTOMLX_N_ALGOS_TUNED = 4 AUTOMLX_DEFAULT_SCORE_METRIC = "neg_sym_mean_abs_percent_error" @@ -48,13 +47,12 @@ def set_kwargs(self): ) model_kwargs_cleaned.pop("task", None) time_budget = model_kwargs_cleaned.pop("time_budget", -1) - model_kwargs_cleaned["preprocessing"] = ( - self.spec.preprocessing.enabled - or model_kwargs_cleaned.get("preprocessing", True) - ) + model_kwargs_cleaned[ + "preprocessing" + ] = self.spec.preprocessing.enabled or model_kwargs_cleaned.get("preprocessing", True) return model_kwargs_cleaned, time_budget - def preprocess(self, data): # TODO: re-use self.le for explanations + def preprocess(self, data, series_id=None): # TODO: re-use self.le for explanations _, df_encoded = _label_encode_dataframe( data, no_encode={self.spec.datetime_column.name, self.original_target_column}, @@ -76,12 +74,11 @@ def preprocess(self, data): # TODO: re-use self.le for explanations ), ) def _build_model(self) -> pd.DataFrame: + from automlx import init import logging - import automlx - try: - automlx.init( + init( engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL, @@ -91,7 +88,7 @@ def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = {} + self.models = dict() horizon = self.spec.horizon self.spec.confidence_interval_width = self.spec.confidence_interval_width or 0.8 self.forecast_output = ForecastOutput( @@ -104,7 +101,7 @@ def _build_model(self) -> pd.DataFrame: # Clean up kwargs for pass through model_kwargs_cleaned, time_budget = self.set_kwargs() - for s_id, df in full_data_dict.items(): + for i, (s_id, df) in enumerate(full_data_dict.items()): try: logger.debug(f"Running automlx on series {s_id}") model_kwargs = model_kwargs_cleaned.copy() @@ -173,7 +170,7 @@ def _build_model(self) -> pd.DataFrame: self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc(), + "error_trace": traceback.format_exc() } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) @@ -200,12 +197,15 @@ def _generate_report(self): - ds_forecast_col (pd.Series): The pd.Series object representing the forecasted column. - ci_col_names (List[str]): A list of column names for the confidence interval in the report. """ - selected_models = {} + import report_creator as rc + + """The method that needs to be implemented on the particular model level.""" + selected_models = dict() models = self.models other_sections = [] if len(self.models) > 0: - for s_id, m in models.items(): + for i, (s_id, m) in enumerate(models.items()): selected_models[s_id] = { "series_id": s_id, "selected_model": m.selected_model_, @@ -352,7 +352,7 @@ def _custom_predict_automlx(self, data): """ data_temp = pd.DataFrame( data, - columns=list(self.dataset_cols), + columns=[col for col in self.dataset_cols], ) return self.models.get(self.series_id).forecast( diff --git a/ads/opctl/operator/lowcode/forecast/model/autots.py b/ads/opctl/operator/lowcode/forecast/model/autots.py index fac04a898..37b57ca75 100644 --- a/ads/opctl/operator/lowcode/forecast/model/autots.py +++ b/ads/opctl/operator/lowcode/forecast/model/autots.py @@ -1,26 +1,24 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import copy -import logging import traceback - import pandas as pd -import report_creator as rc +import numpy as np import yaml -from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger -from ads.opctl.operator.lowcode.forecast.utils import _select_plot_list - -from ..const import ForecastOutputColumns, SupportedModels -from ..operator_config import ForecastOperatorConfig +from ads.opctl.operator.lowcode.common.utils import seconds_to_datetime from .base_model import ForecastOperatorBaseModel +from ..operator_config import ForecastOperatorConfig +from ads.common.decorator.runtime_dependency import runtime_dependency from .forecast_datasets import ForecastDatasets, ForecastOutput +from ..const import ForecastOutputColumns, SupportedModels +from ads.opctl.operator.lowcode.forecast.utils import _select_plot_list -logging.getLogger("root").setLevel(logging.WARNING) AUTOTS_MAX_GENERATION = 10 AUTOTS_MODELS_TO_VALIDATE = 0.15 @@ -45,9 +43,10 @@ def _build_model(self) -> pd.DataFrame: """ # Import necessary libraries - from autots import AutoTS + from autots import AutoTS, create_regressor self.outputs = None + models = dict() # Get the name of the datetime column self.forecast_output = ForecastOutput( confidence_interval_width=self.spec.confidence_interval_width, @@ -209,7 +208,7 @@ def _build_model(self) -> pd.DataFrame: self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc(), + "error_trace": traceback.format_exc() } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) @@ -232,6 +231,7 @@ def _generate_report(self) -> tuple: - ds_forecast_col (pd.Index): A pandas Index containing the forecast column values. - ci_col_names (list): A list of column names for confidence intervals. """ + import report_creator as rc all_sections = [] if self.models: @@ -258,16 +258,18 @@ def _generate_report(self) -> tuple: yaml.dump(list(self.models.best_model.T.to_dict().values())[0]), ) - except KeyError: - logger.warn("Issue generating Model Parameters Table Section. Skipping") + except KeyError as ke: + logger.warn( + f"Issue generating Model Parameters Table Section. Skipping" + ) sec2 = rc.Text("Error generating model parameters.") section_2 = rc.Block(sec2_text, sec2) - all_sections = [section_1, section_2] + all_sections = [sec_1_plots, section_2] if self.spec.generate_explanations: - logger.warn("Explanations not yet supported for the AutoTS Module") + logger.warn(f"Explanations not yet supported for the AutoTS Module") # Model Description model_description = rc.Text( @@ -303,7 +305,7 @@ def generate_train_metrics(self) -> pd.DataFrame: ).T df = pd.concat([mapes, scores]) except Exception as e: - logger.debug("Failed to generate training metrics") + logger.debug(f"Failed to generate training metrics") logger.debug(f"Received Error Statement: {e}") return df diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py index 84aa53208..6045826f1 100644 --- a/ads/opctl/operator/lowcode/forecast/model/base_model.py +++ b/ads/opctl/operator/lowcode/forecast/model/base_model.py @@ -1,57 +1,52 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging +import fsspec +import numpy as np import os +import pandas as pd import tempfile import time import traceback from abc import ABC, abstractmethod from typing import Tuple -import fsspec -import numpy as np -import pandas as pd -import report_creator as rc - from ads.common.decorator.runtime_dependency import runtime_dependency from ads.common.object_storage_details import ObjectStorageDetails from ads.opctl import logger from ads.opctl.operator.lowcode.common.utils import ( - datetime_to_seconds, - disable_print, - enable_print, human_time_friendly, + enable_print, + disable_print, + write_data, merged_category_column_name, + datetime_to_seconds, seconds_to_datetime, - write_data, ) from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import TestData from ads.opctl.operator.lowcode.forecast.utils import ( - _build_metrics_df, - _build_metrics_per_horizon, - _label_encode_dataframe, default_signer, evaluate_train_metrics, - get_auto_select_plot, get_forecast_plots, + get_auto_select_plot, + _build_metrics_df, + _build_metrics_per_horizon, load_pkl, write_pkl, + _label_encode_dataframe, ) - +from .forecast_datasets import ForecastDatasets from ..const import ( - AUTO_SELECT, SUMMARY_METRICS_HORIZON_LIMIT, - SpeedAccuracyMode, SupportedMetrics, SupportedModels, + SpeedAccuracyMode, + AUTO_SELECT ) from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec -from .forecast_datasets import ForecastDatasets - -logging.getLogger("root").setLevel(logging.WARNING) class ForecastOperatorBaseModel(ABC): @@ -75,7 +70,7 @@ def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): self.original_target_column = self.spec.target_column self.dt_column_name = self.spec.datetime_column.name - self.model_parameters = {} + self.model_parameters = dict() self.loaded_models = None # these fields are populated in the _build_model() method @@ -84,21 +79,20 @@ def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): # "outputs" is a list of outputs generated by the models. These should only be generated when the framework requires the original output for plotting self.outputs = None self.forecast_output = None - self.errors_dict = {} - self.le = {} + self.errors_dict = dict() + self.le = dict() self.formatted_global_explanation = None self.formatted_local_explanation = None self.forecast_col_name = "yhat" - self.perform_tuning = (self.spec.tuning is not None) and ( - self.spec.tuning.n_trials is not None + self.perform_tuning = (self.spec.tuning != None) and ( + self.spec.tuning.n_trials != None ) def generate_report(self): """Generates the forecasting report.""" import warnings - from sklearn.exceptions import ConvergenceWarning with warnings.catch_warnings(): @@ -106,6 +100,7 @@ def generate_report(self): warnings.simplefilter(action="ignore", category=UserWarning) warnings.simplefilter(action="ignore", category=RuntimeWarning) warnings.simplefilter(action="ignore", category=ConvergenceWarning) + import report_creator as rc # load models if given if self.spec.previous_output_dir is not None: @@ -133,7 +128,7 @@ def generate_report(self): ) = self._test_evaluate_metrics( elapsed_time=elapsed_time, ) - except Exception: + except Exception as e: logger.warn("Unable to generate Test Metrics.") logger.debug(f"Full Traceback: {traceback.format_exc()}") report_sections = [] @@ -258,30 +253,25 @@ def generate_report(self): backtest_report_name = "backtest_stats.csv" file_path = f"{output_dir}/{backtest_report_name}" if self.spec.model == AUTO_SELECT: - backtest_sections.append( - rc.Heading("Auto-select statistics", level=2) - ) + backtest_sections.append(rc.Heading("Auto-select statistics", level=2)) if not os.path.exists(file_path): - failure_msg = rc.Text( - "auto-select could not be executed. Please check the " - "logs for more details." - ) + failure_msg = rc.Text("auto-select could not be executed. Please check the " + "logs for more details.") backtest_sections.append(failure_msg) else: backtest_stats = pd.read_csv(file_path) average_dict = backtest_stats.mean().to_dict() - del average_dict["backtest"] + del average_dict['backtest'] best_model = min(average_dict, key=average_dict.get) backtest_text = rc.Heading("Back Testing Metrics", level=3) summary_text = rc.Text( f"Overall, the average scores for the models are {average_dict}, with {best_model}" - f" being identified as the top-performing model during backtesting." - ) + f" being identified as the top-performing model during backtesting.") backtest_table = rc.DataTable(backtest_stats, index=True) liner_plot = get_auto_select_plot(backtest_stats) - backtest_sections.extend( - [backtest_text, backtest_table, summary_text, liner_plot] - ) + backtest_sections.extend([backtest_text, backtest_table, summary_text, + liner_plot]) + forecast_plots = [] if len(self.forecast_output.list_series_ids()) > 0: @@ -441,13 +431,14 @@ def _save_report( test_metrics_df: pd.DataFrame, ): """Saves resulting reports to the given folder.""" + import report_creator as rc unique_output_dir = self.spec.output_directory.url if ObjectStorageDetails.is_oci_path(unique_output_dir): storage_options = default_signer() else: - storage_options = {} + storage_options = dict() # report-creator html report if self.spec.generate_report: @@ -589,7 +580,7 @@ def _save_report( indent=4, ) else: - logger.info("All modeling completed successfully.") + logger.info(f"All modeling completed successfully.") def preprocess(self, df, series_id): """The method that needs to be implemented on the particular model level.""" @@ -631,8 +622,8 @@ def generate_train_metrics(self) -> pd.DataFrame: def _load_model(self): try: self.loaded_models = load_pkl(self.spec.previous_output_dir + "/model.pkl") - except Exception as e: - logger.info(f"model.pkl is not present. Error: {e}") + except: + logger.info("model.pkl is not present") def _save_model(self, output_dir, storage_options): write_pkl( @@ -702,7 +693,7 @@ def explain_model(self): if not len(kernel_explnr_vals): logger.warn( - "No explanations generated. Ensure that additional data has been provided." + f"No explanations generated. Ensure that additional data has been provided." ) else: self.global_explanation[s_id] = dict( diff --git a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py index 73a81ac0b..c3804f88d 100644 --- a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +++ b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py @@ -1,23 +1,33 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- -# Copyright (c) 2023, 2024 Oracle and/or its affiliates. +# Copyright (c) 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import time import pandas as pd +from pandas.api.types import is_datetime64_any_dtype, is_string_dtype, is_numeric_dtype +from ..operator_config import ForecastOperatorConfig from ads.opctl import logger -from ads.opctl.operator.lowcode.common.data import AbstractData -from ads.opctl.operator.lowcode.common.errors import ( - DataMismatchError, - InvalidParameterError, -) +from ..const import ForecastOutputColumns, PROPHET_INTERNAL_DATE_COL +from ads.common.object_storage_details import ObjectStorageDetails from ads.opctl.operator.lowcode.common.utils import ( get_frequency_in_seconds, get_frequency_of_datetime, ) - -from ..const import ForecastOutputColumns, SupportedModels -from ..operator_config import ForecastOperatorConfig +from ads.opctl.operator.lowcode.common.data import AbstractData +from ads.opctl.operator.lowcode.forecast.utils import ( + default_signer, +) +from ads.opctl.operator.lowcode.common.errors import ( + InputDataError, + InvalidParameterError, + PermissionsError, + DataMismatchError, +) +from ..const import SupportedModels +from abc import ABC, abstractmethod class HistoricalData(AbstractData): @@ -41,12 +51,13 @@ def _verify_dt_col(self, spec): self.freq_in_secs = get_frequency_in_seconds( self.data.index.get_level_values(0) ) - if spec.model == SupportedModels.AutoMLX and abs(self.freq_in_secs) < 3600: - message = ( - f"{SupportedModels.AutoMLX} requires data with a frequency of at least one hour. Please try using a different model," - " or select the 'auto' option." - ) - raise InvalidParameterError(message) + if spec.model == SupportedModels.AutoMLX: + if abs(self.freq_in_secs) < 3600: + message = ( + "{} requires data with a frequency of at least one hour. Please try using a different model," + " or select the 'auto' option.".format(SupportedModels.AutoMLX) + ) + raise InvalidParameterError(message) class AdditionalData(AbstractData): @@ -66,11 +77,11 @@ def __init__(self, spec, historical_data): else: self.name = "additional_data" self.data = None - self._data_dict = {} + self._data_dict = dict() self.create_horizon(spec, historical_data) def create_horizon(self, spec, historical_data): - logger.debug("No additional data provided. Constructing horizon.") + logger.debug(f"No additional data provided. Constructing horizon.") future_dates = pd.Series( pd.date_range( start=historical_data.get_max_time(), @@ -98,7 +109,6 @@ def create_horizon(self, spec, historical_data): self.additional_regressors = [] def _ingest_data(self, spec): - _spec = spec self.additional_regressors = list(self.data.columns) if not self.additional_regressors: logger.warn( @@ -136,11 +146,12 @@ def _load_data(self, spec): self.historical_data = HistoricalData(spec) self.additional_data = AdditionalData(spec, self.historical_data) - if spec.generate_explanations and spec.additional_data is None: - logger.warn( - "Unable to generate explanations as there is no additional data passed in. Either set generate_explanations to False, or pass in additional data." - ) - spec.generate_explanations = False + if spec.generate_explanations: + if spec.additional_data is None: + logger.warn( + f"Unable to generate explanations as there is no additional data passed in. Either set generate_explanations to False, or pass in additional data." + ) + spec.generate_explanations = False def get_all_data_long(self, include_horizon=True): how = "outer" if include_horizon else "left" @@ -171,7 +182,7 @@ def get_data_multi_indexed(self): ) def get_data_by_series(self, include_horizon=True): - total_dict = {} + total_dict = dict() hist_data = self.historical_data.get_dict_by_series() add_data = self.additional_data.get_dict_by_series() how = "outer" if include_horizon else "left" @@ -189,10 +200,10 @@ def get_data_at_series(self, s_id, include_horizon=True): all_data = self.get_data_by_series(include_horizon=include_horizon) try: return all_data[s_id] - except Exception as e: + except: raise InvalidParameterError( f"Unable to retrieve series id: {s_id} from data. Available series ids are: {self.list_series_ids()}" - ) from e + ) def get_horizon_at_series(self, s_id): return self.get_data_at_series(s_id)[-self._horizon :] @@ -223,7 +234,7 @@ def list_series_ids(self, sorted=True): if sorted: try: series_ids.sort() - except Exception: + except: pass return series_ids @@ -258,7 +269,7 @@ def __init__( target_column: str the name of the original target column dt_column: the name of the original datetime column """ - self.series_id_map = {} + self.series_id_map = dict() self._set_ci_column_names(confidence_interval_width) self.horizon = horizon self.target_column_name = target_column @@ -270,7 +281,7 @@ def add_series_id( forecast: pd.DataFrame, overwrite: bool = False, ): - if not overwrite and series_id in self.series_id_map: + if not overwrite and series_id in self.series_id_map.keys(): raise ValueError( f"Attempting to update ForecastOutput for series_id {series_id} when this already exists. Set overwrite to True." ) @@ -310,15 +321,15 @@ def populate_series_output( """ try: output_i = self.series_id_map[series_id] - except KeyError as e: + except KeyError: raise ValueError( f"Attempting to update output for series: {series_id}, however no series output has been initialized." - ) from e + ) if (output_i.shape[0] - self.horizon) == len(fit_val): - output_i["fitted_value"].iloc[: -self.horizon] = ( - fit_val # Note: may need to do len(output_i) - (len(fit_val) + horizon) : -horizon - ) + output_i["fitted_value"].iloc[ + : -self.horizon + ] = fit_val # Note: may need to do len(output_i) - (len(fit_val) + horizon) : -horizon elif (output_i.shape[0] - self.horizon) > len(fit_val): logger.debug( f"Fitted Values were only generated on a subset ({len(fit_val)}/{(output_i.shape[0] - self.horizon)}) of the data for Series: {series_id}." @@ -367,7 +378,7 @@ def get_horizon_long(self): def get_forecast(self, series_id): try: return self.series_id_map[series_id] - except KeyError: + except KeyError as ke: logger.debug( f"No Forecast found for series_id: {series_id}. Returning empty DataFrame." ) @@ -378,7 +389,7 @@ def list_series_ids(self, sorted=True): if sorted: try: series_ids.sort() - except Exception: + except: pass return series_ids diff --git a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py index 9907a26e7..5af3e304b 100644 --- a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +++ b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py @@ -2,7 +2,6 @@ # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging import traceback import pandas as pd @@ -193,8 +192,6 @@ def _generate_report(self): import report_creator as rc from utilsforecast.plotting import plot_series - logging.getLogger("root").setLevel(logging.WARNING) - # Section 1: Forecast Overview sec1_text = rc.Block( rc.Heading("Forecast Overview", level=2), diff --git a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py index 08afa092a..769b3948a 100644 --- a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py @@ -1,35 +1,45 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging -import traceback - import numpy as np import optuna import pandas as pd +from joblib import Parallel, delayed from torch import Tensor +from torchmetrics.regression import ( + MeanAbsoluteError, + MeanAbsolutePercentageError, + MeanSquaredError, + R2Score, + SymmetricMeanAbsolutePercentageError, +) from ads.common.decorator.runtime_dependency import ( OptionalDependency, runtime_dependency, ) from ads.opctl import logger -from ads.opctl.operator.lowcode.common.utils import ( - disable_print, - enable_print, -) + +from ..const import DEFAULT_TRIALS, ForecastOutputColumns, SupportedModels from ads.opctl.operator.lowcode.forecast.utils import ( - _select_plot_list, load_pkl, write_pkl, + _select_plot_list, + _label_encode_dataframe, +) +from ads.opctl.operator.lowcode.common.utils import ( + disable_print, + enable_print, + seconds_to_datetime, ) - -from ..const import DEFAULT_TRIALS, SupportedModels -from ..operator_config import ForecastOperatorConfig from .base_model import ForecastOperatorBaseModel +from ..operator_config import ForecastOperatorConfig from .forecast_datasets import ForecastDatasets, ForecastOutput +import traceback + # def _get_np_metrics_dict(selected_metric): # metric_translation = { @@ -52,7 +62,7 @@ object="NeuralProphet", install_from=OptionalDependency.FORECAST, ) -def _fit_model(data, params, additional_regressors): +def _fit_model(data, params, additional_regressors, select_metric): from neuralprophet import NeuralProphet, set_log_level if logger.level > 10: @@ -60,12 +70,13 @@ def _fit_model(data, params, additional_regressors): disable_print() m = NeuralProphet(**params) + # m.metrics = _get_np_metrics_dict(select_metric) for add_reg in additional_regressors: m = m.add_future_regressor(name=add_reg) m.fit(df=data) - accepted_regressors_config = m.config_regressors or {} + accepted_regressors_config = m.config_regressors or dict() if hasattr(accepted_regressors_config, "regressors"): - accepted_regressors_config = accepted_regressors_config.regressors or {} + accepted_regressors_config = accepted_regressors_config.regressors or dict() enable_print() return m, list(accepted_regressors_config.keys()) @@ -86,12 +97,11 @@ def _load_model(self): self.loaded_trainers = load_pkl( self.spec.previous_output_dir + "/trainer.pkl" ) - except Exception as e: - logger.debug(f"model.pkl/trainer.pkl is not present. Error message: {e}") + except: + logger.debug("model.pkl/trainer.pkl is not present") def set_kwargs(self): # Extract the Confidence Interval Width and convert to prophet's equivalent - interval_width - model_kwargs = self.spec.model_kwargs if self.spec.confidence_interval_width is None: quantiles = model_kwargs.get("quantiles", [0.05, 0.95]) self.spec.confidence_interval_width = float(quantiles[1]) - float( @@ -100,6 +110,8 @@ def set_kwargs(self): else: boundaries = round((1 - self.spec.confidence_interval_width) / 2, 2) quantiles = [boundaries, self.spec.confidence_interval_width + boundaries] + + model_kwargs = self.spec.model_kwargs model_kwargs["quantiles"] = quantiles return model_kwargs @@ -112,10 +124,12 @@ def _train_model(self, i, s_id, df, model_kwargs): if self.loaded_models is not None and s_id in self.loaded_models: model = self.loaded_models[s_id] - accepted_regressors_config = model.config_regressors.regressors or {} + accepted_regressors_config = ( + model.config_regressors.regressors or dict() + ) if hasattr(accepted_regressors_config, "regressors"): accepted_regressors_config = ( - accepted_regressors_config.regressors or {} + accepted_regressors_config.regressors or dict() ) self.accepted_regressors[s_id] = list(accepted_regressors_config.keys()) if self.loaded_trainers is not None and s_id in self.loaded_trainers: @@ -129,6 +143,8 @@ def _train_model(self, i, s_id, df, model_kwargs): data=data_i, params=model_kwargs, additional_regressors=self.additional_regressors, + select_metric=None, + # select_metric=self.spec.metric, ) logger.debug( @@ -189,6 +205,7 @@ def _train_model(self, i, s_id, df, model_kwargs): "config_normalization": model.config_normalization, "config_missing": model.config_missing, "config_model": model.config_model, + "config_normalization": model.config_normalization, "data_freq": model.data_freq, "fitted": model.fitted, "data_params": model.data_params, @@ -203,19 +220,19 @@ def _train_model(self, i, s_id, df, model_kwargs): self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc(), + "error_trace": traceback.format_exc() } logger.warn(traceback.format_exc()) raise e def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = {} - self.trainers = {} - self.outputs = {} - self.errors_dict = {} - self.explanations_info = {} - self.accepted_regressors = {} + self.models = dict() + self.trainers = dict() + self.outputs = dict() + self.errors_dict = dict() + self.explanations_info = dict() + self.accepted_regressors = dict() self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -265,6 +282,7 @@ def objective(trial): data=df_train, params=params, additional_regressors=self.additional_regressors, + select_metric=self.spec.metric, ) df_test = df_test[["y", "ds"] + accepted_regressors] @@ -308,8 +326,6 @@ def objective(trial): def _generate_report(self): import report_creator as rc - logging.getLogger("root").setLevel(logging.WARNING) - series_ids = self.models.keys() all_sections = [] if len(series_ids) > 0: @@ -355,7 +371,7 @@ def _generate_report(self): sec5_text = rc.Heading("Neural Prophet Model Parameters", level=2) model_states = [] - for s_id, m in self.models.items(): + for i, (s_id, m) in enumerate(self.models.items()): model_states.append( pd.Series( m.state_dict(), @@ -433,7 +449,7 @@ def _save_model(self, output_dir, storage_options): ) def explain_model(self): - self.local_explanation = {} + self.local_explanation = dict() global_expl = [] rename_cols = { f"future_regressor_{col}": col diff --git a/ads/opctl/operator/lowcode/forecast/model/prophet.py b/ads/opctl/operator/lowcode/forecast/model/prophet.py index fc70b6c11..40c842911 100644 --- a/ads/opctl/operator/lowcode/forecast/model/prophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/prophet.py @@ -1,23 +1,17 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging -import traceback - -import matplotlib as mpl import numpy as np import optuna import pandas as pd +import logging from joblib import Parallel, delayed - +from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger -from ads.opctl.operator.lowcode.common.utils import set_log_level from ads.opctl.operator.lowcode.forecast.operator_config import ForecastOperatorConfig -from ads.opctl.operator.lowcode.forecast.utils import ( - _select_plot_list, -) from ..const import ( DEFAULT_TRIALS, @@ -25,14 +19,23 @@ ForecastOutputColumns, SupportedModels, ) +from ads.opctl.operator.lowcode.forecast.utils import ( + _select_plot_list, + _label_encode_dataframe, +) +from ads.opctl.operator.lowcode.common.utils import set_log_level from .base_model import ForecastOperatorBaseModel +from ..operator_config import ForecastOperatorConfig from .forecast_datasets import ForecastDatasets, ForecastOutput +import traceback +import matplotlib as mpl + try: set_log_level("prophet", logger.level) set_log_level("cmdstanpy", logger.level) mpl.rcParams["figure.max_open_warning"] = 100 -except Exception: +except: pass @@ -70,6 +73,9 @@ def set_kwargs(self): def _train_model(self, i, series_id, df, model_kwargs): try: + from prophet import Prophet + from prophet.diagnostics import cross_validation, performance_metrics + self.forecast_output.init_series_output( series_id=series_id, data_at_series=df ) @@ -124,15 +130,15 @@ def _train_model(self, i, series_id, df, model_kwargs): self.errors_dict[series_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc(), + "error_trace": traceback.format_exc() } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = {} - self.outputs = {} + self.models = dict() + self.outputs = dict() self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -243,8 +249,6 @@ def _generate_report(self): import report_creator as rc from prophet.plot import add_changepoints_to_plot - logging.getLogger("root").setLevel(logging.WARNING) - series_ids = self.models.keys() all_sections = [] if len(series_ids) > 0: @@ -347,6 +351,7 @@ def _generate_report(self): # Append the global explanation text and section to the "all_sections" list all_sections = all_sections + [ global_explanation_section, + local_explanation_text, local_explanation_section, ] except Exception as e: diff --git a/ads/opctl/operator/lowcode/forecast/utils.py b/ads/opctl/operator/lowcode/forecast/utils.py index e3a88d7b7..76f554ff8 100644 --- a/ads/opctl/operator/lowcode/forecast/utils.py +++ b/ads/opctl/operator/lowcode/forecast/utils.py @@ -1,41 +1,41 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging import os -from typing import Set +import sys +from typing import List -import cloudpickle import fsspec import numpy as np import pandas as pd -import report_creator as rc +import cloudpickle +import plotly.express as px from plotly import graph_objects as go -from scipy.stats import linregress from sklearn.metrics import ( explained_variance_score, mean_absolute_percentage_error, mean_squared_error, - r2_score, ) +from scipy.stats import linregress +from sklearn.metrics import r2_score + from ads.common.object_storage_details import ObjectStorageDetails from ads.dataset.label_encoder import DataFrameLabelEncoder from ads.opctl import logger -from ads.opctl.operator.lowcode.forecast.const import ForecastOutputColumns -from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import ( - ForecastOutput, - TestData, -) -from .const import RENDER_LIMIT, SupportedMetrics - -logging.getLogger("root").setLevel(logging.WARNING) +from .const import SupportedMetrics, SupportedModels, RENDER_LIMIT +from .errors import ForecastInputDataError, ForecastSchemaYamlError +from .operator_config import ForecastOperatorSpec, ForecastOperatorConfig +from ads.opctl.operator.lowcode.common.utils import merge_category_columns +from ads.opctl.operator.lowcode.forecast.const import ForecastOutputColumns +import report_creator as rc -def _label_encode_dataframe(df, no_encode: Set = None): +def _label_encode_dataframe(df, no_encode=set()): df_to_encode = df[list(set(df.columns) - no_encode)] le = DataFrameLabelEncoder().fit(df_to_encode) return le, le.transform(df) @@ -54,14 +54,15 @@ def smape(actual, predicted) -> float: denominator[zero_mask] = 1 numerator = np.abs(actual - predicted) + default_output = np.ones_like(numerator) * np.inf abs_error = np.divide(numerator, denominator) return round(np.mean(abs_error) * 100, 2) def _build_metrics_per_horizon( - test_data: TestData, - output: ForecastOutput, + test_data: "TestData", + output: "ForecastOutput", ) -> pd.DataFrame: """ Calculates Mean sMAPE, Median sMAPE, Mean MAPE, Median MAPE, Mean wMAPE, Median wMAPE for each horizon @@ -171,7 +172,7 @@ def _build_metrics_per_horizon( def load_pkl(filepath): - storage_options = {} + storage_options = dict() if ObjectStorageDetails.is_oci_path(filepath): storage_options = default_signer() @@ -193,13 +194,13 @@ def write_pkl(obj, filename, output_dir, storage_options): def _build_metrics_df(y_true, y_pred, series_id): if len(y_true) == 0 or len(y_pred) == 0: return pd.DataFrame() - metrics = {} + metrics = dict() metrics["sMAPE"] = smape(actual=y_true, predicted=y_pred) metrics["MAPE"] = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred) metrics["RMSE"] = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred)) try: metrics["r2"] = linregress(y_true, y_pred).rvalue ** 2 - except Exception: + except: metrics["r2"] = r2_score(y_true=y_true, y_pred=y_pred) metrics["Explained Variance"] = explained_variance_score( y_true=y_true, y_pred=y_pred @@ -207,13 +208,16 @@ def _build_metrics_df(y_true, y_pred, series_id): return pd.DataFrame.from_dict(metrics, orient="index", columns=[series_id]) -def evaluate_train_metrics(output): +def evaluate_train_metrics(output, metrics_col_name=None): """ Training metrics Parameters: output: ForecastOutputs + metrics_col_name: str + Only passed in if the series column was created artifically. + When passed in, replaces s_id as the column name in the metrics table """ total_metrics = pd.DataFrame() for s_id in output.list_series_ids(): @@ -258,21 +262,20 @@ def _select_plot_list(fn, series_ids): def _add_unit(num, unit): return f"{num} {unit}" - def get_auto_select_plot(backtest_results): fig = go.Figure() columns = backtest_results.columns.tolist() back_test_column = "backtest" columns.remove(back_test_column) - for column in columns: + for i, column in enumerate(columns): + color = 0 #int(i * 255 / len(columns)) fig.add_trace( go.Scatter( - x=backtest_results[back_test_column], - y=backtest_results[column], - mode="lines", - name=column, - ) - ) + x=backtest_results[back_test_column], + y=backtest_results[column], + mode="lines", + name=column, + )) return rc.Widget(fig) @@ -380,7 +383,6 @@ def plot_forecast_plotly(s_id): return _select_plot_list(plot_forecast_plotly, forecast_output.list_series_ids()) - def convert_target(target: str, target_col: str): """ Removes the target_column that got appended to target. diff --git a/ads/opctl/operator/lowcode/pii/model/report.py b/ads/opctl/operator/lowcode/pii/model/report.py index 70ef098d8..50e0fe579 100644 --- a/ads/opctl/operator/lowcode/pii/model/report.py +++ b/ads/opctl/operator/lowcode/pii/model/report.py @@ -1,10 +1,10 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging import os import random import tempfile @@ -40,13 +40,11 @@ try: import report_creator as rc -except ImportError as e: +except ImportError: raise ModuleNotFoundError( f"`report-creator` module was not found. Please run " f"`pip install {OptionalDependency.PII}`." - ) from e - -logging.getLogger("root").setLevel(logging.WARNING) + ) @dataclass(repr=True) @@ -141,13 +139,13 @@ def make_model_card(model_name="", readme_path=""): fig = go.Figure( data=[ go.Table( - header={"Columns": df.columns}, - cells={"Metrics": df.Metrics, "Values": df.Values}, + header=dict(values=list(df.columns)), + cells=dict(values=[df.Metrics, df.Values]), ) ] ) eval_res_tb = rc.Widget(data=fig, caption="Evaluation Results") - except Exception: + except: eval_res_tb = rc.Text("-") logger.warning( "The given readme.md doesn't have correct template for Evaluation Results." @@ -323,9 +321,7 @@ def make_view(self): self.report_sections = [title_text, report_description, time_proceed, structure] return self - def save_report( - self, report_sections=None, report_uri=None, storage_options: Dict = None - ): + def save_report(self, report_sections=None, report_uri=None, storage_options={}): with tempfile.TemporaryDirectory() as temp_dir: report_local_path = os.path.join(temp_dir, "___report.html") disable_print() diff --git a/ads/opctl/operator/lowcode/recommender/model/base_model.py b/ads/opctl/operator/lowcode/recommender/model/base_model.py index c345f84a7..f317b19b0 100644 --- a/ads/opctl/operator/lowcode/recommender/model/base_model.py +++ b/ads/opctl/operator/lowcode/recommender/model/base_model.py @@ -1,43 +1,39 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging import os import tempfile import time from abc import ABC, abstractmethod -from typing import Dict, Tuple +from typing import Tuple, Dict import fsspec import pandas as pd import report_creator as rc -from plotly import graph_objects as go from ads.common.object_storage_details import ObjectStorageDetails from ads.opctl import logger +from ads.opctl.operator.lowcode.common.utils import default_signer from ads.opctl.operator.lowcode.common.utils import ( - default_signer, - disable_print, - enable_print, human_time_friendly, + enable_print, + disable_print, write_data, ) - -from ..operator_config import RecommenderOperatorConfig from .factory import SupportedModels from .recommender_dataset import RecommenderDatasets - -logging.getLogger("root").setLevel(logging.WARNING) +from ..operator_config import RecommenderOperatorConfig +from plotly import graph_objects as go +import matplotlib.pyplot as plt class RecommenderOperatorBaseModel(ABC): """The base class for the recommender detection operator models.""" - def __init__( - self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets - ): + def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets): self.config = config self.spec = self.config.spec self.datasets = datasets @@ -75,7 +71,7 @@ def generate_report(self): rc.Metric( heading="Num items", value=len(self.datasets.items), - ), + ) ), ) @@ -87,67 +83,62 @@ def generate_report(self): user_rating_counts = self.datasets.interactions[user_col].value_counts() fig_user = go.Figure(data=[go.Histogram(x=user_rating_counts, nbinsx=100)]) fig_user.update_layout( - title=f"Distribution of the number of interactions by {user_col}", - xaxis_title=f"Number of {interaction_col}", - yaxis_title=f"Number of {user_col}", - bargap=0.2, + title=f'Distribution of the number of interactions by {user_col}', + xaxis_title=f'Number of {interaction_col}', + yaxis_title=f'Number of {user_col}', + bargap=0.2 ) item_title = rc.Heading("Item Statistics", level=2) item_rating_counts = self.datasets.interactions[item_col].value_counts() fig_item = go.Figure(data=[go.Histogram(x=item_rating_counts, nbinsx=100)]) fig_item.update_layout( - title=f"Distribution of the number of interactions by {item_col}", - xaxis_title=f"Number of {interaction_col}", - yaxis_title=f"Number of {item_col}", - bargap=0.2, + title=f'Distribution of the number of interactions by {item_col}', + xaxis_title=f'Number of {interaction_col}', + yaxis_title=f'Number of {item_col}', + bargap=0.2 ) result_heatmap_title = rc.Heading("Sample Recommendations", level=2) sample_items = result_df[item_col].head(100).index filtered_df = result_df[result_df[item_col].isin(sample_items)] - data = filtered_df.pivot( - index=user_col, columns=item_col, values=interaction_col - ) - fig = go.Figure( - data=go.Heatmap( - z=data.values, x=data.columns, y=data.index, colorscale="Viridis" - ) - ) + data = filtered_df.pivot(index=user_col, columns=item_col, values=interaction_col) + fig = go.Figure(data=go.Heatmap( + z=data.values, + x=data.columns, + y=data.index, + colorscale='Viridis' + )) fig.update_layout( - title="Recommendation heatmap of User-Item Interactions (sample)", + title='Recommendation heatmap of User-Item Interactions (sample)', width=1500, height=800, xaxis_title=item_col, yaxis_title=user_col, - coloraxis_colorbar={"title": interaction_col}, + coloraxis_colorbar=dict(title=interaction_col) ) - plots = [ - user_title, - rc.Widget(fig_user), - item_title, - rc.Widget(fig_item), - result_heatmap_title, - rc.Widget(fig), - ] + plots = [user_title, rc.Widget(fig_user), + item_title, rc.Widget(fig_item), + result_heatmap_title, rc.Widget(fig)] test_metrics_sections = [rc.DataTable(pd.DataFrame(metrics, index=[0]))] yaml_appendix_title = rc.Heading("Reference: YAML File", level=2) yaml_appendix = rc.Yaml(self.config.to_dict()) report_sections = ( - [summary] - + plots - + test_metrics_sections - + other_sections - + [yaml_appendix_title, yaml_appendix] + [summary] + + plots + + test_metrics_sections + + other_sections + + [yaml_appendix_title, yaml_appendix] ) # save the report and result CSV - self._save_report(report_sections=report_sections, result_df=result_df) + self._save_report( + report_sections=report_sections, + result_df=result_df + ) - @abstractmethod def _evaluation_metrics(self): pass - @abstractmethod def _test_data_evaluate_metrics(self): pass @@ -159,7 +150,7 @@ def _save_report(self, report_sections: Tuple, result_df: pd.DataFrame): if ObjectStorageDetails.is_oci_path(unique_output_dir): storage_options = default_signer() else: - storage_options = {} + storage_options = dict() # report-creator html report if self.spec.generate_report: @@ -170,23 +161,19 @@ def _save_report(self, report_sections: Tuple, result_df: pd.DataFrame): report.save(rc.Block(*report_sections), report_local_path) enable_print() - report_path = os.path.join( - unique_output_dir, self.spec.report_filename - ) + report_path = os.path.join(unique_output_dir, self.spec.report_filename) with open(report_local_path) as f1: with fsspec.open( - report_path, - "w", - **storage_options, + report_path, + "w", + **storage_options, ) as f2: f2.write(f1.read()) # recommender csv report write_data( data=result_df, - filename=os.path.join( - unique_output_dir, self.spec.recommendations_filename - ), + filename=os.path.join(unique_output_dir, self.spec.recommendations_filename), format="csv", storage_options=storage_options, ) diff --git a/ads/opctl/operator/lowcode/recommender/model/svd.py b/ads/opctl/operator/lowcode/recommender/model/svd.py index a92a51fda..968170986 100644 --- a/ads/opctl/operator/lowcode/recommender/model/svd.py +++ b/ads/opctl/operator/lowcode/recommender/model/svd.py @@ -1,30 +1,28 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- +from typing import Tuple, Dict, Any + # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import logging -from typing import Dict, Tuple import pandas as pd -import report_creator as rc from pandas import DataFrame -from surprise import SVD, Dataset, Reader -from surprise.accuracy import mae, rmse -from surprise.model_selection import train_test_split -from ..constant import SupportedMetrics +from .recommender_dataset import RecommenderDatasets from ..operator_config import RecommenderOperatorConfig from .factory import RecommenderOperatorBaseModel -from .recommender_dataset import RecommenderDatasets - -logging.getLogger("root").setLevel(logging.WARNING) +from surprise import Dataset, Reader +from surprise.model_selection import train_test_split +from surprise import SVD +from surprise.accuracy import rmse, mae +import report_creator as rc +from ..constant import SupportedMetrics class SVDOperatorModel(RecommenderOperatorBaseModel): """Class representing scikit surprise SVD operator model.""" - def __init__( - self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets - ): + def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets): super().__init__(config, datasets) self.interactions = datasets.interactions self.users = datasets.users @@ -37,12 +35,8 @@ def __init__( def _get_recommendations(self, user_id, n): all_item_ids = self.items[self.item_id].unique() - rated_items = self.interactions[self.interactions[self.user_id] == user_id][ - self.item_id - ] - unrated_items = [ - item_id for item_id in all_item_ids if item_id not in rated_items.values - ] + rated_items = self.interactions[self.interactions[self.user_id] == user_id][self.item_id] + unrated_items = [item_id for item_id in all_item_ids if item_id not in rated_items.values] predictions = [self.algo.predict(user_id, item_id) for item_id in unrated_items] predictions.sort(key=lambda x: x.est, reverse=True) top_n_recommendations = predictions[:n] @@ -52,10 +46,7 @@ def _build_model(self) -> Tuple[DataFrame, Dict]: min_rating = self.interactions[self.interaction_column].min() max_rating = self.interactions[self.interaction_column].max() reader = Reader(rating_scale=(min_rating, max_rating)) - data = Dataset.load_from_df( - self.interactions[[self.user_id, self.item_id, self.interaction_column]], - reader, - ) + data = Dataset.load_from_df(self.interactions[[self.user_id, self.item_id, self.interaction_column]], reader) trainset, testset = train_test_split(data, test_size=self.test_size) self.algo.fit(trainset) predictions = self.algo.test(testset) @@ -67,13 +58,11 @@ def _build_model(self) -> Tuple[DataFrame, Dict]: for user_id in self.users[self.user_id]: recommendations = self._get_recommendations(user_id, n=self.spec.top_k) for item_id, est_rating in recommendations: - all_recommendations.append( - { - self.user_id: user_id, - self.item_id: item_id, - self.interaction_column: est_rating, - } - ) + all_recommendations.append({ + self.user_id: user_id, + self.item_id: item_id, + self.interaction_column: est_rating + }) recommendations_df = pd.DataFrame(all_recommendations) return recommendations_df, metric @@ -83,18 +72,17 @@ def _generate_report(self): decompose a user-item interaction matrix into three constituent matrices. These matrices capture the latent factors that explain the observed interactions. """ - new_user_recommendations = self._get_recommendations( - "__new_user__", self.spec.top_k - ) + new_user_recommendations = self._get_recommendations("__new_user__", self.spec.top_k) new_recommendations = [] for item_id, est_rating in new_user_recommendations: - new_recommendations.append( - { - self.user_id: "__new_user__", - self.item_id: item_id, - self.interaction_column: est_rating, - } - ) + new_recommendations.append({ + self.user_id: "__new_user__", + self.item_id: item_id, + self.interaction_column: est_rating + }) title = rc.Heading("Recommendations for new users", level=2) other_sections = [title, rc.DataTable(new_recommendations)] - return (model_description, other_sections) + return ( + model_description, + other_sections + ) From 342cd58a9fb3532dd9648b243ad54ae8d9822c74 Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 08:37:45 +0000 Subject: [PATCH 07/29] clean utils file --- ads/opctl/operator/lowcode/forecast/utils.py | 62 ++++++++++---------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/ads/opctl/operator/lowcode/forecast/utils.py b/ads/opctl/operator/lowcode/forecast/utils.py index 76f554ff8..e3a88d7b7 100644 --- a/ads/opctl/operator/lowcode/forecast/utils.py +++ b/ads/opctl/operator/lowcode/forecast/utils.py @@ -1,41 +1,41 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os -import sys -from typing import List +from typing import Set +import cloudpickle import fsspec import numpy as np import pandas as pd -import cloudpickle -import plotly.express as px +import report_creator as rc from plotly import graph_objects as go +from scipy.stats import linregress from sklearn.metrics import ( explained_variance_score, mean_absolute_percentage_error, mean_squared_error, + r2_score, ) -from scipy.stats import linregress -from sklearn.metrics import r2_score - from ads.common.object_storage_details import ObjectStorageDetails from ads.dataset.label_encoder import DataFrameLabelEncoder from ads.opctl import logger - -from .const import SupportedMetrics, SupportedModels, RENDER_LIMIT -from .errors import ForecastInputDataError, ForecastSchemaYamlError -from .operator_config import ForecastOperatorSpec, ForecastOperatorConfig -from ads.opctl.operator.lowcode.common.utils import merge_category_columns from ads.opctl.operator.lowcode.forecast.const import ForecastOutputColumns -import report_creator as rc +from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import ( + ForecastOutput, + TestData, +) + +from .const import RENDER_LIMIT, SupportedMetrics +logging.getLogger("root").setLevel(logging.WARNING) -def _label_encode_dataframe(df, no_encode=set()): + +def _label_encode_dataframe(df, no_encode: Set = None): df_to_encode = df[list(set(df.columns) - no_encode)] le = DataFrameLabelEncoder().fit(df_to_encode) return le, le.transform(df) @@ -54,15 +54,14 @@ def smape(actual, predicted) -> float: denominator[zero_mask] = 1 numerator = np.abs(actual - predicted) - default_output = np.ones_like(numerator) * np.inf abs_error = np.divide(numerator, denominator) return round(np.mean(abs_error) * 100, 2) def _build_metrics_per_horizon( - test_data: "TestData", - output: "ForecastOutput", + test_data: TestData, + output: ForecastOutput, ) -> pd.DataFrame: """ Calculates Mean sMAPE, Median sMAPE, Mean MAPE, Median MAPE, Mean wMAPE, Median wMAPE for each horizon @@ -172,7 +171,7 @@ def _build_metrics_per_horizon( def load_pkl(filepath): - storage_options = dict() + storage_options = {} if ObjectStorageDetails.is_oci_path(filepath): storage_options = default_signer() @@ -194,13 +193,13 @@ def write_pkl(obj, filename, output_dir, storage_options): def _build_metrics_df(y_true, y_pred, series_id): if len(y_true) == 0 or len(y_pred) == 0: return pd.DataFrame() - metrics = dict() + metrics = {} metrics["sMAPE"] = smape(actual=y_true, predicted=y_pred) metrics["MAPE"] = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred) metrics["RMSE"] = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred)) try: metrics["r2"] = linregress(y_true, y_pred).rvalue ** 2 - except: + except Exception: metrics["r2"] = r2_score(y_true=y_true, y_pred=y_pred) metrics["Explained Variance"] = explained_variance_score( y_true=y_true, y_pred=y_pred @@ -208,16 +207,13 @@ def _build_metrics_df(y_true, y_pred, series_id): return pd.DataFrame.from_dict(metrics, orient="index", columns=[series_id]) -def evaluate_train_metrics(output, metrics_col_name=None): +def evaluate_train_metrics(output): """ Training metrics Parameters: output: ForecastOutputs - metrics_col_name: str - Only passed in if the series column was created artifically. - When passed in, replaces s_id as the column name in the metrics table """ total_metrics = pd.DataFrame() for s_id in output.list_series_ids(): @@ -262,20 +258,21 @@ def _select_plot_list(fn, series_ids): def _add_unit(num, unit): return f"{num} {unit}" + def get_auto_select_plot(backtest_results): fig = go.Figure() columns = backtest_results.columns.tolist() back_test_column = "backtest" columns.remove(back_test_column) - for i, column in enumerate(columns): - color = 0 #int(i * 255 / len(columns)) + for column in columns: fig.add_trace( go.Scatter( - x=backtest_results[back_test_column], - y=backtest_results[column], - mode="lines", - name=column, - )) + x=backtest_results[back_test_column], + y=backtest_results[column], + mode="lines", + name=column, + ) + ) return rc.Widget(fig) @@ -383,6 +380,7 @@ def plot_forecast_plotly(s_id): return _select_plot_list(plot_forecast_plotly, forecast_output.list_series_ids()) + def convert_target(target: str, target_col: str): """ Removes the target_column that got appended to target. From f9c45d1193a9b5a90461e4be257bcaa4bd60831b Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 08:46:51 +0000 Subject: [PATCH 08/29] add in models folder --- .../operator/lowcode/forecast/model/arima.py | 26 +++--- .../lowcode/forecast/model/automlx.py | 52 ++++++------ .../operator/lowcode/forecast/model/autots.py | 34 ++++---- .../lowcode/forecast/model/base_model.py | 81 ++++++++++-------- .../forecast/model/forecast_datasets.py | 83 ++++++++----------- .../lowcode/forecast/model/ml_forecast.py | 3 + .../lowcode/forecast/model/neuralprophet.py | 76 +++++++---------- .../lowcode/forecast/model/prophet.py | 35 ++++---- 8 files changed, 185 insertions(+), 205 deletions(-) diff --git a/ads/opctl/operator/lowcode/forecast/model/arima.py b/ads/opctl/operator/lowcode/forecast/model/arima.py index 6bbd58d34..87edccdfa 100644 --- a/ads/opctl/operator/lowcode/forecast/model/arima.py +++ b/ads/opctl/operator/lowcode/forecast/model/arima.py @@ -1,23 +1,26 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback + import pandas as pd -import numpy as np import pmdarima as pm +import report_creator as rc from joblib import Parallel, delayed from ads.opctl import logger - -from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe from ads.opctl.operator.lowcode.common.utils import seconds_to_datetime -from .base_model import ForecastOperatorBaseModel +from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe + +from ..const import ForecastOutputColumns, SupportedModels from ..operator_config import ForecastOperatorConfig -import traceback +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -from ..const import ForecastOutputColumns, SupportedModels + +logging.getLogger("root").setLevel(logging.WARNING) class ArimaOperatorModel(ForecastOperatorBaseModel): @@ -39,7 +42,7 @@ def set_kwargs(self): ) model_kwargs = self.spec.model_kwargs model_kwargs["alpha"] = 1 - self.spec.confidence_interval_width - if "error_action" not in model_kwargs.keys(): + if "error_action" not in model_kwargs: model_kwargs["error_action"] = "ignore" return model_kwargs @@ -129,13 +132,14 @@ def _train_model(self, i, s_id, df, model_kwargs): self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc()} + "error_trace": traceback.format_exc(), + } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() + self.models = {} self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -154,8 +158,6 @@ def _build_model(self) -> pd.DataFrame: def _generate_report(self): """The method that needs to be implemented on the particular model level.""" - import report_creator as rc - all_sections = [] if len(self.models) > 0: sec5_text = rc.Heading("ARIMA Model Parameters", level=2) diff --git a/ads/opctl/operator/lowcode/forecast/model/automlx.py b/ads/opctl/operator/lowcode/forecast/model/automlx.py index eda6112b4..41846a5d3 100644 --- a/ads/opctl/operator/lowcode/forecast/model/automlx.py +++ b/ads/opctl/operator/lowcode/forecast/model/automlx.py @@ -1,29 +1,30 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -import traceback - # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback -import pandas as pd import numpy as np +import pandas as pd +import report_creator as rc + from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl import logger +from ads.opctl.operator.lowcode.common.utils import ( + seconds_to_datetime, +) from ads.opctl.operator.lowcode.forecast.const import ( AUTOMLX_METRIC_MAP, ForecastOutputColumns, SupportedModels, ) -from ads.opctl import logger +from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe -from .base_model import ForecastOperatorBaseModel from ..operator_config import ForecastOperatorConfig +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -from ads.opctl.operator.lowcode.common.utils import ( - seconds_to_datetime, - datetime_to_seconds, -) -from ads.opctl.operator.lowcode.forecast.utils import _label_encode_dataframe +logging.getLogger("root").setLevel(logging.WARNING) AUTOMLX_N_ALGOS_TUNED = 4 AUTOMLX_DEFAULT_SCORE_METRIC = "neg_sym_mean_abs_percent_error" @@ -47,12 +48,13 @@ def set_kwargs(self): ) model_kwargs_cleaned.pop("task", None) time_budget = model_kwargs_cleaned.pop("time_budget", -1) - model_kwargs_cleaned[ - "preprocessing" - ] = self.spec.preprocessing.enabled or model_kwargs_cleaned.get("preprocessing", True) + model_kwargs_cleaned["preprocessing"] = ( + self.spec.preprocessing.enabled + or model_kwargs_cleaned.get("preprocessing", True) + ) return model_kwargs_cleaned, time_budget - def preprocess(self, data, series_id=None): # TODO: re-use self.le for explanations + def preprocess(self, data): # TODO: re-use self.le for explanations _, df_encoded = _label_encode_dataframe( data, no_encode={self.spec.datetime_column.name, self.original_target_column}, @@ -74,11 +76,12 @@ def preprocess(self, data, series_id=None): # TODO: re-use self.le for explanat ), ) def _build_model(self) -> pd.DataFrame: - from automlx import init import logging + import automlx + try: - init( + automlx.init( engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL, @@ -88,7 +91,7 @@ def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() + self.models = {} horizon = self.spec.horizon self.spec.confidence_interval_width = self.spec.confidence_interval_width or 0.8 self.forecast_output = ForecastOutput( @@ -101,7 +104,7 @@ def _build_model(self) -> pd.DataFrame: # Clean up kwargs for pass through model_kwargs_cleaned, time_budget = self.set_kwargs() - for i, (s_id, df) in enumerate(full_data_dict.items()): + for s_id, df in full_data_dict.items(): try: logger.debug(f"Running automlx on series {s_id}") model_kwargs = model_kwargs_cleaned.copy() @@ -170,7 +173,7 @@ def _build_model(self) -> pd.DataFrame: self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) @@ -197,15 +200,12 @@ def _generate_report(self): - ds_forecast_col (pd.Series): The pd.Series object representing the forecasted column. - ci_col_names (List[str]): A list of column names for the confidence interval in the report. """ - import report_creator as rc - - """The method that needs to be implemented on the particular model level.""" - selected_models = dict() + selected_models = {} models = self.models other_sections = [] if len(self.models) > 0: - for i, (s_id, m) in enumerate(models.items()): + for s_id, m in models.items(): selected_models[s_id] = { "series_id": s_id, "selected_model": m.selected_model_, @@ -352,7 +352,7 @@ def _custom_predict_automlx(self, data): """ data_temp = pd.DataFrame( data, - columns=[col for col in self.dataset_cols], + columns=list(self.dataset_cols), ) return self.models.get(self.series_id).forecast( diff --git a/ads/opctl/operator/lowcode/forecast/model/autots.py b/ads/opctl/operator/lowcode/forecast/model/autots.py index 37b57ca75..fac04a898 100644 --- a/ads/opctl/operator/lowcode/forecast/model/autots.py +++ b/ads/opctl/operator/lowcode/forecast/model/autots.py @@ -1,24 +1,26 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import copy +import logging import traceback + import pandas as pd -import numpy as np +import report_creator as rc import yaml +from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger -from ads.opctl.operator.lowcode.common.utils import seconds_to_datetime -from .base_model import ForecastOperatorBaseModel +from ads.opctl.operator.lowcode.forecast.utils import _select_plot_list + +from ..const import ForecastOutputColumns, SupportedModels from ..operator_config import ForecastOperatorConfig -from ads.common.decorator.runtime_dependency import runtime_dependency +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -from ..const import ForecastOutputColumns, SupportedModels -from ads.opctl.operator.lowcode.forecast.utils import _select_plot_list +logging.getLogger("root").setLevel(logging.WARNING) AUTOTS_MAX_GENERATION = 10 AUTOTS_MODELS_TO_VALIDATE = 0.15 @@ -43,10 +45,9 @@ def _build_model(self) -> pd.DataFrame: """ # Import necessary libraries - from autots import AutoTS, create_regressor + from autots import AutoTS self.outputs = None - models = dict() # Get the name of the datetime column self.forecast_output = ForecastOutput( confidence_interval_width=self.spec.confidence_interval_width, @@ -208,7 +209,7 @@ def _build_model(self) -> pd.DataFrame: self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) @@ -231,7 +232,6 @@ def _generate_report(self) -> tuple: - ds_forecast_col (pd.Index): A pandas Index containing the forecast column values. - ci_col_names (list): A list of column names for confidence intervals. """ - import report_creator as rc all_sections = [] if self.models: @@ -258,18 +258,16 @@ def _generate_report(self) -> tuple: yaml.dump(list(self.models.best_model.T.to_dict().values())[0]), ) - except KeyError as ke: - logger.warn( - f"Issue generating Model Parameters Table Section. Skipping" - ) + except KeyError: + logger.warn("Issue generating Model Parameters Table Section. Skipping") sec2 = rc.Text("Error generating model parameters.") section_2 = rc.Block(sec2_text, sec2) - all_sections = [sec_1_plots, section_2] + all_sections = [section_1, section_2] if self.spec.generate_explanations: - logger.warn(f"Explanations not yet supported for the AutoTS Module") + logger.warn("Explanations not yet supported for the AutoTS Module") # Model Description model_description = rc.Text( @@ -305,7 +303,7 @@ def generate_train_metrics(self) -> pd.DataFrame: ).T df = pd.concat([mapes, scores]) except Exception as e: - logger.debug(f"Failed to generate training metrics") + logger.debug("Failed to generate training metrics") logger.debug(f"Received Error Statement: {e}") return df diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py index 6045826f1..84aa53208 100644 --- a/ads/opctl/operator/lowcode/forecast/model/base_model.py +++ b/ads/opctl/operator/lowcode/forecast/model/base_model.py @@ -1,52 +1,57 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import fsspec -import numpy as np +import logging import os -import pandas as pd import tempfile import time import traceback from abc import ABC, abstractmethod from typing import Tuple +import fsspec +import numpy as np +import pandas as pd +import report_creator as rc + from ads.common.decorator.runtime_dependency import runtime_dependency from ads.common.object_storage_details import ObjectStorageDetails from ads.opctl import logger from ads.opctl.operator.lowcode.common.utils import ( - human_time_friendly, - enable_print, + datetime_to_seconds, disable_print, - write_data, + enable_print, + human_time_friendly, merged_category_column_name, - datetime_to_seconds, seconds_to_datetime, + write_data, ) from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import TestData from ads.opctl.operator.lowcode.forecast.utils import ( + _build_metrics_df, + _build_metrics_per_horizon, + _label_encode_dataframe, default_signer, evaluate_train_metrics, - get_forecast_plots, get_auto_select_plot, - _build_metrics_df, - _build_metrics_per_horizon, + get_forecast_plots, load_pkl, write_pkl, - _label_encode_dataframe, ) -from .forecast_datasets import ForecastDatasets + from ..const import ( + AUTO_SELECT, SUMMARY_METRICS_HORIZON_LIMIT, + SpeedAccuracyMode, SupportedMetrics, SupportedModels, - SpeedAccuracyMode, - AUTO_SELECT ) from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec +from .forecast_datasets import ForecastDatasets + +logging.getLogger("root").setLevel(logging.WARNING) class ForecastOperatorBaseModel(ABC): @@ -70,7 +75,7 @@ def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): self.original_target_column = self.spec.target_column self.dt_column_name = self.spec.datetime_column.name - self.model_parameters = dict() + self.model_parameters = {} self.loaded_models = None # these fields are populated in the _build_model() method @@ -79,20 +84,21 @@ def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets): # "outputs" is a list of outputs generated by the models. These should only be generated when the framework requires the original output for plotting self.outputs = None self.forecast_output = None - self.errors_dict = dict() - self.le = dict() + self.errors_dict = {} + self.le = {} self.formatted_global_explanation = None self.formatted_local_explanation = None self.forecast_col_name = "yhat" - self.perform_tuning = (self.spec.tuning != None) and ( - self.spec.tuning.n_trials != None + self.perform_tuning = (self.spec.tuning is not None) and ( + self.spec.tuning.n_trials is not None ) def generate_report(self): """Generates the forecasting report.""" import warnings + from sklearn.exceptions import ConvergenceWarning with warnings.catch_warnings(): @@ -100,7 +106,6 @@ def generate_report(self): warnings.simplefilter(action="ignore", category=UserWarning) warnings.simplefilter(action="ignore", category=RuntimeWarning) warnings.simplefilter(action="ignore", category=ConvergenceWarning) - import report_creator as rc # load models if given if self.spec.previous_output_dir is not None: @@ -128,7 +133,7 @@ def generate_report(self): ) = self._test_evaluate_metrics( elapsed_time=elapsed_time, ) - except Exception as e: + except Exception: logger.warn("Unable to generate Test Metrics.") logger.debug(f"Full Traceback: {traceback.format_exc()}") report_sections = [] @@ -253,25 +258,30 @@ def generate_report(self): backtest_report_name = "backtest_stats.csv" file_path = f"{output_dir}/{backtest_report_name}" if self.spec.model == AUTO_SELECT: - backtest_sections.append(rc.Heading("Auto-select statistics", level=2)) + backtest_sections.append( + rc.Heading("Auto-select statistics", level=2) + ) if not os.path.exists(file_path): - failure_msg = rc.Text("auto-select could not be executed. Please check the " - "logs for more details.") + failure_msg = rc.Text( + "auto-select could not be executed. Please check the " + "logs for more details." + ) backtest_sections.append(failure_msg) else: backtest_stats = pd.read_csv(file_path) average_dict = backtest_stats.mean().to_dict() - del average_dict['backtest'] + del average_dict["backtest"] best_model = min(average_dict, key=average_dict.get) backtest_text = rc.Heading("Back Testing Metrics", level=3) summary_text = rc.Text( f"Overall, the average scores for the models are {average_dict}, with {best_model}" - f" being identified as the top-performing model during backtesting.") + f" being identified as the top-performing model during backtesting." + ) backtest_table = rc.DataTable(backtest_stats, index=True) liner_plot = get_auto_select_plot(backtest_stats) - backtest_sections.extend([backtest_text, backtest_table, summary_text, - liner_plot]) - + backtest_sections.extend( + [backtest_text, backtest_table, summary_text, liner_plot] + ) forecast_plots = [] if len(self.forecast_output.list_series_ids()) > 0: @@ -431,14 +441,13 @@ def _save_report( test_metrics_df: pd.DataFrame, ): """Saves resulting reports to the given folder.""" - import report_creator as rc unique_output_dir = self.spec.output_directory.url if ObjectStorageDetails.is_oci_path(unique_output_dir): storage_options = default_signer() else: - storage_options = dict() + storage_options = {} # report-creator html report if self.spec.generate_report: @@ -580,7 +589,7 @@ def _save_report( indent=4, ) else: - logger.info(f"All modeling completed successfully.") + logger.info("All modeling completed successfully.") def preprocess(self, df, series_id): """The method that needs to be implemented on the particular model level.""" @@ -622,8 +631,8 @@ def generate_train_metrics(self) -> pd.DataFrame: def _load_model(self): try: self.loaded_models = load_pkl(self.spec.previous_output_dir + "/model.pkl") - except: - logger.info("model.pkl is not present") + except Exception as e: + logger.info(f"model.pkl is not present. Error: {e}") def _save_model(self, output_dir, storage_options): write_pkl( @@ -693,7 +702,7 @@ def explain_model(self): if not len(kernel_explnr_vals): logger.warn( - f"No explanations generated. Ensure that additional data has been provided." + "No explanations generated. Ensure that additional data has been provided." ) else: self.global_explanation[s_id] = dict( diff --git a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py index c3804f88d..73a81ac0b 100644 --- a/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +++ b/ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py @@ -1,33 +1,23 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2023 Oracle and/or its affiliates. +# Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import time import pandas as pd -from pandas.api.types import is_datetime64_any_dtype, is_string_dtype, is_numeric_dtype -from ..operator_config import ForecastOperatorConfig from ads.opctl import logger -from ..const import ForecastOutputColumns, PROPHET_INTERNAL_DATE_COL -from ads.common.object_storage_details import ObjectStorageDetails -from ads.opctl.operator.lowcode.common.utils import ( - get_frequency_in_seconds, - get_frequency_of_datetime, -) from ads.opctl.operator.lowcode.common.data import AbstractData -from ads.opctl.operator.lowcode.forecast.utils import ( - default_signer, -) from ads.opctl.operator.lowcode.common.errors import ( - InputDataError, - InvalidParameterError, - PermissionsError, DataMismatchError, + InvalidParameterError, +) +from ads.opctl.operator.lowcode.common.utils import ( + get_frequency_in_seconds, + get_frequency_of_datetime, ) -from ..const import SupportedModels -from abc import ABC, abstractmethod + +from ..const import ForecastOutputColumns, SupportedModels +from ..operator_config import ForecastOperatorConfig class HistoricalData(AbstractData): @@ -51,13 +41,12 @@ def _verify_dt_col(self, spec): self.freq_in_secs = get_frequency_in_seconds( self.data.index.get_level_values(0) ) - if spec.model == SupportedModels.AutoMLX: - if abs(self.freq_in_secs) < 3600: - message = ( - "{} requires data with a frequency of at least one hour. Please try using a different model," - " or select the 'auto' option.".format(SupportedModels.AutoMLX) - ) - raise InvalidParameterError(message) + if spec.model == SupportedModels.AutoMLX and abs(self.freq_in_secs) < 3600: + message = ( + f"{SupportedModels.AutoMLX} requires data with a frequency of at least one hour. Please try using a different model," + " or select the 'auto' option." + ) + raise InvalidParameterError(message) class AdditionalData(AbstractData): @@ -77,11 +66,11 @@ def __init__(self, spec, historical_data): else: self.name = "additional_data" self.data = None - self._data_dict = dict() + self._data_dict = {} self.create_horizon(spec, historical_data) def create_horizon(self, spec, historical_data): - logger.debug(f"No additional data provided. Constructing horizon.") + logger.debug("No additional data provided. Constructing horizon.") future_dates = pd.Series( pd.date_range( start=historical_data.get_max_time(), @@ -109,6 +98,7 @@ def create_horizon(self, spec, historical_data): self.additional_regressors = [] def _ingest_data(self, spec): + _spec = spec self.additional_regressors = list(self.data.columns) if not self.additional_regressors: logger.warn( @@ -146,12 +136,11 @@ def _load_data(self, spec): self.historical_data = HistoricalData(spec) self.additional_data = AdditionalData(spec, self.historical_data) - if spec.generate_explanations: - if spec.additional_data is None: - logger.warn( - f"Unable to generate explanations as there is no additional data passed in. Either set generate_explanations to False, or pass in additional data." - ) - spec.generate_explanations = False + if spec.generate_explanations and spec.additional_data is None: + logger.warn( + "Unable to generate explanations as there is no additional data passed in. Either set generate_explanations to False, or pass in additional data." + ) + spec.generate_explanations = False def get_all_data_long(self, include_horizon=True): how = "outer" if include_horizon else "left" @@ -182,7 +171,7 @@ def get_data_multi_indexed(self): ) def get_data_by_series(self, include_horizon=True): - total_dict = dict() + total_dict = {} hist_data = self.historical_data.get_dict_by_series() add_data = self.additional_data.get_dict_by_series() how = "outer" if include_horizon else "left" @@ -200,10 +189,10 @@ def get_data_at_series(self, s_id, include_horizon=True): all_data = self.get_data_by_series(include_horizon=include_horizon) try: return all_data[s_id] - except: + except Exception as e: raise InvalidParameterError( f"Unable to retrieve series id: {s_id} from data. Available series ids are: {self.list_series_ids()}" - ) + ) from e def get_horizon_at_series(self, s_id): return self.get_data_at_series(s_id)[-self._horizon :] @@ -234,7 +223,7 @@ def list_series_ids(self, sorted=True): if sorted: try: series_ids.sort() - except: + except Exception: pass return series_ids @@ -269,7 +258,7 @@ def __init__( target_column: str the name of the original target column dt_column: the name of the original datetime column """ - self.series_id_map = dict() + self.series_id_map = {} self._set_ci_column_names(confidence_interval_width) self.horizon = horizon self.target_column_name = target_column @@ -281,7 +270,7 @@ def add_series_id( forecast: pd.DataFrame, overwrite: bool = False, ): - if not overwrite and series_id in self.series_id_map.keys(): + if not overwrite and series_id in self.series_id_map: raise ValueError( f"Attempting to update ForecastOutput for series_id {series_id} when this already exists. Set overwrite to True." ) @@ -321,15 +310,15 @@ def populate_series_output( """ try: output_i = self.series_id_map[series_id] - except KeyError: + except KeyError as e: raise ValueError( f"Attempting to update output for series: {series_id}, however no series output has been initialized." - ) + ) from e if (output_i.shape[0] - self.horizon) == len(fit_val): - output_i["fitted_value"].iloc[ - : -self.horizon - ] = fit_val # Note: may need to do len(output_i) - (len(fit_val) + horizon) : -horizon + output_i["fitted_value"].iloc[: -self.horizon] = ( + fit_val # Note: may need to do len(output_i) - (len(fit_val) + horizon) : -horizon + ) elif (output_i.shape[0] - self.horizon) > len(fit_val): logger.debug( f"Fitted Values were only generated on a subset ({len(fit_val)}/{(output_i.shape[0] - self.horizon)}) of the data for Series: {series_id}." @@ -378,7 +367,7 @@ def get_horizon_long(self): def get_forecast(self, series_id): try: return self.series_id_map[series_id] - except KeyError as ke: + except KeyError: logger.debug( f"No Forecast found for series_id: {series_id}. Returning empty DataFrame." ) @@ -389,7 +378,7 @@ def list_series_ids(self, sorted=True): if sorted: try: series_ids.sort() - except: + except Exception: pass return series_ids diff --git a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py index 5af3e304b..9907a26e7 100644 --- a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +++ b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py @@ -2,6 +2,7 @@ # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import traceback import pandas as pd @@ -192,6 +193,8 @@ def _generate_report(self): import report_creator as rc from utilsforecast.plotting import plot_series + logging.getLogger("root").setLevel(logging.WARNING) + # Section 1: Forecast Overview sec1_text = rc.Block( rc.Heading("Forecast Overview", level=2), diff --git a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py index 769b3948a..08afa092a 100644 --- a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py @@ -1,45 +1,35 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback + import numpy as np import optuna import pandas as pd -from joblib import Parallel, delayed from torch import Tensor -from torchmetrics.regression import ( - MeanAbsoluteError, - MeanAbsolutePercentageError, - MeanSquaredError, - R2Score, - SymmetricMeanAbsolutePercentageError, -) from ads.common.decorator.runtime_dependency import ( OptionalDependency, runtime_dependency, ) from ads.opctl import logger - -from ..const import DEFAULT_TRIALS, ForecastOutputColumns, SupportedModels -from ads.opctl.operator.lowcode.forecast.utils import ( - load_pkl, - write_pkl, - _select_plot_list, - _label_encode_dataframe, -) from ads.opctl.operator.lowcode.common.utils import ( disable_print, enable_print, - seconds_to_datetime, ) -from .base_model import ForecastOperatorBaseModel +from ads.opctl.operator.lowcode.forecast.utils import ( + _select_plot_list, + load_pkl, + write_pkl, +) + +from ..const import DEFAULT_TRIALS, SupportedModels from ..operator_config import ForecastOperatorConfig +from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -import traceback - # def _get_np_metrics_dict(selected_metric): # metric_translation = { @@ -62,7 +52,7 @@ object="NeuralProphet", install_from=OptionalDependency.FORECAST, ) -def _fit_model(data, params, additional_regressors, select_metric): +def _fit_model(data, params, additional_regressors): from neuralprophet import NeuralProphet, set_log_level if logger.level > 10: @@ -70,13 +60,12 @@ def _fit_model(data, params, additional_regressors, select_metric): disable_print() m = NeuralProphet(**params) - # m.metrics = _get_np_metrics_dict(select_metric) for add_reg in additional_regressors: m = m.add_future_regressor(name=add_reg) m.fit(df=data) - accepted_regressors_config = m.config_regressors or dict() + accepted_regressors_config = m.config_regressors or {} if hasattr(accepted_regressors_config, "regressors"): - accepted_regressors_config = accepted_regressors_config.regressors or dict() + accepted_regressors_config = accepted_regressors_config.regressors or {} enable_print() return m, list(accepted_regressors_config.keys()) @@ -97,11 +86,12 @@ def _load_model(self): self.loaded_trainers = load_pkl( self.spec.previous_output_dir + "/trainer.pkl" ) - except: - logger.debug("model.pkl/trainer.pkl is not present") + except Exception as e: + logger.debug(f"model.pkl/trainer.pkl is not present. Error message: {e}") def set_kwargs(self): # Extract the Confidence Interval Width and convert to prophet's equivalent - interval_width + model_kwargs = self.spec.model_kwargs if self.spec.confidence_interval_width is None: quantiles = model_kwargs.get("quantiles", [0.05, 0.95]) self.spec.confidence_interval_width = float(quantiles[1]) - float( @@ -110,8 +100,6 @@ def set_kwargs(self): else: boundaries = round((1 - self.spec.confidence_interval_width) / 2, 2) quantiles = [boundaries, self.spec.confidence_interval_width + boundaries] - - model_kwargs = self.spec.model_kwargs model_kwargs["quantiles"] = quantiles return model_kwargs @@ -124,12 +112,10 @@ def _train_model(self, i, s_id, df, model_kwargs): if self.loaded_models is not None and s_id in self.loaded_models: model = self.loaded_models[s_id] - accepted_regressors_config = ( - model.config_regressors.regressors or dict() - ) + accepted_regressors_config = model.config_regressors.regressors or {} if hasattr(accepted_regressors_config, "regressors"): accepted_regressors_config = ( - accepted_regressors_config.regressors or dict() + accepted_regressors_config.regressors or {} ) self.accepted_regressors[s_id] = list(accepted_regressors_config.keys()) if self.loaded_trainers is not None and s_id in self.loaded_trainers: @@ -143,8 +129,6 @@ def _train_model(self, i, s_id, df, model_kwargs): data=data_i, params=model_kwargs, additional_regressors=self.additional_regressors, - select_metric=None, - # select_metric=self.spec.metric, ) logger.debug( @@ -205,7 +189,6 @@ def _train_model(self, i, s_id, df, model_kwargs): "config_normalization": model.config_normalization, "config_missing": model.config_missing, "config_model": model.config_model, - "config_normalization": model.config_normalization, "data_freq": model.data_freq, "fitted": model.fitted, "data_params": model.data_params, @@ -220,19 +203,19 @@ def _train_model(self, i, s_id, df, model_kwargs): self.errors_dict[s_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(traceback.format_exc()) raise e def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() - self.trainers = dict() - self.outputs = dict() - self.errors_dict = dict() - self.explanations_info = dict() - self.accepted_regressors = dict() + self.models = {} + self.trainers = {} + self.outputs = {} + self.errors_dict = {} + self.explanations_info = {} + self.accepted_regressors = {} self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -282,7 +265,6 @@ def objective(trial): data=df_train, params=params, additional_regressors=self.additional_regressors, - select_metric=self.spec.metric, ) df_test = df_test[["y", "ds"] + accepted_regressors] @@ -326,6 +308,8 @@ def objective(trial): def _generate_report(self): import report_creator as rc + logging.getLogger("root").setLevel(logging.WARNING) + series_ids = self.models.keys() all_sections = [] if len(series_ids) > 0: @@ -371,7 +355,7 @@ def _generate_report(self): sec5_text = rc.Heading("Neural Prophet Model Parameters", level=2) model_states = [] - for i, (s_id, m) in enumerate(self.models.items()): + for s_id, m in self.models.items(): model_states.append( pd.Series( m.state_dict(), @@ -449,7 +433,7 @@ def _save_model(self, output_dir, storage_options): ) def explain_model(self): - self.local_explanation = dict() + self.local_explanation = {} global_expl = [] rename_cols = { f"future_regressor_{col}": col diff --git a/ads/opctl/operator/lowcode/forecast/model/prophet.py b/ads/opctl/operator/lowcode/forecast/model/prophet.py index 40c842911..fc70b6c11 100644 --- a/ads/opctl/operator/lowcode/forecast/model/prophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/prophet.py @@ -1,17 +1,23 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +import traceback + +import matplotlib as mpl import numpy as np import optuna import pandas as pd -import logging from joblib import Parallel, delayed -from ads.common.decorator.runtime_dependency import runtime_dependency + from ads.opctl import logger +from ads.opctl.operator.lowcode.common.utils import set_log_level from ads.opctl.operator.lowcode.forecast.operator_config import ForecastOperatorConfig +from ads.opctl.operator.lowcode.forecast.utils import ( + _select_plot_list, +) from ..const import ( DEFAULT_TRIALS, @@ -19,23 +25,14 @@ ForecastOutputColumns, SupportedModels, ) -from ads.opctl.operator.lowcode.forecast.utils import ( - _select_plot_list, - _label_encode_dataframe, -) -from ads.opctl.operator.lowcode.common.utils import set_log_level from .base_model import ForecastOperatorBaseModel -from ..operator_config import ForecastOperatorConfig from .forecast_datasets import ForecastDatasets, ForecastOutput -import traceback -import matplotlib as mpl - try: set_log_level("prophet", logger.level) set_log_level("cmdstanpy", logger.level) mpl.rcParams["figure.max_open_warning"] = 100 -except: +except Exception: pass @@ -73,9 +70,6 @@ def set_kwargs(self): def _train_model(self, i, series_id, df, model_kwargs): try: - from prophet import Prophet - from prophet.diagnostics import cross_validation, performance_metrics - self.forecast_output.init_series_output( series_id=series_id, data_at_series=df ) @@ -130,15 +124,15 @@ def _train_model(self, i, series_id, df, model_kwargs): self.errors_dict[series_id] = { "model_name": self.spec.model, "error": str(e), - "error_trace": traceback.format_exc() + "error_trace": traceback.format_exc(), } logger.warn(f"Encountered Error: {e}. Skipping.") logger.warn(traceback.format_exc()) def _build_model(self) -> pd.DataFrame: full_data_dict = self.datasets.get_data_by_series() - self.models = dict() - self.outputs = dict() + self.models = {} + self.outputs = {} self.additional_regressors = self.datasets.get_additional_data_column_names() model_kwargs = self.set_kwargs() self.forecast_output = ForecastOutput( @@ -249,6 +243,8 @@ def _generate_report(self): import report_creator as rc from prophet.plot import add_changepoints_to_plot + logging.getLogger("root").setLevel(logging.WARNING) + series_ids = self.models.keys() all_sections = [] if len(series_ids) > 0: @@ -351,7 +347,6 @@ def _generate_report(self): # Append the global explanation text and section to the "all_sections" list all_sections = all_sections + [ global_explanation_section, - local_explanation_text, local_explanation_section, ] except Exception as e: From 62556700e3545e33bc42003feeb9803c00678151 Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 11:30:28 +0000 Subject: [PATCH 09/29] anomaly updates --- .../lowcode/anomaly/model/anomaly_merlion.py | 11 ++++---- .../operator/lowcode/anomaly/model/automlx.py | 20 ++++++++------ .../operator/lowcode/anomaly/model/autots.py | 9 ++++--- .../lowcode/anomaly/model/base_model.py | 26 ++++++++++++++----- .../lowcode/anomaly/model/isolationforest.py | 19 +++++++------- .../lowcode/anomaly/model/oneclasssvm.py | 21 +++++++-------- .../lowcode/anomaly/model/randomcutforest.py | 8 ++++-- 7 files changed, 68 insertions(+), 46 deletions(-) diff --git a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py index cc1e80b52..308d97370 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +++ b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py @@ -4,9 +4,11 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import importlib +import logging import numpy as np import pandas as pd +import report_creator as rc from merlion.post_process.threshold import AggregateAlarms from merlion.utils import TimeSeries @@ -21,6 +23,8 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel +logging.getLogger("root").setLevel(logging.WARNING) + class AnomalyMerlionOperatorModel(AnomalyOperatorBaseModel): """Class representing Merlion Anomaly Detection operator model.""" @@ -84,7 +88,7 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): data = df.set_index(date_column) data = TimeSeries.from_pd(data) - for model_name, (model_config, model) in model_config_map.items(): + for _, (model_config, model) in model_config_map.items(): if self.spec.model == SupportedModels.BOCPD: model_config = model_config(**self.spec.model_kwargs) else: @@ -115,7 +119,7 @@ def _build_model(self) -> AnomalyOutput: y_pred = (y_pred.to_pd().reset_index()["anom_score"] > 0).astype( int ) - except Exception as e: + except Exception: y_pred = ( scores["anom_score"] > np.percentile( @@ -135,15 +139,12 @@ def _build_model(self) -> AnomalyOutput: OutputColumns.SCORE_COL: scores["anom_score"], } ).reset_index(drop=True) - # model_objects[model_name].append(model) anomaly_output.add_output(target, anomaly, score) return anomaly_output def _generate_report(self): """Genreates a report for the model.""" - import report_creator as rc - other_sections = [ rc.Heading("Selected Models Overview", level=2), rc.Text( diff --git a/ads/opctl/operator/lowcode/anomaly/model/automlx.py b/ads/opctl/operator/lowcode/anomaly/model/automlx.py index a6deef1fa..6e665c125 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/automlx.py +++ b/ads/opctl/operator/lowcode/anomaly/model/automlx.py @@ -1,16 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency -from .anomaly_dataset import AnomalyOutput +from ads.opctl import logger +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns +from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns + +logging.getLogger("root").setLevel(logging.WARNING) class AutoMLXOperatorModel(AnomalyOperatorBaseModel): @@ -25,16 +30,17 @@ class AutoMLXOperatorModel(AnomalyOperatorBaseModel): ), ) def _build_model(self) -> pd.DataFrame: - from automlx import init import logging + import automlx + try: - init( + automlx.init( engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL, ) - except Exception as e: + except Exception: logger.info("Ray already initialized") date_column = self.spec.datetime_column.name anomaly_output = AnomalyOutput(date_column=date_column) @@ -73,8 +79,6 @@ def _build_model(self) -> pd.DataFrame: return anomaly_output def _generate_report(self): - import report_creator as rc - """The method that needs to be implemented on the particular model level.""" other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/autots.py b/ads/opctl/operator/lowcode/anomaly/model/autots.py index c795440de..32702596c 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/autots.py +++ b/ads/opctl/operator/lowcode/anomaly/model/autots.py @@ -1,9 +1,12 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + +import report_creator as rc + from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger from ads.opctl.operator.lowcode.anomaly.const import OutputColumns @@ -12,6 +15,8 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel +logging.getLogger("root").setLevel(logging.WARNING) + class AutoTSOperatorModel(AnomalyOperatorBaseModel): """Class representing AutoTS Anomaly Detection operator model.""" @@ -91,8 +96,6 @@ def _build_model(self) -> AnomalyOutput: return anomaly_output def _generate_report(self): - import report_creator as rc - """The method that needs to be implemented on the particular model level.""" other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py index e8de5213e..c24068ccb 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/base_model.py +++ b/ads/opctl/operator/lowcode/anomaly/model/base_model.py @@ -3,6 +3,7 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os import tempfile import time @@ -12,6 +13,7 @@ import fsspec import numpy as np import pandas as pd +import report_creator as rc from sklearn import linear_model from ads.common.object_storage_details import ObjectStorageDetails @@ -33,6 +35,8 @@ from ..operator_config import AnomalyOperatorConfig, AnomalyOperatorSpec from .anomaly_dataset import AnomalyDatasets, AnomalyOutput, TestData +logging.getLogger("root").setLevel(logging.WARNING) + class AnomalyOperatorBaseModel(ABC): """The base class for the anomaly detection operator models.""" @@ -59,8 +63,8 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets): def generate_report(self): """Generates the report.""" import matplotlib.pyplot as plt - plt.rcParams.update({'figure.max_open_warning': 0}) - import report_creator as rc + + plt.rcParams.update({"figure.max_open_warning": 0}) start_time = time.time() # fallback using sklearn oneclasssvm when the sub model _build_model fails @@ -84,7 +88,13 @@ def generate_report(self): anomaly_output, test_data, elapsed_time ) table_blocks = [ - rc.DataTable(df.head(SUBSAMPLE_THRESHOLD) if self.spec.subsample_report_data and len(df) > SUBSAMPLE_THRESHOLD else df, label=col, index=True) + rc.DataTable( + df.head(SUBSAMPLE_THRESHOLD) + if self.spec.subsample_report_data and len(df) > SUBSAMPLE_THRESHOLD + else df, + label=col, + index=True, + ) for col, df in self.datasets.full_data_dict.items() ] data_table = rc.Select(blocks=table_blocks) @@ -144,7 +154,9 @@ def generate_report(self): else: figure_blocks = None - blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None + blocks.append( + rc.Group(*figure_blocks, label=target) + ) if figure_blocks else None plots = rc.Select(blocks) report_sections = [] @@ -154,7 +166,9 @@ def generate_report(self): yaml_appendix = rc.Yaml(self.config.to_dict()) summary = rc.Block( rc.Group( - rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"), + rc.Text( + f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n" + ), rc.Text( "Based on your dataset, you could have also selected " f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`." @@ -285,8 +299,6 @@ def _save_report( test_metrics: pd.DataFrame, ): """Saves resulting reports to the given folder.""" - import report_creator as rc - unique_output_dir = self.spec.output_directory.url if ObjectStorageDetails.is_oci_path(unique_output_dir): diff --git a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py index 0083ad0fd..b5adfd6cc 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py @@ -1,17 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import numpy as np import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns -from .base_model import AnomalyOperatorBaseModel from .anomaly_dataset import AnomalyOutput -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns +from .base_model import AnomalyOperatorBaseModel + +logging.getLogger("root").setLevel(logging.WARNING) class IsolationForestOperatorModel(AnomalyOperatorBaseModel): @@ -36,13 +40,9 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): model = IsolationForest(**model_kwargs) model.fit(df) - y_pred = np.vectorize(self.outlier_map.get)( - model.predict(df) - ) + y_pred = np.vectorize(self.outlier_map.get)(model.predict(df)) - scores = model.score_samples( - df - ) + scores = model.score_samples(df) index_col = df.columns[0] @@ -59,7 +59,6 @@ def _build_model(self) -> AnomalyOutput: def _generate_report(self): """Generates the report.""" - import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py index 157f7eb60..c6d3269ad 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +++ b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py @@ -1,17 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import numpy as np import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency +from ads.opctl.operator.lowcode.anomaly.const import OutputColumns -from .base_model import AnomalyOperatorBaseModel from .anomaly_dataset import AnomalyOutput -from ads.opctl.operator.lowcode.anomaly.const import OutputColumns +from .base_model import AnomalyOperatorBaseModel + +logging.getLogger("root").setLevel(logging.WARNING) class OneClassSVMOperatorModel(AnomalyOperatorBaseModel): @@ -36,13 +40,9 @@ def _build_model(self) -> AnomalyOutput: for target, df in self.datasets.full_data_dict.items(): model = OneClassSVM(**model_kwargs) model.fit(df) - y_pred = np.vectorize(self.outlier_map.get)( - model.predict(df) - ) + y_pred = np.vectorize(self.outlier_map.get)(model.predict(df)) - scores = model.score_samples( - df - ) + scores = model.score_samples(df) index_col = df.columns[0] @@ -54,12 +54,11 @@ def _build_model(self) -> AnomalyOutput: ).reset_index(drop=True) anomaly_output.add_output(target, anomaly, score) - + return anomaly_output def _generate_report(self): """Generates the report.""" - import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), diff --git a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py index 17f19351d..0ea344228 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py @@ -3,8 +3,11 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging + import numpy as np import pandas as pd +import report_creator as rc from ads.common.decorator.runtime_dependency import runtime_dependency from ads.opctl import logger @@ -13,6 +16,8 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel +logging.getLogger("root").setLevel(logging.WARNING) + class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): """ @@ -27,7 +32,7 @@ class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): ), ) def _build_model(self) -> AnomalyOutput: - from rrcf import RCTree + import rrcf model_kwargs = self.spec.model_kwargs @@ -96,7 +101,6 @@ def _build_model(self) -> AnomalyOutput: def _generate_report(self): """Generates the report.""" - import report_creator as rc other_sections = [ rc.Heading("Selected Models Overview", level=2), From 72b991d73072e6bec501905bbca3e52e4ea2c529 Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 11:42:00 +0000 Subject: [PATCH 10/29] add in changes to other operators --- .../operator/lowcode/pii/model/report.py | 18 +-- .../lowcode/recommender/model/base_model.py | 103 ++++++++++-------- .../operator/lowcode/recommender/model/svd.py | 70 +++++++----- 3 files changed, 110 insertions(+), 81 deletions(-) diff --git a/ads/opctl/operator/lowcode/pii/model/report.py b/ads/opctl/operator/lowcode/pii/model/report.py index 50e0fe579..70ef098d8 100644 --- a/ads/opctl/operator/lowcode/pii/model/report.py +++ b/ads/opctl/operator/lowcode/pii/model/report.py @@ -1,10 +1,10 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os import random import tempfile @@ -40,11 +40,13 @@ try: import report_creator as rc -except ImportError: +except ImportError as e: raise ModuleNotFoundError( f"`report-creator` module was not found. Please run " f"`pip install {OptionalDependency.PII}`." - ) + ) from e + +logging.getLogger("root").setLevel(logging.WARNING) @dataclass(repr=True) @@ -139,13 +141,13 @@ def make_model_card(model_name="", readme_path=""): fig = go.Figure( data=[ go.Table( - header=dict(values=list(df.columns)), - cells=dict(values=[df.Metrics, df.Values]), + header={"Columns": df.columns}, + cells={"Metrics": df.Metrics, "Values": df.Values}, ) ] ) eval_res_tb = rc.Widget(data=fig, caption="Evaluation Results") - except: + except Exception: eval_res_tb = rc.Text("-") logger.warning( "The given readme.md doesn't have correct template for Evaluation Results." @@ -321,7 +323,9 @@ def make_view(self): self.report_sections = [title_text, report_description, time_proceed, structure] return self - def save_report(self, report_sections=None, report_uri=None, storage_options={}): + def save_report( + self, report_sections=None, report_uri=None, storage_options: Dict = None + ): with tempfile.TemporaryDirectory() as temp_dir: report_local_path = os.path.join(temp_dir, "___report.html") disable_print() diff --git a/ads/opctl/operator/lowcode/recommender/model/base_model.py b/ads/opctl/operator/lowcode/recommender/model/base_model.py index f317b19b0..c345f84a7 100644 --- a/ads/opctl/operator/lowcode/recommender/model/base_model.py +++ b/ads/opctl/operator/lowcode/recommender/model/base_model.py @@ -1,39 +1,43 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging import os import tempfile import time from abc import ABC, abstractmethod -from typing import Tuple, Dict +from typing import Dict, Tuple import fsspec import pandas as pd import report_creator as rc +from plotly import graph_objects as go from ads.common.object_storage_details import ObjectStorageDetails from ads.opctl import logger -from ads.opctl.operator.lowcode.common.utils import default_signer from ads.opctl.operator.lowcode.common.utils import ( - human_time_friendly, - enable_print, + default_signer, disable_print, + enable_print, + human_time_friendly, write_data, ) + +from ..operator_config import RecommenderOperatorConfig from .factory import SupportedModels from .recommender_dataset import RecommenderDatasets -from ..operator_config import RecommenderOperatorConfig -from plotly import graph_objects as go -import matplotlib.pyplot as plt + +logging.getLogger("root").setLevel(logging.WARNING) class RecommenderOperatorBaseModel(ABC): """The base class for the recommender detection operator models.""" - def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets): + def __init__( + self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets + ): self.config = config self.spec = self.config.spec self.datasets = datasets @@ -71,7 +75,7 @@ def generate_report(self): rc.Metric( heading="Num items", value=len(self.datasets.items), - ) + ), ), ) @@ -83,62 +87,67 @@ def generate_report(self): user_rating_counts = self.datasets.interactions[user_col].value_counts() fig_user = go.Figure(data=[go.Histogram(x=user_rating_counts, nbinsx=100)]) fig_user.update_layout( - title=f'Distribution of the number of interactions by {user_col}', - xaxis_title=f'Number of {interaction_col}', - yaxis_title=f'Number of {user_col}', - bargap=0.2 + title=f"Distribution of the number of interactions by {user_col}", + xaxis_title=f"Number of {interaction_col}", + yaxis_title=f"Number of {user_col}", + bargap=0.2, ) item_title = rc.Heading("Item Statistics", level=2) item_rating_counts = self.datasets.interactions[item_col].value_counts() fig_item = go.Figure(data=[go.Histogram(x=item_rating_counts, nbinsx=100)]) fig_item.update_layout( - title=f'Distribution of the number of interactions by {item_col}', - xaxis_title=f'Number of {interaction_col}', - yaxis_title=f'Number of {item_col}', - bargap=0.2 + title=f"Distribution of the number of interactions by {item_col}", + xaxis_title=f"Number of {interaction_col}", + yaxis_title=f"Number of {item_col}", + bargap=0.2, ) result_heatmap_title = rc.Heading("Sample Recommendations", level=2) sample_items = result_df[item_col].head(100).index filtered_df = result_df[result_df[item_col].isin(sample_items)] - data = filtered_df.pivot(index=user_col, columns=item_col, values=interaction_col) - fig = go.Figure(data=go.Heatmap( - z=data.values, - x=data.columns, - y=data.index, - colorscale='Viridis' - )) + data = filtered_df.pivot( + index=user_col, columns=item_col, values=interaction_col + ) + fig = go.Figure( + data=go.Heatmap( + z=data.values, x=data.columns, y=data.index, colorscale="Viridis" + ) + ) fig.update_layout( - title='Recommendation heatmap of User-Item Interactions (sample)', + title="Recommendation heatmap of User-Item Interactions (sample)", width=1500, height=800, xaxis_title=item_col, yaxis_title=user_col, - coloraxis_colorbar=dict(title=interaction_col) + coloraxis_colorbar={"title": interaction_col}, ) - plots = [user_title, rc.Widget(fig_user), - item_title, rc.Widget(fig_item), - result_heatmap_title, rc.Widget(fig)] + plots = [ + user_title, + rc.Widget(fig_user), + item_title, + rc.Widget(fig_item), + result_heatmap_title, + rc.Widget(fig), + ] test_metrics_sections = [rc.DataTable(pd.DataFrame(metrics, index=[0]))] yaml_appendix_title = rc.Heading("Reference: YAML File", level=2) yaml_appendix = rc.Yaml(self.config.to_dict()) report_sections = ( - [summary] - + plots - + test_metrics_sections - + other_sections - + [yaml_appendix_title, yaml_appendix] + [summary] + + plots + + test_metrics_sections + + other_sections + + [yaml_appendix_title, yaml_appendix] ) # save the report and result CSV - self._save_report( - report_sections=report_sections, - result_df=result_df - ) + self._save_report(report_sections=report_sections, result_df=result_df) + @abstractmethod def _evaluation_metrics(self): pass + @abstractmethod def _test_data_evaluate_metrics(self): pass @@ -150,7 +159,7 @@ def _save_report(self, report_sections: Tuple, result_df: pd.DataFrame): if ObjectStorageDetails.is_oci_path(unique_output_dir): storage_options = default_signer() else: - storage_options = dict() + storage_options = {} # report-creator html report if self.spec.generate_report: @@ -161,19 +170,23 @@ def _save_report(self, report_sections: Tuple, result_df: pd.DataFrame): report.save(rc.Block(*report_sections), report_local_path) enable_print() - report_path = os.path.join(unique_output_dir, self.spec.report_filename) + report_path = os.path.join( + unique_output_dir, self.spec.report_filename + ) with open(report_local_path) as f1: with fsspec.open( - report_path, - "w", - **storage_options, + report_path, + "w", + **storage_options, ) as f2: f2.write(f1.read()) # recommender csv report write_data( data=result_df, - filename=os.path.join(unique_output_dir, self.spec.recommendations_filename), + filename=os.path.join( + unique_output_dir, self.spec.recommendations_filename + ), format="csv", storage_options=storage_options, ) diff --git a/ads/opctl/operator/lowcode/recommender/model/svd.py b/ads/opctl/operator/lowcode/recommender/model/svd.py index 968170986..a92a51fda 100644 --- a/ads/opctl/operator/lowcode/recommender/model/svd.py +++ b/ads/opctl/operator/lowcode/recommender/model/svd.py @@ -1,28 +1,30 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -from typing import Tuple, Dict, Any - # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import logging +from typing import Dict, Tuple import pandas as pd +import report_creator as rc from pandas import DataFrame +from surprise import SVD, Dataset, Reader +from surprise.accuracy import mae, rmse +from surprise.model_selection import train_test_split -from .recommender_dataset import RecommenderDatasets +from ..constant import SupportedMetrics from ..operator_config import RecommenderOperatorConfig from .factory import RecommenderOperatorBaseModel -from surprise import Dataset, Reader -from surprise.model_selection import train_test_split -from surprise import SVD -from surprise.accuracy import rmse, mae -import report_creator as rc -from ..constant import SupportedMetrics +from .recommender_dataset import RecommenderDatasets + +logging.getLogger("root").setLevel(logging.WARNING) class SVDOperatorModel(RecommenderOperatorBaseModel): """Class representing scikit surprise SVD operator model.""" - def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets): + def __init__( + self, config: RecommenderOperatorConfig, datasets: RecommenderDatasets + ): super().__init__(config, datasets) self.interactions = datasets.interactions self.users = datasets.users @@ -35,8 +37,12 @@ def __init__(self, config: RecommenderOperatorConfig, datasets: RecommenderDatas def _get_recommendations(self, user_id, n): all_item_ids = self.items[self.item_id].unique() - rated_items = self.interactions[self.interactions[self.user_id] == user_id][self.item_id] - unrated_items = [item_id for item_id in all_item_ids if item_id not in rated_items.values] + rated_items = self.interactions[self.interactions[self.user_id] == user_id][ + self.item_id + ] + unrated_items = [ + item_id for item_id in all_item_ids if item_id not in rated_items.values + ] predictions = [self.algo.predict(user_id, item_id) for item_id in unrated_items] predictions.sort(key=lambda x: x.est, reverse=True) top_n_recommendations = predictions[:n] @@ -46,7 +52,10 @@ def _build_model(self) -> Tuple[DataFrame, Dict]: min_rating = self.interactions[self.interaction_column].min() max_rating = self.interactions[self.interaction_column].max() reader = Reader(rating_scale=(min_rating, max_rating)) - data = Dataset.load_from_df(self.interactions[[self.user_id, self.item_id, self.interaction_column]], reader) + data = Dataset.load_from_df( + self.interactions[[self.user_id, self.item_id, self.interaction_column]], + reader, + ) trainset, testset = train_test_split(data, test_size=self.test_size) self.algo.fit(trainset) predictions = self.algo.test(testset) @@ -58,11 +67,13 @@ def _build_model(self) -> Tuple[DataFrame, Dict]: for user_id in self.users[self.user_id]: recommendations = self._get_recommendations(user_id, n=self.spec.top_k) for item_id, est_rating in recommendations: - all_recommendations.append({ - self.user_id: user_id, - self.item_id: item_id, - self.interaction_column: est_rating - }) + all_recommendations.append( + { + self.user_id: user_id, + self.item_id: item_id, + self.interaction_column: est_rating, + } + ) recommendations_df = pd.DataFrame(all_recommendations) return recommendations_df, metric @@ -72,17 +83,18 @@ def _generate_report(self): decompose a user-item interaction matrix into three constituent matrices. These matrices capture the latent factors that explain the observed interactions. """ - new_user_recommendations = self._get_recommendations("__new_user__", self.spec.top_k) + new_user_recommendations = self._get_recommendations( + "__new_user__", self.spec.top_k + ) new_recommendations = [] for item_id, est_rating in new_user_recommendations: - new_recommendations.append({ - self.user_id: "__new_user__", - self.item_id: item_id, - self.interaction_column: est_rating - }) + new_recommendations.append( + { + self.user_id: "__new_user__", + self.item_id: item_id, + self.interaction_column: est_rating, + } + ) title = rc.Heading("Recommendations for new users", level=2) other_sections = [title, rc.DataTable(new_recommendations)] - return ( - model_description, - other_sections - ) + return (model_description, other_sections) From a5fbe80c15f8c5d13816f19fb59fa6476f4a467f Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 12:06:00 +0000 Subject: [PATCH 11/29] fixing recommender svd class issue --- ads/opctl/operator/lowcode/recommender/model/svd.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ads/opctl/operator/lowcode/recommender/model/svd.py b/ads/opctl/operator/lowcode/recommender/model/svd.py index a92a51fda..a3565c905 100644 --- a/ads/opctl/operator/lowcode/recommender/model/svd.py +++ b/ads/opctl/operator/lowcode/recommender/model/svd.py @@ -98,3 +98,9 @@ def _generate_report(self): title = rc.Heading("Recommendations for new users", level=2) other_sections = [title, rc.DataTable(new_recommendations)] return (model_description, other_sections) + + def _evaluation_metrics(self): + pass + + def _test_data_evaluate_metrics(self): + pass From 8134de955343624f48e57d62459dcca76d21f829 Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 12:09:05 +0000 Subject: [PATCH 12/29] update data file --- ads/opctl/operator/lowcode/common/data.py | 35 ++++++++++++----------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/ads/opctl/operator/lowcode/common/data.py b/ads/opctl/operator/lowcode/common/data.py index 530a1d392..9426bd284 100644 --- a/ads/opctl/operator/lowcode/common/data.py +++ b/ads/opctl/operator/lowcode/common/data.py @@ -1,29 +1,28 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import time -from .transformations import Transformations +from abc import ABC, abstractmethod + +import pandas as pd + from ads.opctl import logger from ads.opctl.operator.lowcode.common.const import DataColumns -from ads.opctl.operator.lowcode.common.utils import load_data from ads.opctl.operator.lowcode.common.errors import ( - InputDataError, InvalidParameterError, - PermissionsError, - DataMismatchError, ) -from abc import ABC -import pandas as pd +from ads.opctl.operator.lowcode.common.utils import load_data + +from .transformations import Transformations class AbstractData(ABC): def __init__(self, spec: dict, name="input_data"): self.Transformations = Transformations self.data = None - self._data_dict = dict() + self._data_dict = {} self.name = name self.spec = spec self.load_transform_ingest_data(spec) @@ -35,12 +34,15 @@ def get_raw_data_by_cat(self, category): condition = pd.Series(True, index=self.raw_data.index) if category in mapping: for col, val in mapping[category].items(): - condition &= (self.raw_data[col] == val) + condition &= self.raw_data[col] == val data_by_cat = self.raw_data[condition].reset_index(drop=True) - data_by_cat = self._data_transformer._format_datetime_col(data_by_cat) if self.spec.datetime_column else data_by_cat + data_by_cat = ( + self._data_transformer._format_datetime_col(data_by_cat) + if self.spec.datetime_column + else data_by_cat + ) return data_by_cat - def get_dict_by_series(self): if not self._data_dict: for s_id in self.list_series_ids(): @@ -59,12 +61,12 @@ def get_data_for_series(self, series_id): data_dict = self.get_dict_by_series() try: return data_dict[series_id] - except: + except Exception as e: raise InvalidParameterError( f"Unable to retrieve series {series_id} from {self.name}. Available series ids are: {self.list_series_ids()}" - ) + ) from e - def _load_data(self, data_spec, **kwargs): + def _load_data(self, data_spec): loading_start_time = time.time() try: raw_data = load_data(data_spec) @@ -77,7 +79,7 @@ def _load_data(self, data_spec, **kwargs): ) return raw_data - def _transform_data(self, spec, raw_data, **kwargs): + def _transform_data(self, spec, raw_data): transformation_start_time = time.time() self._data_transformer = self.Transformations(spec, name=self.name) data = self._data_transformer.run(raw_data) @@ -92,6 +94,7 @@ def load_transform_ingest_data(self, spec): self.data = self._transform_data(spec, self.raw_data) self._ingest_data(spec) + @abstractmethod def _ingest_data(self, spec): pass From 25e6497b1b0e8cca5310a1a2a077a604db7d5b3a Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 15:18:58 +0000 Subject: [PATCH 13/29] revert data class --- ads/opctl/operator/lowcode/common/data.py | 35 +++++++++++------------ 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/ads/opctl/operator/lowcode/common/data.py b/ads/opctl/operator/lowcode/common/data.py index 9426bd284..530a1d392 100644 --- a/ads/opctl/operator/lowcode/common/data.py +++ b/ads/opctl/operator/lowcode/common/data.py @@ -1,28 +1,29 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import time -from abc import ABC, abstractmethod - -import pandas as pd - +from .transformations import Transformations from ads.opctl import logger from ads.opctl.operator.lowcode.common.const import DataColumns +from ads.opctl.operator.lowcode.common.utils import load_data from ads.opctl.operator.lowcode.common.errors import ( + InputDataError, InvalidParameterError, + PermissionsError, + DataMismatchError, ) -from ads.opctl.operator.lowcode.common.utils import load_data - -from .transformations import Transformations +from abc import ABC +import pandas as pd class AbstractData(ABC): def __init__(self, spec: dict, name="input_data"): self.Transformations = Transformations self.data = None - self._data_dict = {} + self._data_dict = dict() self.name = name self.spec = spec self.load_transform_ingest_data(spec) @@ -34,15 +35,12 @@ def get_raw_data_by_cat(self, category): condition = pd.Series(True, index=self.raw_data.index) if category in mapping: for col, val in mapping[category].items(): - condition &= self.raw_data[col] == val + condition &= (self.raw_data[col] == val) data_by_cat = self.raw_data[condition].reset_index(drop=True) - data_by_cat = ( - self._data_transformer._format_datetime_col(data_by_cat) - if self.spec.datetime_column - else data_by_cat - ) + data_by_cat = self._data_transformer._format_datetime_col(data_by_cat) if self.spec.datetime_column else data_by_cat return data_by_cat + def get_dict_by_series(self): if not self._data_dict: for s_id in self.list_series_ids(): @@ -61,12 +59,12 @@ def get_data_for_series(self, series_id): data_dict = self.get_dict_by_series() try: return data_dict[series_id] - except Exception as e: + except: raise InvalidParameterError( f"Unable to retrieve series {series_id} from {self.name}. Available series ids are: {self.list_series_ids()}" - ) from e + ) - def _load_data(self, data_spec): + def _load_data(self, data_spec, **kwargs): loading_start_time = time.time() try: raw_data = load_data(data_spec) @@ -79,7 +77,7 @@ def _load_data(self, data_spec): ) return raw_data - def _transform_data(self, spec, raw_data): + def _transform_data(self, spec, raw_data, **kwargs): transformation_start_time = time.time() self._data_transformer = self.Transformations(spec, name=self.name) data = self._data_transformer.run(raw_data) @@ -94,7 +92,6 @@ def load_transform_ingest_data(self, spec): self.data = self._transform_data(spec, self.raw_data) self._ingest_data(spec) - @abstractmethod def _ingest_data(self, spec): pass From 5e926662dc0017d509e972cad87d28937766616d Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 15:34:32 +0000 Subject: [PATCH 14/29] bump rc version --- ads/opctl/operator/lowcode/common/data.py | 24 ++++++++++++----------- pyproject.toml | 8 ++++---- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/ads/opctl/operator/lowcode/common/data.py b/ads/opctl/operator/lowcode/common/data.py index 530a1d392..c85d5e5df 100644 --- a/ads/opctl/operator/lowcode/common/data.py +++ b/ads/opctl/operator/lowcode/common/data.py @@ -1,22 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import time -from .transformations import Transformations +from abc import ABC + +import pandas as pd + from ads.opctl import logger from ads.opctl.operator.lowcode.common.const import DataColumns -from ads.opctl.operator.lowcode.common.utils import load_data from ads.opctl.operator.lowcode.common.errors import ( - InputDataError, InvalidParameterError, - PermissionsError, - DataMismatchError, ) -from abc import ABC -import pandas as pd +from ads.opctl.operator.lowcode.common.utils import load_data + +from .transformations import Transformations class AbstractData(ABC): @@ -35,12 +34,15 @@ def get_raw_data_by_cat(self, category): condition = pd.Series(True, index=self.raw_data.index) if category in mapping: for col, val in mapping[category].items(): - condition &= (self.raw_data[col] == val) + condition &= self.raw_data[col] == val data_by_cat = self.raw_data[condition].reset_index(drop=True) - data_by_cat = self._data_transformer._format_datetime_col(data_by_cat) if self.spec.datetime_column else data_by_cat + data_by_cat = ( + self._data_transformer._format_datetime_col(data_by_cat) + if self.spec.datetime_column + else data_by_cat + ) return data_by_cat - def get_dict_by_series(self): if not self._data_dict: for s_id in self.list_series_ids(): diff --git a/pyproject.toml b/pyproject.toml index 833fefed6..4f3fe0e17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,13 +171,13 @@ forecast = [ "statsmodels", "plotly", "oracledb", - "report-creator==1.0.9", + "report-creator==1.0.27", ] anomaly = [ "oracle_ads[opctl]", "autots", "oracledb", - "report-creator==1.0.9", + "report-creator==1.0.27", "rrcf==0.4.4", "scikit-learn", "salesforce-merlion[all]==2.0.4" @@ -186,7 +186,7 @@ recommender = [ "oracle_ads[opctl]", "scikit-surprise", "plotly", - "report-creator==1.0.9", + "report-creator==1.0.27", ] feature-store-marketplace = [ "oracle-ads[opctl]", @@ -202,7 +202,7 @@ pii = [ "scrubadub_spacy", "spacy-transformers==1.2.5", "spacy==3.6.1", - "report-creator==1.0.9", + "report-creator==1.0.27", ] llm = ["langchain>=0.2", "langchain-community", "langchain_openai", "pydantic>=2,<3", "evaluate>=0.4.0"] aqua = ["jupyter_server"] From 12e24d54a1df4ceb4a24650dee78982e587b89b8 Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 16:30:51 +0000 Subject: [PATCH 15/29] upgrade to latest rc logging --- .../operator/lowcode/anomaly/model/anomaly_merlion.py | 2 +- ads/opctl/operator/lowcode/anomaly/model/automlx.py | 2 +- ads/opctl/operator/lowcode/anomaly/model/autots.py | 2 +- ads/opctl/operator/lowcode/anomaly/model/base_model.py | 2 +- .../operator/lowcode/anomaly/model/isolationforest.py | 2 +- ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py | 2 +- .../operator/lowcode/anomaly/model/randomcutforest.py | 2 +- ads/opctl/operator/lowcode/forecast/model/arima.py | 2 +- ads/opctl/operator/lowcode/forecast/model/automlx.py | 8 ++++---- ads/opctl/operator/lowcode/forecast/model/autots.py | 2 +- ads/opctl/operator/lowcode/forecast/model/base_model.py | 2 +- ads/opctl/operator/lowcode/forecast/model/ml_forecast.py | 2 +- .../operator/lowcode/forecast/model/neuralprophet.py | 2 +- ads/opctl/operator/lowcode/forecast/model/prophet.py | 2 +- ads/opctl/operator/lowcode/forecast/utils.py | 2 +- ads/opctl/operator/lowcode/pii/model/report.py | 2 +- .../operator/lowcode/recommender/model/base_model.py | 2 +- ads/opctl/operator/lowcode/recommender/model/svd.py | 2 +- pyproject.toml | 8 ++++---- 19 files changed, 25 insertions(+), 25 deletions(-) diff --git a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py index 308d97370..8999b2674 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py +++ b/ads/opctl/operator/lowcode/anomaly/model/anomaly_merlion.py @@ -23,7 +23,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class AnomalyMerlionOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/anomaly/model/automlx.py b/ads/opctl/operator/lowcode/anomaly/model/automlx.py index 6e665c125..059545cf8 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/automlx.py +++ b/ads/opctl/operator/lowcode/anomaly/model/automlx.py @@ -15,7 +15,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class AutoMLXOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/anomaly/model/autots.py b/ads/opctl/operator/lowcode/anomaly/model/autots.py index 32702596c..550833a67 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/autots.py +++ b/ads/opctl/operator/lowcode/anomaly/model/autots.py @@ -15,7 +15,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class AutoTSOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py index c24068ccb..c9ca984be 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/base_model.py +++ b/ads/opctl/operator/lowcode/anomaly/model/base_model.py @@ -35,7 +35,7 @@ from ..operator_config import AnomalyOperatorConfig, AnomalyOperatorSpec from .anomaly_dataset import AnomalyDatasets, AnomalyOutput, TestData -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class AnomalyOperatorBaseModel(ABC): diff --git a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py index b5adfd6cc..ef7715653 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py @@ -15,7 +15,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class IsolationForestOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py index c6d3269ad..f6177e63d 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py +++ b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py @@ -15,7 +15,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class OneClassSVMOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py index 0ea344228..ad34159ab 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py +++ b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py @@ -16,7 +16,7 @@ from .anomaly_dataset import AnomalyOutput from .base_model import AnomalyOperatorBaseModel -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class RandomCutForestOperatorModel(AnomalyOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/forecast/model/arima.py b/ads/opctl/operator/lowcode/forecast/model/arima.py index 87edccdfa..17817257f 100644 --- a/ads/opctl/operator/lowcode/forecast/model/arima.py +++ b/ads/opctl/operator/lowcode/forecast/model/arima.py @@ -20,7 +20,7 @@ from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class ArimaOperatorModel(ForecastOperatorBaseModel): diff --git a/ads/opctl/operator/lowcode/forecast/model/automlx.py b/ads/opctl/operator/lowcode/forecast/model/automlx.py index 41846a5d3..d2fc61778 100644 --- a/ads/opctl/operator/lowcode/forecast/model/automlx.py +++ b/ads/opctl/operator/lowcode/forecast/model/automlx.py @@ -24,7 +24,7 @@ from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) AUTOMLX_N_ALGOS_TUNED = 4 AUTOMLX_DEFAULT_SCORE_METRIC = "neg_sym_mean_abs_percent_error" @@ -78,10 +78,10 @@ def preprocess(self, data): # TODO: re-use self.le for explanations def _build_model(self) -> pd.DataFrame: import logging - import automlx + from automlx import Pipeline, init try: - automlx.init( + init( engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL, @@ -123,7 +123,7 @@ def _build_model(self) -> pd.DataFrame: if self.loaded_models is not None and s_id in self.loaded_models: model = self.loaded_models[s_id] else: - model = automlx.Pipeline( + model = Pipeline( task="forecasting", **model_kwargs, ) diff --git a/ads/opctl/operator/lowcode/forecast/model/autots.py b/ads/opctl/operator/lowcode/forecast/model/autots.py index fac04a898..ca3310bab 100644 --- a/ads/opctl/operator/lowcode/forecast/model/autots.py +++ b/ads/opctl/operator/lowcode/forecast/model/autots.py @@ -20,7 +20,7 @@ from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets, ForecastOutput -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) AUTOTS_MAX_GENERATION = 10 AUTOTS_MODELS_TO_VALIDATE = 0.15 diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py index 84aa53208..357426a79 100644 --- a/ads/opctl/operator/lowcode/forecast/model/base_model.py +++ b/ads/opctl/operator/lowcode/forecast/model/base_model.py @@ -51,7 +51,7 @@ from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec from .forecast_datasets import ForecastDatasets -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class ForecastOperatorBaseModel(ABC): diff --git a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py index 9907a26e7..1911ebf0c 100644 --- a/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py +++ b/ads/opctl/operator/lowcode/forecast/model/ml_forecast.py @@ -193,7 +193,7 @@ def _generate_report(self): import report_creator as rc from utilsforecast.plotting import plot_series - logging.getLogger("root").setLevel(logging.WARNING) + logging.getLogger("report_creator").setLevel(logging.WARNING) # Section 1: Forecast Overview sec1_text = rc.Block( diff --git a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py index 08afa092a..040f05748 100644 --- a/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/neuralprophet.py @@ -308,7 +308,7 @@ def objective(trial): def _generate_report(self): import report_creator as rc - logging.getLogger("root").setLevel(logging.WARNING) + logging.getLogger("report_creator").setLevel(logging.WARNING) series_ids = self.models.keys() all_sections = [] diff --git a/ads/opctl/operator/lowcode/forecast/model/prophet.py b/ads/opctl/operator/lowcode/forecast/model/prophet.py index fc70b6c11..24121b531 100644 --- a/ads/opctl/operator/lowcode/forecast/model/prophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/prophet.py @@ -243,7 +243,7 @@ def _generate_report(self): import report_creator as rc from prophet.plot import add_changepoints_to_plot - logging.getLogger("root").setLevel(logging.WARNING) + logging.getLogger("report_creator").setLevel(logging.WARNING) series_ids = self.models.keys() all_sections = [] diff --git a/ads/opctl/operator/lowcode/forecast/utils.py b/ads/opctl/operator/lowcode/forecast/utils.py index e3a88d7b7..72c7b727a 100644 --- a/ads/opctl/operator/lowcode/forecast/utils.py +++ b/ads/opctl/operator/lowcode/forecast/utils.py @@ -32,7 +32,7 @@ from .const import RENDER_LIMIT, SupportedMetrics -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) def _label_encode_dataframe(df, no_encode: Set = None): diff --git a/ads/opctl/operator/lowcode/pii/model/report.py b/ads/opctl/operator/lowcode/pii/model/report.py index 70ef098d8..d4fca2d9b 100644 --- a/ads/opctl/operator/lowcode/pii/model/report.py +++ b/ads/opctl/operator/lowcode/pii/model/report.py @@ -46,7 +46,7 @@ f"`pip install {OptionalDependency.PII}`." ) from e -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) @dataclass(repr=True) diff --git a/ads/opctl/operator/lowcode/recommender/model/base_model.py b/ads/opctl/operator/lowcode/recommender/model/base_model.py index c345f84a7..bd4ab9f3c 100644 --- a/ads/opctl/operator/lowcode/recommender/model/base_model.py +++ b/ads/opctl/operator/lowcode/recommender/model/base_model.py @@ -29,7 +29,7 @@ from .factory import SupportedModels from .recommender_dataset import RecommenderDatasets -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class RecommenderOperatorBaseModel(ABC): diff --git a/ads/opctl/operator/lowcode/recommender/model/svd.py b/ads/opctl/operator/lowcode/recommender/model/svd.py index a3565c905..7f86f4dfa 100644 --- a/ads/opctl/operator/lowcode/recommender/model/svd.py +++ b/ads/opctl/operator/lowcode/recommender/model/svd.py @@ -16,7 +16,7 @@ from .factory import RecommenderOperatorBaseModel from .recommender_dataset import RecommenderDatasets -logging.getLogger("root").setLevel(logging.WARNING) +logging.getLogger("report_creator").setLevel(logging.WARNING) class SVDOperatorModel(RecommenderOperatorBaseModel): diff --git a/pyproject.toml b/pyproject.toml index 4f3fe0e17..24e9884f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,13 +171,13 @@ forecast = [ "statsmodels", "plotly", "oracledb", - "report-creator==1.0.27", + "report-creator==1.0.28", ] anomaly = [ "oracle_ads[opctl]", "autots", "oracledb", - "report-creator==1.0.27", + "report-creator==1.0.28", "rrcf==0.4.4", "scikit-learn", "salesforce-merlion[all]==2.0.4" @@ -186,7 +186,7 @@ recommender = [ "oracle_ads[opctl]", "scikit-surprise", "plotly", - "report-creator==1.0.27", + "report-creator==1.0.28", ] feature-store-marketplace = [ "oracle-ads[opctl]", @@ -202,7 +202,7 @@ pii = [ "scrubadub_spacy", "spacy-transformers==1.2.5", "spacy==3.6.1", - "report-creator==1.0.27", + "report-creator==1.0.28", ] llm = ["langchain>=0.2", "langchain-community", "langchain_openai", "pydantic>=2,<3", "evaluate>=0.4.0"] aqua = ["jupyter_server"] From 171a68981af7b2ad02f408f2ae9204753b59bcc5 Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 12 Nov 2024 19:21:25 +0000 Subject: [PATCH 16/29] treat model_desc as rc.Text --- ads/opctl/operator/lowcode/anomaly/model/base_model.py | 5 ++--- ads/opctl/operator/lowcode/forecast/model/base_model.py | 3 ++- ads/opctl/operator/lowcode/forecast/model/prophet.py | 2 +- ads/opctl/operator/lowcode/recommender/model/base_model.py | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py index c9ca984be..5ee1ca36f 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/base_model.py +++ b/ads/opctl/operator/lowcode/anomaly/model/base_model.py @@ -166,9 +166,8 @@ def generate_report(self): yaml_appendix = rc.Yaml(self.config.to_dict()) summary = rc.Block( rc.Group( - rc.Text( - f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n" - ), + rc.Text(f"You selected the **`{self.spec.model}`** model.\n"), + model_description, rc.Text( "Based on your dataset, you could have also selected " f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`." diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py index 357426a79..ba933ecc7 100644 --- a/ads/opctl/operator/lowcode/forecast/model/base_model.py +++ b/ads/opctl/operator/lowcode/forecast/model/base_model.py @@ -148,8 +148,9 @@ def generate_report(self): header_section = rc.Block( rc.Heading("Forecast Report", level=1), rc.Text( - f"You selected the {self.spec.model} model.\n{model_description}\nBased on your dataset, you could have also selected any of the models: {SupportedModels.keys()}." + f"You selected the {self.spec.model} model.\nBased on your dataset, you could have also selected any of the models: {SupportedModels.keys()}." ), + model_description, rc.Group( rc.Metric( heading="Analysis was completed in ", diff --git a/ads/opctl/operator/lowcode/forecast/model/prophet.py b/ads/opctl/operator/lowcode/forecast/model/prophet.py index 24121b531..aa9033b98 100644 --- a/ads/opctl/operator/lowcode/forecast/model/prophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/prophet.py @@ -354,7 +354,7 @@ def _generate_report(self): logger.warn(f"Failed to generate Explanations with error: {e}.") logger.debug(f"Full Traceback: {traceback.format_exc()}") - model_description = ( + model_description = rc.Text( "Prophet is a procedure for forecasting time series data based on an additive " "model where non-linear trends are fit with yearly, weekly, and daily seasonality, " "plus holiday effects. It works best with time series that have strong seasonal " diff --git a/ads/opctl/operator/lowcode/recommender/model/base_model.py b/ads/opctl/operator/lowcode/recommender/model/base_model.py index bd4ab9f3c..7f677a037 100644 --- a/ads/opctl/operator/lowcode/recommender/model/base_model.py +++ b/ads/opctl/operator/lowcode/recommender/model/base_model.py @@ -61,8 +61,9 @@ def generate_report(self): header_section = rc.Block( rc.Heading("Recommender Report", level=1), rc.Text( - f"The recommendations was generated using {SupportedModels.SVD.upper()}. {model_description}" + f"The recommendations was generated using {SupportedModels.SVD.upper()}." ), + model_description, rc.Group( rc.Metric( heading="Recommendations was generated in ", From 177a71ce3570cfd8556f7d1b4f4e4638407bfd53 Mon Sep 17 00:00:00 2001 From: Allen Date: Wed, 13 Nov 2024 10:24:53 +0000 Subject: [PATCH 17/29] typo in reccommender --- ads/opctl/operator/lowcode/recommender/model/svd.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ads/opctl/operator/lowcode/recommender/model/svd.py b/ads/opctl/operator/lowcode/recommender/model/svd.py index 7f86f4dfa..a14c5a745 100644 --- a/ads/opctl/operator/lowcode/recommender/model/svd.py +++ b/ads/opctl/operator/lowcode/recommender/model/svd.py @@ -78,11 +78,11 @@ def _build_model(self) -> Tuple[DataFrame, Dict]: return recommendations_df, metric def _generate_report(self): - model_description = """ - Singular Value Decomposition (SVD) is a matrix factorization technique used in recommendation systems to - decompose a user-item interaction matrix into three constituent matrices. These matrices capture the - latent factors that explain the observed interactions. - """ + model_description = rc.Text( + "Singular Value Decomposition (SVD) is a matrix factorization technique used in recommendation systems to \ + decompose a user-item interaction matrix into three constituent matrices. These matrices capture the \ + latent factors that explain the observed interactions." + ) new_user_recommendations = self._get_recommendations( "__new_user__", self.spec.top_k ) From fc1a992830e92bc589df03e7b0b6961198b8b176 Mon Sep 17 00:00:00 2001 From: Allen Date: Wed, 13 Nov 2024 16:33:32 +0000 Subject: [PATCH 18/29] adding more example datasets --- ads/opctl/operator/common/utils.py | 9 +++--- .../operator/lowcode/anomaly/model/factory.py | 4 +-- .../lowcode/forecast/model/factory.py | 5 ++-- .../operator/lowcode/pii/model/factory.py | 9 +++--- .../lowcode/recommender/model/factory.py | 10 +++---- tests/operators/data/retail_forecast.yaml | 16 +++++++++++ .../operators/data/timeseries/retail_add.csv | 28 +++++++++++++++++++ .../operators/data/timeseries/retail_prim.csv | 25 +++++++++++++++++ .../operators/data/timeseries/retail_test.csv | 4 +++ 9 files changed, 92 insertions(+), 18 deletions(-) create mode 100644 tests/operators/data/retail_forecast.yaml create mode 100644 tests/operators/data/timeseries/retail_add.csv create mode 100644 tests/operators/data/timeseries/retail_prim.csv create mode 100644 tests/operators/data/timeseries/retail_test.csv diff --git a/ads/opctl/operator/common/utils.py b/ads/opctl/operator/common/utils.py index 7db91a221..47808edd0 100644 --- a/ads/opctl/operator/common/utils.py +++ b/ads/opctl/operator/common/utils.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ @@ -18,7 +17,6 @@ from cerberus import Validator from ads.opctl import logger, utils -from ads.opctl.operator import __operators__ CONTAINER_NETWORK = "CONTAINER_NETWORK" @@ -26,7 +24,10 @@ class OperatorValidator(Validator): """The custom validator class.""" - pass + def validate(self, obj_dict, **kwargs): + # Model should be case insensitive + obj_dict["spec"]["model"] = str(obj_dict["spec"]["model"]).lower() + return super().validate(obj_dict, **kwargs) def create_output_folder(name): @@ -34,7 +35,7 @@ def create_output_folder(name): protocol = fsspec.utils.get_protocol(output_folder) storage_options = {} if protocol != "file": - storage_options = auth or default_signer() + storage_options = default_signer() fs = fsspec.filesystem(protocol, **storage_options) name_suffix = 1 diff --git a/ads/opctl/operator/lowcode/anomaly/model/factory.py b/ads/opctl/operator/lowcode/anomaly/model/factory.py index 10df5733c..4bb59d02b 100644 --- a/ads/opctl/operator/lowcode/anomaly/model/factory.py +++ b/ads/opctl/operator/lowcode/anomaly/model/factory.py @@ -26,9 +26,9 @@ class UnSupportedModelError(Exception): def __init__(self, operator_config: AnomalyOperatorConfig, model_type: str): supported_models = ( - SupportedModels.values + SupportedModels.values() if operator_config.spec.datetime_column - else NonTimeADSupportedModels.values + else NonTimeADSupportedModels.values() ) message = ( f"Model: `{model_type}` is not supported. " diff --git a/ads/opctl/operator/lowcode/forecast/model/factory.py b/ads/opctl/operator/lowcode/forecast/model/factory.py index eb7f0bee3..446709a0d 100644 --- a/ads/opctl/operator/lowcode/forecast/model/factory.py +++ b/ads/opctl/operator/lowcode/forecast/model/factory.py @@ -11,6 +11,7 @@ from .autots import AutoTSOperatorModel from .base_model import ForecastOperatorBaseModel from .forecast_datasets import ForecastDatasets +from .ml_forecast import MLForecastOperatorModel from .neuralprophet import NeuralProphetOperatorModel from .prophet import ProphetOperatorModel @@ -19,7 +20,7 @@ class UnSupportedModelError(Exception): def __init__(self, model_type: str): super().__init__( f"Model: `{model_type}` " - f"is not supported. Supported models: {SupportedModels.values}" + f"is not supported. Supported models: {SupportedModels.values()}" ) @@ -32,7 +33,7 @@ class ForecastOperatorModelFactory: SupportedModels.Prophet: ProphetOperatorModel, SupportedModels.Arima: ArimaOperatorModel, SupportedModels.NeuralProphet: NeuralProphetOperatorModel, - # SupportedModels.LGBForecast: MLForecastOperatorModel, + SupportedModels.LGBForecast: MLForecastOperatorModel, SupportedModels.AutoMLX: AutoMLXOperatorModel, SupportedModels.AutoTS: AutoTSOperatorModel, } diff --git a/ads/opctl/operator/lowcode/pii/model/factory.py b/ads/opctl/operator/lowcode/pii/model/factory.py index 102204ea3..c95bce33a 100644 --- a/ads/opctl/operator/lowcode/pii/model/factory.py +++ b/ads/opctl/operator/lowcode/pii/model/factory.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2023 Oracle and/or its affiliates. +# Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import uuid @@ -18,7 +17,7 @@ class UnSupportedDetectorError(Exception): def __init__(self, dtype: str): super().__init__( f"Detector: `{dtype}` " - f"is not supported. Supported models: {SupportedDetector.values}" + f"is not supported. Supported models: {SupportedDetector.values()}" ) @@ -42,7 +41,9 @@ class SpacyDetector(PiiBaseDetector): @runtime_dependency(module="scrubadub", install_from=OptionalDependency.PII) @runtime_dependency(module="scrubadub_spacy", install_from=OptionalDependency.PII) def construct(cls, entity, model, **kwargs): - spacy_entity_detector = scrubadub_spacy.detectors.spacy.SpacyEntityDetector( + import scrubadub + from scrubadub_spacy.detectors.spacy import SpacyEntityDetector + spacy_entity_detector = SpacyEntityDetector( named_entities=[entity], name=f"spacy_{uuid.uuid4()}", model=model, diff --git a/ads/opctl/operator/lowcode/recommender/model/factory.py b/ads/opctl/operator/lowcode/recommender/model/factory.py index c6284f36e..149d4565e 100644 --- a/ads/opctl/operator/lowcode/recommender/model/factory.py +++ b/ads/opctl/operator/lowcode/recommender/model/factory.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2023 Oracle and/or its affiliates. +# Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ from ..constant import SupportedModels @@ -10,11 +9,12 @@ from .recommender_dataset import RecommenderDatasets from .svd import SVDOperatorModel + class UnSupportedModelError(Exception): def __init__(self, model_type: str): super().__init__( f"Model: `{model_type}` " - f"is not supported. Supported models: {SupportedModels.values}" + f"is not supported. Supported models: {SupportedModels.values()}" ) @@ -23,9 +23,7 @@ class RecommenderOperatorModelFactory: The factory class helps to instantiate proper model operator based on the model type. """ - _MAP = { - SupportedModels.SVD: SVDOperatorModel - } + _MAP = {SupportedModels.SVD: SVDOperatorModel} @classmethod def get_model( diff --git a/tests/operators/data/retail_forecast.yaml b/tests/operators/data/retail_forecast.yaml new file mode 100644 index 000000000..1b104570f --- /dev/null +++ b/tests/operators/data/retail_forecast.yaml @@ -0,0 +1,16 @@ +kind: operator +type: forecast +version: v1 +spec: + datetime_column: + name: Date + historical_data: + url: timeseries/retail_prim.csv + additional_data: + url: timeseries/retail_add.csv + test_data: + url: timeseries/retail_test.csv + horizon: 3 + model: LGBForecast + target_column: Sales + # generate_explanations: True diff --git a/tests/operators/data/timeseries/retail_add.csv b/tests/operators/data/timeseries/retail_add.csv new file mode 100644 index 000000000..5c8bf676e --- /dev/null +++ b/tests/operators/data/timeseries/retail_add.csv @@ -0,0 +1,28 @@ +Date,Ad Spend,Discount Rate,Foot Traffic +2023-01-01,100.0,0.0,1000.0 +2023-02-01,100.0,0.0,1100.0 +2023-03-01,100.0,0.0,1300.0 +2023-04-01,100.0,0.0,1400.0 +2023-05-01,100.0,0.0,1500.0 +2023-06-01,100.0,0.0,1600.0 +2023-07-01,100.0,0.0,1700.0 +2023-08-01,100.0,0.0,1800.0 +2023-09-01,0.0,0.0,1900.0 +2023-10-01,0.0,0.0,2000.0 +2023-11-01,0.0,0.0,2000.0 +2023-12-01,0.0,0.0,2100.0 +2024-01-01,0.0,0.0,2200.0 +2024-02-01,0.0,0.0,2300.0 +2024-03-01,0.0,0.0,2400.0 +2024-04-01,0.0,0.1,2500.0 +2024-05-01,0.0,0.1,2600.0 +2024-06-01,0.0,0.1,2700.0 +2024-07-01,0.0,0.1,2800.0 +2024-08-01,0.0,0.1,2900.0 +2024-09-01,0.0,0.1,3000.0 +2024-10-01,0.0,0.1,3100.0 +2024-11-01,0.0,0.1,3200.0 +2024-12-01,0.0,0.1,3300.0 +2025-01-01,100.0,0.1,3400.0 +2025-02-01,100.0,0.1,3500.0 +2025-03-01,100.0,0.1,3700.0 diff --git a/tests/operators/data/timeseries/retail_prim.csv b/tests/operators/data/timeseries/retail_prim.csv new file mode 100644 index 000000000..5e6b7a2b4 --- /dev/null +++ b/tests/operators/data/timeseries/retail_prim.csv @@ -0,0 +1,25 @@ +Date,Sales +2023-01-01,672.9013417961711 +2023-02-01,725.386000114524 +2023-03-01,787.6404076885843 +2023-04-01,838.6794060206407 +2023-05-01,896.0618494560191 +2023-06-01,944.0145584340703 +2023-07-01,998.8245931828164 +2023-08-01,1042.7815773098787 +2023-09-01,990.7393502418831 +2023-10-01,1050.4189067877965 +2023-11-01,1087.699685678044 +2023-12-01,1158.7867475664566 +2024-01-01,1214.0293852069105 +2024-02-01,1260.0462629542862 +2024-03-01,1304.5403104618904 +2024-04-01,1573.163162016336 +2024-05-01,1611.2355265665328 +2024-06-01,1680.5619939753535 +2024-07-01,1739.2582412202794 +2024-08-01,1801.9466748533303 +2024-09-01,1885.3736882778257 +2024-10-01,1921.6889033445455 +2024-11-01,1987.2824623188812 +2024-12-01,2045.8385281527853 diff --git a/tests/operators/data/timeseries/retail_test.csv b/tests/operators/data/timeseries/retail_test.csv new file mode 100644 index 000000000..0c85df24b --- /dev/null +++ b/tests/operators/data/timeseries/retail_test.csv @@ -0,0 +1,4 @@ +Date,Sales +2025-01-01,2244.70244431561 +2025-02-01,2311.4765510687644 +2025-03-01,2390.2794816577234 From b5973e41cd4f7a9268e659fb1fcd7c6807dcd65f Mon Sep 17 00:00:00 2001 From: Allen Date: Wed, 13 Nov 2024 16:36:50 +0000 Subject: [PATCH 19/29] restric import --- ads/opctl/operator/lowcode/pii/model/factory.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ads/opctl/operator/lowcode/pii/model/factory.py b/ads/opctl/operator/lowcode/pii/model/factory.py index c95bce33a..b570622c4 100644 --- a/ads/opctl/operator/lowcode/pii/model/factory.py +++ b/ads/opctl/operator/lowcode/pii/model/factory.py @@ -41,8 +41,9 @@ class SpacyDetector(PiiBaseDetector): @runtime_dependency(module="scrubadub", install_from=OptionalDependency.PII) @runtime_dependency(module="scrubadub_spacy", install_from=OptionalDependency.PII) def construct(cls, entity, model, **kwargs): - import scrubadub + from scrubadub.filth import Filth from scrubadub_spacy.detectors.spacy import SpacyEntityDetector + spacy_entity_detector = SpacyEntityDetector( named_entities=[entity], name=f"spacy_{uuid.uuid4()}", @@ -51,7 +52,7 @@ def construct(cls, entity, model, **kwargs): if entity.upper() not in cls.DEFAULT_SPACY_NAMED_ENTITIES: filth_cls = type( construct_filth_cls_name(entity), - (scrubadub.filth.Filth,), + (Filth,), {"type": entity.upper()}, ) spacy_entity_detector.filth_cls_map[entity.upper()] = filth_cls From c86bc36289d1a6525b2ffec6ac674293ef96c462 Mon Sep 17 00:00:00 2001 From: Allen Date: Thu, 14 Nov 2024 12:41:18 +0000 Subject: [PATCH 20/29] check for model arg --- ads/opctl/operator/common/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ads/opctl/operator/common/utils.py b/ads/opctl/operator/common/utils.py index 47808edd0..969057b6b 100644 --- a/ads/opctl/operator/common/utils.py +++ b/ads/opctl/operator/common/utils.py @@ -26,7 +26,8 @@ class OperatorValidator(Validator): def validate(self, obj_dict, **kwargs): # Model should be case insensitive - obj_dict["spec"]["model"] = str(obj_dict["spec"]["model"]).lower() + if "model" in obj_dict["spec"]: + obj_dict["spec"]["model"] = str(obj_dict["spec"]["model"]).lower() return super().validate(obj_dict, **kwargs) From 520870166fd592ee76cb8545b8c52beaa69762ec Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 19 Nov 2024 14:45:33 +0000 Subject: [PATCH 21/29] LightGBM requires re-formatting column names --- .../lowcode/common/transformations.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/ads/opctl/operator/lowcode/common/transformations.py b/ads/opctl/operator/lowcode/common/transformations.py index 672d11777..f5261374d 100644 --- a/ads/opctl/operator/lowcode/common/transformations.py +++ b/ads/opctl/operator/lowcode/common/transformations.py @@ -1,18 +1,20 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- -# Copyright (c) 2023 Oracle and/or its affiliates. +# Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +from abc import ABC + +import pandas as pd +import re + from ads.opctl import logger +from ads.opctl.operator.lowcode.common.const import DataColumns from ads.opctl.operator.lowcode.common.errors import ( - InvalidParameterError, DataMismatchError, + InvalidParameterError, ) -from ads.opctl.operator.lowcode.common.const import DataColumns from ads.opctl.operator.lowcode.common.utils import merge_category_columns -import pandas as pd -from abc import ABC class Transformations(ABC): @@ -58,6 +60,7 @@ def run(self, data): """ clean_df = self._remove_trailing_whitespace(data) + clean_df = self._normalize_column_names(clean_df) if self.name == "historical_data": self._check_historical_dataset(clean_df) clean_df = self._set_series_id_column(clean_df) @@ -95,8 +98,11 @@ def run(self, data): def _remove_trailing_whitespace(self, df): return df.apply(lambda x: x.str.strip() if x.dtype == "object" else x) + def _normalize_column_names(self, df): + return df.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x)) + def _set_series_id_column(self, df): - self._target_category_columns_map = dict() + self._target_category_columns_map = {} if not self.target_category_columns: df[DataColumns.Series] = "Series 1" self.has_artificial_series = True @@ -125,10 +131,10 @@ def _format_datetime_col(self, df): df[self.dt_column_name] = pd.to_datetime( df[self.dt_column_name], format=self.dt_column_format ) - except: + except Exception as ee: raise InvalidParameterError( f"Unable to determine the datetime type for column: {self.dt_column_name} in dataset: {self.name}. Please specify the format explicitly. (For example adding 'format: %d/%m/%Y' underneath 'name: {self.dt_column_name}' in the datetime_column section of the yaml file if you haven't already. For reference, here is the first datetime given: {df[self.dt_column_name].values[0]}" - ) + ) from ee return df def _set_multi_index(self, df): @@ -242,7 +248,6 @@ def _check_historical_dataset(self, df): "Class": "A", "Num": 2 }, - } """ From 3dafe3113c6212e2d463bb15956c73314f4dde12 Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 19 Nov 2024 14:45:53 +0000 Subject: [PATCH 22/29] LightGBM requires re-formatting column names --- ads/opctl/operator/lowcode/common/transformations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ads/opctl/operator/lowcode/common/transformations.py b/ads/opctl/operator/lowcode/common/transformations.py index f5261374d..936370c02 100644 --- a/ads/opctl/operator/lowcode/common/transformations.py +++ b/ads/opctl/operator/lowcode/common/transformations.py @@ -3,10 +3,10 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import re from abc import ABC import pandas as pd -import re from ads.opctl import logger from ads.opctl.operator.lowcode.common.const import DataColumns From e1c59d2a972f469708e5cff4b8ac3416edd5b30f Mon Sep 17 00:00:00 2001 From: Allen Date: Wed, 20 Nov 2024 18:38:49 +0000 Subject: [PATCH 23/29] remove ray when low cpu count --- ads/opctl/operator/lowcode/forecast/model/automlx.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ads/opctl/operator/lowcode/forecast/model/automlx.py b/ads/opctl/operator/lowcode/forecast/model/automlx.py index d2fc61778..d21bb9c3f 100644 --- a/ads/opctl/operator/lowcode/forecast/model/automlx.py +++ b/ads/opctl/operator/lowcode/forecast/model/automlx.py @@ -2,6 +2,7 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import logging +import os import traceback import numpy as np @@ -80,10 +81,17 @@ def _build_model(self) -> pd.DataFrame: from automlx import Pipeline, init + cpu_count = os.cpu_count() try: + if cpu_count < 4: + engine = "local" + engine_opts = None + else: + engine = "ray" + engine_opts = ({"ray_setup": {"_temp_dir": "/tmp/ray-temp"}},) init( - engine="ray", - engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, + engine=engine, + engine_opts=engine_opts, loglevel=logging.CRITICAL, ) except Exception as e: From a9fd3b8921526afbcc9c17b5d349c754a3a89379 Mon Sep 17 00:00:00 2001 From: Allen Date: Thu, 21 Nov 2024 13:36:49 +0000 Subject: [PATCH 24/29] whitespace not supported by lightgbm --- ads/opctl/operator/lowcode/common/transformations.py | 2 +- ads/opctl/operator/lowcode/forecast/model/base_model.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ads/opctl/operator/lowcode/common/transformations.py b/ads/opctl/operator/lowcode/common/transformations.py index 936370c02..26b8bf351 100644 --- a/ads/opctl/operator/lowcode/common/transformations.py +++ b/ads/opctl/operator/lowcode/common/transformations.py @@ -104,7 +104,7 @@ def _normalize_column_names(self, df): def _set_series_id_column(self, df): self._target_category_columns_map = {} if not self.target_category_columns: - df[DataColumns.Series] = "Series 1" + df[DataColumns.Series] = "Series_1" self.has_artificial_series = True else: df[DataColumns.Series] = merge_category_columns( diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py index ba933ecc7..f6313bdd0 100644 --- a/ads/opctl/operator/lowcode/forecast/model/base_model.py +++ b/ads/opctl/operator/lowcode/forecast/model/base_model.py @@ -481,12 +481,12 @@ def _save_report( metrics_col_name = ( self.original_target_column if self.datasets.has_artificial_series() - else "Series 1" + else "Series_1" ) if metrics_df is not None: write_data( data=metrics_df.reset_index().rename( - {"index": "metrics", "Series 1": metrics_col_name}, axis=1 + {"index": "metrics", "Series_1": metrics_col_name}, axis=1 ), filename=os.path.join( unique_output_dir, self.spec.metrics_filename @@ -505,7 +505,7 @@ def _save_report( if test_metrics_df is not None: write_data( data=test_metrics_df.reset_index().rename( - {"index": "metrics", "Series 1": metrics_col_name}, axis=1 + {"index": "metrics", "Series_1": metrics_col_name}, axis=1 ), filename=os.path.join( unique_output_dir, self.spec.test_metrics_filename From 42ce1a51be517d2a3f8c8ef3b5d5c0b7fa895763 Mon Sep 17 00:00:00 2001 From: Allen Date: Thu, 21 Nov 2024 16:25:18 +0000 Subject: [PATCH 25/29] revert change --- ads/opctl/operator/lowcode/common/transformations.py | 2 +- ads/opctl/operator/lowcode/forecast/model/base_model.py | 6 +++--- tests/operators/anomaly/test_anomaly_simple.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ads/opctl/operator/lowcode/common/transformations.py b/ads/opctl/operator/lowcode/common/transformations.py index 26b8bf351..936370c02 100644 --- a/ads/opctl/operator/lowcode/common/transformations.py +++ b/ads/opctl/operator/lowcode/common/transformations.py @@ -104,7 +104,7 @@ def _normalize_column_names(self, df): def _set_series_id_column(self, df): self._target_category_columns_map = {} if not self.target_category_columns: - df[DataColumns.Series] = "Series_1" + df[DataColumns.Series] = "Series 1" self.has_artificial_series = True else: df[DataColumns.Series] = merge_category_columns( diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py index f6313bdd0..ba933ecc7 100644 --- a/ads/opctl/operator/lowcode/forecast/model/base_model.py +++ b/ads/opctl/operator/lowcode/forecast/model/base_model.py @@ -481,12 +481,12 @@ def _save_report( metrics_col_name = ( self.original_target_column if self.datasets.has_artificial_series() - else "Series_1" + else "Series 1" ) if metrics_df is not None: write_data( data=metrics_df.reset_index().rename( - {"index": "metrics", "Series_1": metrics_col_name}, axis=1 + {"index": "metrics", "Series 1": metrics_col_name}, axis=1 ), filename=os.path.join( unique_output_dir, self.spec.metrics_filename @@ -505,7 +505,7 @@ def _save_report( if test_metrics_df is not None: write_data( data=test_metrics_df.reset_index().rename( - {"index": "metrics", "Series_1": metrics_col_name}, axis=1 + {"index": "metrics", "Series 1": metrics_col_name}, axis=1 ), filename=os.path.join( unique_output_dir, self.spec.test_metrics_filename diff --git a/tests/operators/anomaly/test_anomaly_simple.py b/tests/operators/anomaly/test_anomaly_simple.py index 658d292a5..94b04d70f 100644 --- a/tests/operators/anomaly/test_anomaly_simple.py +++ b/tests/operators/anomaly/test_anomaly_simple.py @@ -100,7 +100,7 @@ def test_artificial_big(model): all_data = [] TARGET_COLUMN = "sensor" - TARGET_CATEGORY_COLUMN = "Meter ID" + TARGET_CATEGORY_COLUMN = "Meter_ID" DATETIME_COLUMN = "Date" yr_in_30_min = pd.date_range( "2014-01-15 00:00:00", "2015-01-15 00:00:00", freq="30min" From e6f6a238b027df5be5831b57899e92b2c15e1e18 Mon Sep 17 00:00:00 2001 From: Allen Date: Thu, 21 Nov 2024 18:33:38 +0000 Subject: [PATCH 26/29] remove lightgbm re-name until later --- ads/opctl/operator/lowcode/common/transformations.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ads/opctl/operator/lowcode/common/transformations.py b/ads/opctl/operator/lowcode/common/transformations.py index 936370c02..3f4796d4c 100644 --- a/ads/opctl/operator/lowcode/common/transformations.py +++ b/ads/opctl/operator/lowcode/common/transformations.py @@ -3,7 +3,6 @@ # Copyright (c) 2023, 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import re from abc import ABC import pandas as pd @@ -98,8 +97,8 @@ def run(self, data): def _remove_trailing_whitespace(self, df): return df.apply(lambda x: x.str.strip() if x.dtype == "object" else x) - def _normalize_column_names(self, df): - return df.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x)) + # def _normalize_column_names(self, df): + # return df.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x)) def _set_series_id_column(self, df): self._target_category_columns_map = {} From 5c2e161dbdd5272990ad0b71948e2bc0858020c0 Mon Sep 17 00:00:00 2001 From: Allen Date: Thu, 21 Nov 2024 19:17:37 +0000 Subject: [PATCH 27/29] remove lightgbm re-name until later --- ads/opctl/operator/lowcode/common/transformations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ads/opctl/operator/lowcode/common/transformations.py b/ads/opctl/operator/lowcode/common/transformations.py index 3f4796d4c..77272c6b2 100644 --- a/ads/opctl/operator/lowcode/common/transformations.py +++ b/ads/opctl/operator/lowcode/common/transformations.py @@ -59,7 +59,7 @@ def run(self, data): """ clean_df = self._remove_trailing_whitespace(data) - clean_df = self._normalize_column_names(clean_df) + # clean_df = self._normalize_column_names(clean_df) if self.name == "historical_data": self._check_historical_dataset(clean_df) clean_df = self._set_series_id_column(clean_df) From 8ec68bf8f34aec7843f14bc6a4997cc179edc190 Mon Sep 17 00:00:00 2001 From: Allen Date: Fri, 22 Nov 2024 09:06:07 +0000 Subject: [PATCH 28/29] fix docs examples --- .../forecast_operator/multivariate.rst | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/source/user_guide/operators/forecast_operator/multivariate.rst b/docs/source/user_guide/operators/forecast_operator/multivariate.rst index 9cdf07014..b0b8a79c8 100644 --- a/docs/source/user_guide/operators/forecast_operator/multivariate.rst +++ b/docs/source/user_guide/operators/forecast_operator/multivariate.rst @@ -15,24 +15,24 @@ If the historical data includes a ``target_category_column``, it should also be For example, if the historical data is: -==== ========= - Qtr Revenue -==== ========= - Q1 1200 - Q2 1300 - Q3 1500 -==== ========= +=========== ========= + Month Revenue +=========== ========= + 01-01-2024 1200 + 01-02-2024 1300 + 01-03-2024 1500 +=========== ========= Then the additional data (with a horizon of 1) should be formatted as: -==== ======== ======== ============== - Qtr COGS Discount SP500 Futures -==== ======== ======== ============== - Q1 100 0 1.02 - Q2 100 0.1 1.03 - Q3 105 0 1.04 - Q4 105 0.1 1.01 -==== ======== ======== ============== +=========== ======== ======== ============== + Month COGS Discount SP500 Futures +=========== ======== ======== ============== + 01-01-2024 100 0 1.02 + 01-02-2024 100 0.1 1.03 + 01-03-2024 105 0 1.04 + 01-04-2024 105 0.1 1.01 +=========== ======== ======== ============== Note that the additional data does not include the target column (Revenue), but it does include the datetime column (Qtr). You would include this additional data in the YAML file as follows: @@ -43,7 +43,7 @@ Note that the additional data does not include the target column (Revenue), but version: v1 spec: datetime_column: - name: Qtr + name: Month historical_data: url: historical_data.csv additional_data: @@ -61,7 +61,7 @@ You can experiment by removing columns and observing how the results change. Bel version: v1 spec: datetime_column: - name: Qtr + name: Month historical_data: url: historical_data.csv additional_data: From edba0c313ba26ca8480c667707d10cc79b7641c4 Mon Sep 17 00:00:00 2001 From: Allen Date: Tue, 26 Nov 2024 14:05:51 +0000 Subject: [PATCH 29/29] enable parquet --- ads/opctl/operator/lowcode/common/utils.py | 74 +++++++++---------- .../lowcode/forecast/model/prophet.py | 3 + .../operator/lowcode/forecast/schema.yaml | 2 +- 3 files changed, 41 insertions(+), 38 deletions(-) diff --git a/ads/opctl/operator/lowcode/common/utils.py b/ads/opctl/operator/lowcode/common/utils.py index 1c9ede754..38ee9cd0b 100644 --- a/ads/opctl/operator/lowcode/common/utils.py +++ b/ads/opctl/operator/lowcode/common/utils.py @@ -1,42 +1,32 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*-- # Copyright (c) 2024 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import argparse import logging import os import shutil import sys import tempfile -import time -from string import Template -from typing import Any, Dict, List, Tuple -import pandas as pd -from ads.opctl import logger -import oracledb +from typing import List, Union import fsspec -import yaml -from typing import Union +import oracledb +import pandas as pd +from ads.common.object_storage_details import ObjectStorageDetails from ads.opctl import logger +from ads.opctl.operator.common.operator_config import OutputDirectory from ads.opctl.operator.lowcode.common.errors import ( - InputDataError, InvalidParameterError, - PermissionsError, - DataMismatchError, ) -from ads.opctl.operator.common.operator_config import OutputDirectory -from ads.common.object_storage_details import ObjectStorageDetails from ads.secrets import ADBSecretKeeper def call_pandas_fsspec(pd_fn, filename, storage_options, **kwargs): - if fsspec.utils.get_protocol(filename) == "file": - return pd_fn(filename, **kwargs) - elif fsspec.utils.get_protocol(filename) in ["http", "https"]: + if fsspec.utils.get_protocol(filename) == "file" or fsspec.utils.get_protocol( + filename + ) in ["http", "https"]: return pd_fn(filename, **kwargs) storage_options = storage_options or ( @@ -48,7 +38,7 @@ def call_pandas_fsspec(pd_fn, filename, storage_options, **kwargs): def load_data(data_spec, storage_options=None, **kwargs): if data_spec is None: - raise InvalidParameterError(f"No details provided for this data source.") + raise InvalidParameterError("No details provided for this data source.") filename = data_spec.url format = data_spec.format columns = data_spec.columns @@ -67,7 +57,7 @@ def load_data(data_spec, storage_options=None, **kwargs): if not format: _, format = os.path.splitext(filename) format = format[1:] - if format in ["json", "clipboard", "excel", "csv", "feather", "hdf"]: + if format in ["json", "clipboard", "excel", "csv", "feather", "hdf", "parquet"]: read_fn = getattr(pd, f"read_{format}") data = call_pandas_fsspec( read_fn, filename, storage_options=storage_options @@ -84,19 +74,31 @@ def load_data(data_spec, storage_options=None, **kwargs): with tempfile.TemporaryDirectory() as temp_dir: if vault_secret_id is not None: try: - with ADBSecretKeeper.load_secret(vault_secret_id, wallet_dir=temp_dir) as adwsecret: - if 'wallet_location' in adwsecret and 'wallet_location' not in connect_args: - shutil.unpack_archive(adwsecret["wallet_location"], temp_dir) - connect_args['wallet_location'] = temp_dir - if 'user_name' in adwsecret and 'user' not in connect_args: - connect_args['user'] = adwsecret['user_name'] - if 'password' in adwsecret and 'password' not in connect_args: - connect_args['password'] = adwsecret['password'] - if 'service_name' in adwsecret and 'service_name' not in connect_args: - connect_args['service_name'] = adwsecret['service_name'] + with ADBSecretKeeper.load_secret( + vault_secret_id, wallet_dir=temp_dir + ) as adwsecret: + if ( + "wallet_location" in adwsecret + and "wallet_location" not in connect_args + ): + shutil.unpack_archive( + adwsecret["wallet_location"], temp_dir + ) + connect_args["wallet_location"] = temp_dir + if "user_name" in adwsecret and "user" not in connect_args: + connect_args["user"] = adwsecret["user_name"] + if "password" in adwsecret and "password" not in connect_args: + connect_args["password"] = adwsecret["password"] + if ( + "service_name" in adwsecret + and "service_name" not in connect_args + ): + connect_args["service_name"] = adwsecret["service_name"] except Exception as e: - raise Exception(f"Could not retrieve database credentials from vault {vault_secret_id}: {e}") + raise Exception( + f"Could not retrieve database credentials from vault {vault_secret_id}: {e}" + ) con = oracledb.connect(**connect_args) if table_name is not None: @@ -105,11 +107,11 @@ def load_data(data_spec, storage_options=None, **kwargs): data = pd.read_sql(sql, con) else: raise InvalidParameterError( - f"Database `connect_args` provided without sql query or table name. Please specify either `sql` or `table_name`." + "Database `connect_args` provided without sql query or table name. Please specify either `sql` or `table_name`." ) else: raise InvalidParameterError( - f"No filename/url provided, and no connect_args provided. Please specify one of these if you want to read data from a file or a database respectively." + "No filename/url provided, and no connect_args provided. Please specify one of these if you want to read data from a file or a database respectively." ) if columns: # keep only these columns, done after load because only CSV supports stream filtering @@ -232,7 +234,7 @@ def human_time_friendly(seconds): accumulator.append( "{} {}{}".format(int(amount), unit, "" if amount == 1 else "s") ) - accumulator.append("{} secs".format(round(seconds, 2))) + accumulator.append(f"{round(seconds, 2)} secs") return ", ".join(accumulator) @@ -248,9 +250,7 @@ def find_output_dirname(output_dir: OutputDirectory): unique_output_dir = f"{output_dir}_{counter}" counter += 1 logger.warn( - "Since the output directory was not specified, the output will be saved to {} directory.".format( - unique_output_dir - ) + f"Since the output directory was not specified, the output will be saved to {unique_output_dir} directory." ) return unique_output_dir diff --git a/ads/opctl/operator/lowcode/forecast/model/prophet.py b/ads/opctl/operator/lowcode/forecast/model/prophet.py index aa9033b98..c72a9fd1f 100644 --- a/ads/opctl/operator/lowcode/forecast/model/prophet.py +++ b/ads/opctl/operator/lowcode/forecast/model/prophet.py @@ -142,6 +142,9 @@ def _build_model(self) -> pd.DataFrame: dt_column=self.spec.datetime_column.name, ) + # if os.environ["OCI__IS_SPARK"]: + # pass + # else: Parallel(n_jobs=-1, require="sharedmem")( delayed(ProphetOperatorModel._train_model)( self, i, series_id, df, model_kwargs.copy() diff --git a/ads/opctl/operator/lowcode/forecast/schema.yaml b/ads/opctl/operator/lowcode/forecast/schema.yaml index 0b542cd41..e0c722ae4 100644 --- a/ads/opctl/operator/lowcode/forecast/schema.yaml +++ b/ads/opctl/operator/lowcode/forecast/schema.yaml @@ -311,7 +311,7 @@ spec: missing_value_imputation: type: boolean required: false - default: false + default: true outlier_treatment: type: boolean required: false