Skip to content

Commit

Permalink
update utilsforecast (#281)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmoralez authored Dec 7, 2023
1 parent 6f41765 commit a8cb278
Show file tree
Hide file tree
Showing 10 changed files with 225 additions and 399 deletions.
1 change: 0 additions & 1 deletion mlforecast/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
'mlforecast.core.TimeSeries._update_features': ( 'core.html#timeseries._update_features',
'mlforecast/core.py'),
'mlforecast.core.TimeSeries._update_y': ('core.html#timeseries._update_y', 'mlforecast/core.py'),
'mlforecast.core.TimeSeries._validate_freq': ('core.html#timeseries._validate_freq', 'mlforecast/core.py'),
'mlforecast.core.TimeSeries.features': ('core.html#timeseries.features', 'mlforecast/core.py'),
'mlforecast.core.TimeSeries.fit_transform': ('core.html#timeseries.fit_transform', 'mlforecast/core.py'),
'mlforecast.core.TimeSeries.predict': ('core.html#timeseries.predict', 'mlforecast/core.py'),
Expand Down
190 changes: 62 additions & 128 deletions mlforecast/core.py

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion mlforecast/distributed/forecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ def _preprocess_partition(
fit_ts_only: bool = False,
) -> List[List[Any]]:
ts = copy.deepcopy(base_ts)
ts._validate_freq(part, time_col)
if fit_ts_only:
ts._fit(
part,
Expand Down
95 changes: 41 additions & 54 deletions mlforecast/forecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,9 @@

import numpy as np
import pandas as pd
import utilsforecast.processing as ufp
from sklearn.base import BaseEstimator, clone
from utilsforecast.compat import DataFrame
from utilsforecast.processing import (
assign_columns,
backtest_splits,
copy_if_pandas,
counts_by_id,
drop_index_if_pandas,
filter_with_mask,
is_in,
is_nan,
join,
maybe_compute_sort_indices,
take_rows,
to_numpy,
vertical_concat,
)

from mlforecast.core import (
DateFeature,
Expand Down Expand Up @@ -62,7 +48,7 @@ def _add_conformal_distribution_intervals(
`level` should be already sorted. This strategy creates forecasts paths
based on errors and calculate quantiles using those paths.
"""
fcst_df = copy_if_pandas(fcst_df, deep=False)
fcst_df = ufp.copy_if_pandas(fcst_df, deep=False)
alphas = [100 - lv for lv in level]
cuts = [alpha / 200 for alpha in reversed(alphas)]
cuts.extend(1 - alpha / 200 for alpha in alphas)
Expand All @@ -81,7 +67,7 @@ def _add_conformal_distribution_intervals(
lo_cols = [f"{model}-lo-{lv}" for lv in reversed(level)]
hi_cols = [f"{model}-hi-{lv}" for lv in level]
out_cols = lo_cols + hi_cols
fcst_df = assign_columns(fcst_df, out_cols, quantiles)
fcst_df = ufp.assign_columns(fcst_df, out_cols, quantiles)
return fcst_df

# %% ../nbs/forecast.ipynb 7
Expand All @@ -100,7 +86,7 @@ def _add_conformal_error_intervals(
`level` should be already sorted. This startegy creates prediction intervals
based on the absolute errors.
"""
fcst_df = copy_if_pandas(fcst_df, deep=False)
fcst_df = ufp.copy_if_pandas(fcst_df, deep=False)
cuts = [lv / 100 for lv in level]
for model in model_names:
mean = fcst_df[model].to_numpy().ravel()
Expand All @@ -117,7 +103,7 @@ def _add_conformal_error_intervals(
hi_cols = [f"{model}-hi-{lv}" for lv in level]
quantiles = np.vstack([mean - quantiles[::-1], mean + quantiles]).T
columns = lo_cols + hi_cols
fcst_df = assign_columns(fcst_df, columns, quantiles)
fcst_df = ufp.assign_columns(fcst_df, columns, quantiles)
return fcst_df

# %% ../nbs/forecast.ipynb 8
Expand Down Expand Up @@ -298,7 +284,7 @@ def fit_models(
# TODO: migrate to utils
Xh = X[keep]
else:
Xh = filter_with_mask(X, keep)
Xh = ufp.filter_with_mask(X, keep)
yh = y[keep, col]
self.models_[name].append(clone(model).fit(Xh, yh))
else:
Expand Down Expand Up @@ -348,7 +334,7 @@ def _conformity_scores(
for model in self.models.keys():
# compute absolute error for each model
abs_err = abs(cv_results[model] - cv_results[target_col])
cv_results = assign_columns(cv_results, model, abs_err)
cv_results = ufp.assign_columns(cv_results, model, abs_err)
return cv_results.drop(columns=target_col)

def _invert_transforms_fitted(self, df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -361,7 +347,7 @@ def _invert_transforms_fitted(self, df: pd.DataFrame) -> pd.DataFrame:
model_cols = [
c for c in df.columns if c not in (self.ts.id_col, self.ts.time_col)
]
id_counts = counts_by_id(df, self.ts.id_col)
id_counts = ufp.counts_by_id(df, self.ts.id_col)
sizes = id_counts["counts"].to_numpy()
indptr = np.append(0, sizes.cumsum())
for tfm in self.ts.target_transforms[::-1]:
Expand All @@ -373,7 +359,7 @@ def _invert_transforms_fitted(self, df: pd.DataFrame) -> pd.DataFrame:
for col in model_cols:
ga = GroupedArray(df[col].to_numpy(), indptr)
ga = tfm.inverse_transform_fitted(ga)
df = assign_columns(df, col, ga.data)
df = ufp.assign_columns(df, col, ga.data)
tfm.idxs = None
else:
df = tfm.inverse_transform(df)
Expand Down Expand Up @@ -401,39 +387,41 @@ def _compute_fitted_values(
target_col: str,
max_horizon: Optional[int],
) -> DataFrame:
base = copy_if_pandas(base, deep=False)
sort_idxs = maybe_compute_sort_indices(base, id_col, time_col)
base = ufp.copy_if_pandas(base, deep=False)
sort_idxs = ufp.maybe_compute_sort_indices(base, id_col, time_col)
if sort_idxs is not None:
base = take_rows(base, sort_idxs)
X = take_rows(X, sort_idxs)
base = ufp.take_rows(base, sort_idxs)
X = ufp.take_rows(X, sort_idxs)
y = y[sort_idxs]
if max_horizon is None:
fitted_values = assign_columns(base, target_col, y)
fitted_values = ufp.assign_columns(base, target_col, y)
for name, model in self.models_.items():
assert not isinstance(model, list) # mypy
preds = model.predict(X)
fitted_values = assign_columns(fitted_values, name, preds)
fitted_values = ufp.assign_columns(fitted_values, name, preds)
fitted_values = self._invert_transforms_fitted(fitted_values)
else:
horizon_fitted_values = []
for horizon in range(max_horizon):
horizon_base = copy_if_pandas(base, deep=True)
horizon_base = assign_columns(horizon_base, target_col, y[:, horizon])
horizon_base = ufp.copy_if_pandas(base, deep=True)
horizon_base = ufp.assign_columns(
horizon_base, target_col, y[:, horizon]
)
horizon_fitted_values.append(horizon_base)
for name, horizon_models in self.models_.items():
for horizon, model in enumerate(horizon_models):
preds = model.predict(X)
horizon_fitted_values[horizon] = assign_columns(
horizon_fitted_values[horizon] = ufp.assign_columns(
horizon_fitted_values[horizon], name, preds
)
for horizon, horizon_df in enumerate(horizon_fitted_values):
keep_mask = ~is_nan(horizon_df[target_col])
horizon_df = filter_with_mask(horizon_df, keep_mask)
horizon_df = copy_if_pandas(horizon_df, deep=True)
keep_mask = ~ufp.is_nan(horizon_df[target_col])
horizon_df = ufp.filter_with_mask(horizon_df, keep_mask)
horizon_df = ufp.copy_if_pandas(horizon_df, deep=True)
horizon_df = self._invert_transforms_fitted(horizon_df)
horizon_df = assign_columns(horizon_df, "h", horizon + 1)
horizon_df = ufp.assign_columns(horizon_df, "h", horizon + 1)
horizon_fitted_values[horizon] = horizon_df
fitted_values = vertical_concat(
fitted_values = ufp.vertical_concat(
horizon_fitted_values, match_categories=False
)
if self.ts.target_transforms is not None:
Expand Down Expand Up @@ -528,7 +516,7 @@ def fit(
base = prep[[id_col, time_col]]
X, y = self._extract_X_y(prep, target_col)
if as_numpy:
X = to_numpy(X)
X = ufp.to_numpy(X)
del prep
self.fit_models(X, y)
if fitted:
Expand All @@ -541,7 +529,7 @@ def fit(
target_col=target_col,
max_horizon=max_horizon,
)
fitted_values = drop_index_if_pandas(fitted_values)
fitted_values = ufp.drop_index_if_pandas(fitted_values)
self.fcst_fitted_values_ = fitted_values
return self

Expand Down Expand Up @@ -672,8 +660,8 @@ def predict(
self.prediction_intervals.method
)
if ids is not None:
ids_mask = is_in(self._cs_df[self.ts.id_col], ids)
cs_df = filter_with_mask(self._cs_df, ids_mask)
ids_mask = ufp.is_in(self._cs_df[self.ts.id_col], ids)
cs_df = ufp.filter_with_mask(self._cs_df, ids_mask)
n_series = len(ids)
else:
cs_df = self._cs_df
Expand Down Expand Up @@ -770,8 +758,7 @@ def cross_validation(
"""
results = []
self.cv_models_ = []
self.ts._validate_freq(df, time_col)
splits = backtest_splits(
splits = ufp.backtest_splits(
df,
n_windows=n_windows,
h=h,
Expand Down Expand Up @@ -801,7 +788,7 @@ def cross_validation(
self.cv_models_.append(self.models_)
if fitted:
self.cv_fitted_values_.append(
assign_columns(self.fcst_fitted_values_, "fold", i_window)
ufp.assign_columns(self.fcst_fitted_values_, "fold", i_window)
)
if fitted and not should_fit:
if self.ts.target_transforms is not None:
Expand All @@ -823,7 +810,7 @@ def cross_validation(
base = prep[[id_col, time_col]]
train_X, train_y = self._extract_X_y(prep, target_col)
if as_numpy:
train_X = to_numpy(train_X)
train_X = ufp.to_numpy(train_X)
del prep
fitted_values = self._compute_fitted_values(
base=base,
Expand All @@ -834,7 +821,7 @@ def cross_validation(
target_col=target_col,
max_horizon=max_horizon,
)
fitted_values = assign_columns(fitted_values, "fold", i_window)
fitted_values = ufp.assign_columns(fitted_values, "fold", i_window)
self.cv_fitted_values_.append(fitted_values)
static = [c for c in self.ts.static_features_.columns if c != id_col]
dynamic = [
Expand All @@ -854,15 +841,15 @@ def cross_validation(
level=level,
X_df=X_df,
)
y_pred = join(y_pred, cutoffs, on=id_col, how="left")
result = join(
y_pred = ufp.join(y_pred, cutoffs, on=id_col, how="left")
result = ufp.join(
valid[[id_col, time_col, target_col]],
y_pred,
on=[id_col, time_col],
)
sort_idxs = maybe_compute_sort_indices(result, id_col, time_col)
sort_idxs = ufp.maybe_compute_sort_indices(result, id_col, time_col)
if sort_idxs is not None:
result = take_rows(result, sort_idxs)
result = ufp.take_rows(result, sort_idxs)
if result.shape[0] < valid.shape[0]:
raise ValueError(
"Cross validation result produced less results than expected. "
Expand All @@ -871,17 +858,17 @@ def cross_validation(
)
results.append(result)
del self.models_
out = vertical_concat(results, match_categories=False)
out = drop_index_if_pandas(out)
out = ufp.vertical_concat(results, match_categories=False)
out = ufp.drop_index_if_pandas(out)
first_out_cols = [id_col, time_col, "cutoff", target_col]
remaining_cols = [c for c in out.columns if c not in first_out_cols]
return out[first_out_cols + remaining_cols]

def cross_validation_fitted_values(self):
if not getattr(self, "cv_fitted_values_", []):
raise ValueError("Please run cross_validation with fitted=True first.")
out = vertical_concat(self.cv_fitted_values_, match_categories=False)
out = ufp.vertical_concat(self.cv_fitted_values_, match_categories=False)
first_out_cols = [self.ts.id_col, self.ts.time_col, "fold", self.ts.target_col]
remaining_cols = [c for c in out.columns if c not in first_out_cols]
out = drop_index_if_pandas(out)
out = ufp.drop_index_if_pandas(out)
return out[first_out_cols + remaining_cols]
2 changes: 0 additions & 2 deletions mlforecast/lgb_cv.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,6 @@ def setup(
)
self.metric_fn = _metric2fn[metric]
self.metric_name = metric
self.ts._validate_freq(df, time_col)
self.items = []
self.h = h
self.id_col = id_col
Expand Down Expand Up @@ -427,7 +426,6 @@ def fit(
cv_result : list of tuple.
List of (boosting rounds, metric value) tuples.
"""
self.ts._validate_freq(df, time_col)
self.setup(
df=df,
n_windows=n_windows,
Expand Down
Loading

0 comments on commit a8cb278

Please sign in to comment.