Skip to content

Commit

Permalink
migrate processing and generate_series to utilsforecast (#221)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmoralez authored Sep 29, 2023
1 parent 4407dc5 commit 7328627
Show file tree
Hide file tree
Showing 34 changed files with 654 additions and 811 deletions.
92 changes: 42 additions & 50 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,35 +169,35 @@ series.head()
<th>0</th>
<td>id_00</td>
<td>2000-01-01</td>
<td>1.751917</td>
<td>17.519167</td>
<td>72</td>
</tr>
<tr>
<th>1</th>
<td>id_00</td>
<td>2000-01-02</td>
<td>9.196715</td>
<td>87.799695</td>
<td>72</td>
</tr>
<tr>
<th>2</th>
<td>id_00</td>
<td>2000-01-03</td>
<td>18.577788</td>
<td>177.442975</td>
<td>72</td>
</tr>
<tr>
<th>3</th>
<td>id_00</td>
<td>2000-01-04</td>
<td>24.520646</td>
<td>232.704110</td>
<td>72</td>
</tr>
<tr>
<th>4</th>
<td>id_00</td>
<td>2000-01-05</td>
<td>33.418028</td>
<td>317.510474</td>
<td>72</td>
</tr>
</tbody>
Expand All @@ -216,7 +216,7 @@ import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

models = [
lgb.LGBMRegressor(),
lgb.LGBMRegressor(verbosity=-1),
xgb.XGBRegressor(),
RandomForestRegressor(random_state=0),
]
Expand Down Expand Up @@ -298,41 +298,41 @@ predictions
<th>0</th>
<td>id_00</td>
<td>2000-04-04</td>
<td>69.082830</td>
<td>67.761337</td>
<td>68.226556</td>
<td>299.923771</td>
<td>309.664124</td>
<td>298.424164</td>
</tr>
<tr>
<th>1</th>
<td>id_00</td>
<td>2000-04-05</td>
<td>75.706024</td>
<td>74.588699</td>
<td>75.484774</td>
<td>365.424147</td>
<td>382.150085</td>
<td>365.816014</td>
</tr>
<tr>
<th>2</th>
<td>id_00</td>
<td>2000-04-06</td>
<td>82.222473</td>
<td>81.058289</td>
<td>82.853684</td>
<td>432.562441</td>
<td>453.373779</td>
<td>436.360620</td>
</tr>
<tr>
<th>3</th>
<td>id_00</td>
<td>2000-04-07</td>
<td>89.577638</td>
<td>88.735947</td>
<td>90.351212</td>
<td>495.628000</td>
<td>527.965149</td>
<td>503.670100</td>
</tr>
<tr>
<th>4</th>
<td>id_00</td>
<td>2000-04-08</td>
<td>44.149095</td>
<td>44.981384</td>
<td>46.291173</td>
<td>60.786223</td>
<td>75.762299</td>
<td>62.176080</td>
</tr>
<tr>
<th>...</th>
Expand All @@ -346,41 +346,41 @@ predictions
<th>275</th>
<td>id_19</td>
<td>2000-03-23</td>
<td>30.151270</td>
<td>31.814825</td>
<td>32.592799</td>
<td>36.266780</td>
<td>29.889120</td>
<td>34.799780</td>
</tr>
<tr>
<th>276</th>
<td>id_19</td>
<td>2000-03-24</td>
<td>31.418104</td>
<td>32.653374</td>
<td>33.563294</td>
<td>44.370984</td>
<td>34.968884</td>
<td>39.920982</td>
</tr>
<tr>
<th>277</th>
<td>id_19</td>
<td>2000-03-25</td>
<td>32.843567</td>
<td>33.586033</td>
<td>34.530912</td>
<td>50.746222</td>
<td>39.970238</td>
<td>46.196266</td>
</tr>
<tr>
<th>278</th>
<td>id_19</td>
<td>2000-03-26</td>
<td>34.127210</td>
<td>34.541473</td>
<td>35.507559</td>
<td>58.906524</td>
<td>45.125305</td>
<td>51.653060</td>
</tr>
<tr>
<th>279</th>
<td>id_19</td>
<td>2000-03-27</td>
<td>34.329202</td>
<td>35.450943</td>
<td>36.425001</td>
<td>63.073949</td>
<td>50.682716</td>
<td>56.845384</td>
</tr>
</tbody>
</table>
Expand All @@ -390,23 +390,15 @@ predictions
### Visualize results

``` python
import matplotlib.pyplot as plt
import pandas as pd

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 6), gridspec_kw=dict(hspace=0.3))
for i, (uid, axi) in enumerate(zip(series['unique_id'].unique(), ax.flat)):
fltr = lambda df: df['unique_id'].eq(uid)
pd.concat([series.loc[fltr, ['ds', 'y']], predictions.loc[fltr]]).set_index('ds').plot(ax=axi)
axi.set(title=uid, xlabel=None)
if i % 2 == 0:
axi.legend().remove()
else:
axi.legend(bbox_to_anchor=(1.01, 1.0))
from utilsforecast.plotting import plot_series
```

``` python
fig = plot_series(series, predictions, max_ids=4, plot_random=False)
fig.savefig('figs/index.png', bbox_inches='tight')
plt.close()
```

![](https://raw.githubusercontent.com/Nixtla/mlforecast/main/figs/index.png)
![](https://raw.githubusercontent.com/Nixtla/mlforecast/main/nbs/figs/index.png)

## Sample notebooks

Expand Down
4 changes: 2 additions & 2 deletions action_files/remove_logs_cells
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def cell_contains_ips(cell):
if 'text' not in output:
return False
for line in output['text']:
if IP_REGEX.search(line) or HOURS_REGEX.search(line):
if IP_REGEX.search(line) or HOURS_REGEX.search(line) or 'Trying to bind port' in line:
return True
return False

Expand All @@ -27,5 +27,5 @@ def clean_nb(nb):

if __name__ == '__main__':
repo_root = Path(__file__).parents[1]
for nb in (repo_root / 'nbs').glob('*distributed*'):
for nb in (repo_root / 'nbs').rglob('*distributed*.ipynb'):
process_write(warn_msg='Failed to clean_nb', proc_nb=clean_nb, f_in=nb)
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ dependencies:
- pyspark>=3.3
- scikit-learn
- statsmodels
- utilsforecast
- window-ops
- xgboost
- pip:
Expand All @@ -24,4 +23,5 @@ dependencies:
- lightgbm_ray
- nbdev
- ray<2.4
- utilsforecast>=0.0.6
- xgboost_ray
Binary file removed figs/index.png
Binary file not shown.
2 changes: 1 addition & 1 deletion local_environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ dependencies:
- prophet
- scikit-learn
- statsmodels
- utilsforecast
- window-ops
- xgboost
- pip:
- datasetsforecast
- nbdev
- utilsforecast>=0.0.6
37 changes: 23 additions & 14 deletions mlforecast/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
from .target_transforms import BaseTargetTransform
from .utils import _ensure_shallow_copy

from utilsforecast.processing import DataFrameProcessor
from utilsforecast.validation import validate_format

# %% ../nbs/core.ipynb 10
date_features_dtypes = {
"year": np.uint16,
Expand Down Expand Up @@ -193,48 +196,54 @@ def _fit(
keep_last_n: Optional[int] = None,
) -> "TimeSeries":
"""Save the series values, ids and last dates."""
for col in (id_col, time_col, target_col):
if col not in df:
raise ValueError(f"Data doesn't contain {col} column")
validate_format(df, id_col, time_col, target_col)
if df[target_col].isnull().any():
raise ValueError(f"{target_col} column contains null values.")
if pd.api.types.is_datetime64_dtype(df[time_col]):
if self.freq == 1:
raise ValueError(
"Must set frequency when using a timestamp type column."
)
elif np.issubdtype(df[time_col].dtype.type, np.integer):
else:
if self.freq != 1:
warnings.warn("Setting `freq=1` since time col is int.")
self.freq = 1
else:
raise ValueError(f"{time_col} must be either timestamp or integer.")
self.id_col = id_col
self.target_col = target_col
self.time_col = time_col
self.keep_last_n = keep_last_n
self.static_features = static_features
sort_idxs = pd.core.sorting.lexsort_indexer([df[id_col], df[time_col]])
self.restore_idxs = np.empty(df.shape[0], dtype=np.int32)
self.restore_idxs[sort_idxs] = np.arange(df.shape[0])
sorted_df = df[[id_col, time_col, target_col]].iloc[sort_idxs]
proc = DataFrameProcessor(id_col, time_col, target_col)
sorted_df = df[[id_col, time_col, target_col]]
uids, times, _, indptr, sort_idxs = proc.process(sorted_df)
self.uids = pd.Index(uids)
self.last_dates = pd.Index(times)
if sort_idxs is not None:
self.restore_idxs = np.empty(df.shape[0], dtype=np.int32)
self.restore_idxs[sort_idxs] = np.arange(df.shape[0])
sorted_df = sorted_df.iloc[sort_idxs]
else:
self.restore_idxs = np.arange(df.shape[0])
if self.target_transforms is not None:
for tfm in self.target_transforms:
tfm.set_column_names(id_col, time_col, target_col)
sorted_df = tfm.fit_transform(sorted_df)
self.ga = GroupedArray.from_sorted_df(sorted_df, id_col, target_col)
data = sorted_df[target_col].values
if data.dtype not in (np.float32, np.float64):
data = data.astype(np.float32)
self.ga = GroupedArray(data, indptr)
self._ga = GroupedArray(self.ga.data, self.ga.indptr)
last_idxs_per_serie = self.ga.indptr[1:] - 1
self.uids = pd.Index(sorted_df[id_col].iloc[last_idxs_per_serie])
self.last_dates = pd.Index(sorted_df[time_col].iloc[last_idxs_per_serie])
to_drop = [id_col, time_col, target_col]
if static_features is None:
static_features = df.columns.drop([time_col, target_col]).tolist()
elif id_col not in static_features:
static_features = [id_col] + static_features
else: # static_features defined and contain id_col
to_drop = [time_col, target_col]
self.static_features_ = df.iloc[sort_idxs[last_idxs_per_serie]][
if sort_idxs is not None:
last_idxs_per_serie = sort_idxs[last_idxs_per_serie]
self.static_features_ = df.iloc[last_idxs_per_serie][
static_features
].reset_index(drop=True)
self.features_order_ = df.columns.drop(to_drop).tolist() + self.features
Expand Down
Loading

0 comments on commit 7328627

Please sign in to comment.