migrate processing and generate_series to utilsforecast (#221)

Nixtla · Sep 29, 2023 · 7328627 · 7328627
1 parent 4407dc5
commit 7328627
Show file tree

Hide file tree

Showing 34 changed files with 654 additions and 811 deletions.
diff --git a/README.md b/README.md
@@ -169,35 +169,35 @@ series.head()
       <th>0</th>
       <td>id_00</td>
       <td>2000-01-01</td>
-      <td>1.751917</td>
+      <td>17.519167</td>
       <td>72</td>
     </tr>
     <tr>
       <th>1</th>
       <td>id_00</td>
       <td>2000-01-02</td>
-      <td>9.196715</td>
+      <td>87.799695</td>
       <td>72</td>
     </tr>
     <tr>
       <th>2</th>
       <td>id_00</td>
       <td>2000-01-03</td>
-      <td>18.577788</td>
+      <td>177.442975</td>
       <td>72</td>
     </tr>
     <tr>
       <th>3</th>
       <td>id_00</td>
       <td>2000-01-04</td>
-      <td>24.520646</td>
+      <td>232.704110</td>
       <td>72</td>
     </tr>
     <tr>
       <th>4</th>
       <td>id_00</td>
       <td>2000-01-05</td>
-      <td>33.418028</td>
+      <td>317.510474</td>
       <td>72</td>
     </tr>
   </tbody>
@@ -216,7 +216,7 @@ import xgboost as xgb
 from sklearn.ensemble import RandomForestRegressor
 
 models = [
-    lgb.LGBMRegressor(),
+    lgb.LGBMRegressor(verbosity=-1),
     xgb.XGBRegressor(),
     RandomForestRegressor(random_state=0),
 ]
@@ -298,41 +298,41 @@ predictions
       <th>0</th>
       <td>id_00</td>
       <td>2000-04-04</td>
-      <td>69.082830</td>
-      <td>67.761337</td>
-      <td>68.226556</td>
+      <td>299.923771</td>
+      <td>309.664124</td>
+      <td>298.424164</td>
     </tr>
     <tr>
       <th>1</th>
       <td>id_00</td>
       <td>2000-04-05</td>
-      <td>75.706024</td>
-      <td>74.588699</td>
-      <td>75.484774</td>
+      <td>365.424147</td>
+      <td>382.150085</td>
+      <td>365.816014</td>
     </tr>
     <tr>
       <th>2</th>
       <td>id_00</td>
       <td>2000-04-06</td>
-      <td>82.222473</td>
-      <td>81.058289</td>
-      <td>82.853684</td>
+      <td>432.562441</td>
+      <td>453.373779</td>
+      <td>436.360620</td>
     </tr>
     <tr>
       <th>3</th>
       <td>id_00</td>
       <td>2000-04-07</td>
-      <td>89.577638</td>
-      <td>88.735947</td>
-      <td>90.351212</td>
+      <td>495.628000</td>
+      <td>527.965149</td>
+      <td>503.670100</td>
     </tr>
     <tr>
       <th>4</th>
       <td>id_00</td>
       <td>2000-04-08</td>
-      <td>44.149095</td>
-      <td>44.981384</td>
-      <td>46.291173</td>
+      <td>60.786223</td>
+      <td>75.762299</td>
+      <td>62.176080</td>
     </tr>
     <tr>
       <th>...</th>
@@ -346,41 +346,41 @@ predictions
       <th>275</th>
       <td>id_19</td>
       <td>2000-03-23</td>
-      <td>30.151270</td>
-      <td>31.814825</td>
-      <td>32.592799</td>
+      <td>36.266780</td>
+      <td>29.889120</td>
+      <td>34.799780</td>
     </tr>
     <tr>
       <th>276</th>
       <td>id_19</td>
       <td>2000-03-24</td>
-      <td>31.418104</td>
-      <td>32.653374</td>
-      <td>33.563294</td>
+      <td>44.370984</td>
+      <td>34.968884</td>
+      <td>39.920982</td>
     </tr>
     <tr>
       <th>277</th>
       <td>id_19</td>
       <td>2000-03-25</td>
-      <td>32.843567</td>
-      <td>33.586033</td>
-      <td>34.530912</td>
+      <td>50.746222</td>
+      <td>39.970238</td>
+      <td>46.196266</td>
     </tr>
     <tr>
       <th>278</th>
       <td>id_19</td>
       <td>2000-03-26</td>
-      <td>34.127210</td>
-      <td>34.541473</td>
-      <td>35.507559</td>
+      <td>58.906524</td>
+      <td>45.125305</td>
+      <td>51.653060</td>
     </tr>
     <tr>
       <th>279</th>
       <td>id_19</td>
       <td>2000-03-27</td>
-      <td>34.329202</td>
-      <td>35.450943</td>
-      <td>36.425001</td>
+      <td>63.073949</td>
+      <td>50.682716</td>
+      <td>56.845384</td>
     </tr>
   </tbody>
 </table>
@@ -390,23 +390,15 @@ predictions
 ### Visualize results
 
 ``` python
-import matplotlib.pyplot as plt
-import pandas as pd
-
-fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 6), gridspec_kw=dict(hspace=0.3))
-for i, (uid, axi) in enumerate(zip(series['unique_id'].unique(), ax.flat)):
-    fltr = lambda df: df['unique_id'].eq(uid)
-    pd.concat([series.loc[fltr, ['ds', 'y']], predictions.loc[fltr]]).set_index('ds').plot(ax=axi)
-    axi.set(title=uid, xlabel=None)
-    if i % 2 == 0:
-        axi.legend().remove()
-    else:
-        axi.legend(bbox_to_anchor=(1.01, 1.0))
+from utilsforecast.plotting import plot_series
+```
+
+``` python
+fig = plot_series(series, predictions, max_ids=4, plot_random=False)
 fig.savefig('figs/index.png', bbox_inches='tight')
-plt.close()
 ```
 
-![](https://raw.githubusercontent.com/Nixtla/mlforecast/main/figs/index.png)
+![](https://raw.githubusercontent.com/Nixtla/mlforecast/main/nbs/figs/index.png)
 
 ## Sample notebooks
 

diff --git a/action_files/remove_logs_cells b/action_files/remove_logs_cells
@@ -14,7 +14,7 @@ def cell_contains_ips(cell):
         if 'text' not in output:
             return False
         for line in output['text']:
-            if IP_REGEX.search(line) or HOURS_REGEX.search(line):
+            if IP_REGEX.search(line) or HOURS_REGEX.search(line) or 'Trying to bind port' in line:
                 return True
     return False
 
@@ -27,5 +27,5 @@ def clean_nb(nb):
 
 if __name__ == '__main__':
     repo_root = Path(__file__).parents[1]
-    for nb in (repo_root / 'nbs').glob('*distributed*'):
+    for nb in (repo_root / 'nbs').rglob('*distributed*.ipynb'):
         process_write(warn_msg='Failed to clean_nb', proc_nb=clean_nb, f_in=nb)
diff --git a/environment.yml b/environment.yml
@@ -14,7 +14,6 @@ dependencies:
   - pyspark>=3.3
   - scikit-learn
   - statsmodels
-  - utilsforecast
   - window-ops
   - xgboost
   - pip:
@@ -24,4 +23,5 @@ dependencies:
     - lightgbm_ray
     - nbdev
     - ray<2.4
+    - utilsforecast>=0.0.6
     - xgboost_ray
diff --git a/figs/index.png b/figs/index.png
diff --git a/local_environment.yml b/local_environment.yml
@@ -12,9 +12,9 @@ dependencies:
   - prophet
   - scikit-learn
   - statsmodels
-  - utilsforecast
   - window-ops
   - xgboost
   - pip:
     - datasetsforecast
     - nbdev
+    - utilsforecast>=0.0.6
diff --git a/mlforecast/core.py b/mlforecast/core.py
@@ -19,6 +19,9 @@
 from .target_transforms import BaseTargetTransform
 from .utils import _ensure_shallow_copy
 
+from utilsforecast.processing import DataFrameProcessor
+from utilsforecast.validation import validate_format
+
 # %% ../nbs/core.ipynb 10
 date_features_dtypes = {
     "year": np.uint16,
@@ -193,48 +196,54 @@ def _fit(
         keep_last_n: Optional[int] = None,
     ) -> "TimeSeries":
         """Save the series values, ids and last dates."""
-        for col in (id_col, time_col, target_col):
-            if col not in df:
-                raise ValueError(f"Data doesn't contain {col} column")
+        validate_format(df, id_col, time_col, target_col)
         if df[target_col].isnull().any():
             raise ValueError(f"{target_col} column contains null values.")
         if pd.api.types.is_datetime64_dtype(df[time_col]):
             if self.freq == 1:
                 raise ValueError(
                     "Must set frequency when using a timestamp type column."
                 )
-        elif np.issubdtype(df[time_col].dtype.type, np.integer):
+        else:
             if self.freq != 1:
                 warnings.warn("Setting `freq=1` since time col is int.")
                 self.freq = 1
-        else:
-            raise ValueError(f"{time_col} must be either timestamp or integer.")
         self.id_col = id_col
         self.target_col = target_col
         self.time_col = time_col
         self.keep_last_n = keep_last_n
         self.static_features = static_features
-        sort_idxs = pd.core.sorting.lexsort_indexer([df[id_col], df[time_col]])
-        self.restore_idxs = np.empty(df.shape[0], dtype=np.int32)
-        self.restore_idxs[sort_idxs] = np.arange(df.shape[0])
-        sorted_df = df[[id_col, time_col, target_col]].iloc[sort_idxs]
+        proc = DataFrameProcessor(id_col, time_col, target_col)
+        sorted_df = df[[id_col, time_col, target_col]]
+        uids, times, _, indptr, sort_idxs = proc.process(sorted_df)
+        self.uids = pd.Index(uids)
+        self.last_dates = pd.Index(times)
+        if sort_idxs is not None:
+            self.restore_idxs = np.empty(df.shape[0], dtype=np.int32)
+            self.restore_idxs[sort_idxs] = np.arange(df.shape[0])
+            sorted_df = sorted_df.iloc[sort_idxs]
+        else:
+            self.restore_idxs = np.arange(df.shape[0])
         if self.target_transforms is not None:
             for tfm in self.target_transforms:
                 tfm.set_column_names(id_col, time_col, target_col)
                 sorted_df = tfm.fit_transform(sorted_df)
-        self.ga = GroupedArray.from_sorted_df(sorted_df, id_col, target_col)
+        data = sorted_df[target_col].values
+        if data.dtype not in (np.float32, np.float64):
+            data = data.astype(np.float32)
+        self.ga = GroupedArray(data, indptr)
         self._ga = GroupedArray(self.ga.data, self.ga.indptr)
         last_idxs_per_serie = self.ga.indptr[1:] - 1
-        self.uids = pd.Index(sorted_df[id_col].iloc[last_idxs_per_serie])
-        self.last_dates = pd.Index(sorted_df[time_col].iloc[last_idxs_per_serie])
         to_drop = [id_col, time_col, target_col]
         if static_features is None:
             static_features = df.columns.drop([time_col, target_col]).tolist()
         elif id_col not in static_features:
             static_features = [id_col] + static_features
         else:  # static_features defined and contain id_col
             to_drop = [time_col, target_col]
-        self.static_features_ = df.iloc[sort_idxs[last_idxs_per_serie]][
+        if sort_idxs is not None:
+            last_idxs_per_serie = sort_idxs[last_idxs_per_serie]
+        self.static_features_ = df.iloc[last_idxs_per_serie][
             static_features
         ].reset_index(drop=True)
         self.features_order_ = df.columns.drop(to_drop).tolist() + self.features