Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Error with HierarchicalReconciliation.reconcile (TopDown Forecast proportions) #321

Closed
ag2789 opened this issue Jan 16, 2025 · 4 comments
Closed
Assignees
Labels

Comments

@ag2789
Copy link

ag2789 commented Jan 16, 2025

What happened + What you expected to happen

I am trying to replicate the code for accounts for my use case. The sample code worked a couple of months ago, but I am not sure whether anything has changed in the structure of the Y_hat_df and Y_df dataframes that is needed to reconcile forecasts at lower levels.

Error:


KeyError Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/base.py:3802, in Index.get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:

File /opt/conda/lib/python3.10/site-packages/pandas/_libs/index.pyx:138, in pandas._libs.index.IndexEngine.get_loc()

File /opt/conda/lib/python3.10/site-packages/pandas/_libs/index.pyx:165, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:5745, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:5753, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'unique_id'

The above exception was the direct cause of the following exception:

KeyError Traceback (most recent call last)
Cell In[23], line 22
4 reconcilers = [#BottomUp(),
5 TopDown(method='forecast_proportions'),
6 # TopDown(method='average_proportions'),
(...)
17 # ERM(method='reg_bu'),
18 ]
20 rec_model = HierarchicalReconciliation(reconcilers=reconcilers)
---> 22 p_rec = rec_model.reconcile(Y_hat_df=p, Y_df=p_fitted, S=S_train, tags=tags)

File /opt/conda/lib/python3.10/site-packages/hierarchicalforecast/core.py:323, in HierarchicalReconciliation.reconcile(self, Y_hat_df, S, tags, Y_df, level, intervals_method, num_samples, seed, is_balanced, id_col, time_col, target_col)
320 Y_nw = None
322 # Check input's validity and sort dataframes
--> 323 Y_hat_nw, S_nw, Y_nw, self.model_names = self._prepare_fit(
324 Y_hat_nw=Y_hat_nw,
325 S_nw=S_nw,
326 Y_nw=Y_nw,
327 tags=tags,
328 level=level,
329 intervals_method=intervals_method,
330 id_col=id_col,
331 time_col=time_col,
332 target_col=target_col,
333 )
335 # Initialize reconciler arguments
336 reconciler_args = dict(
337 idx_bottom=np.arange(len(S_nw))[-S_nw.shape[1] :],
338 tags={
(...)
345 },
346 )

File /opt/conda/lib/python3.10/site-packages/hierarchicalforecast/core.py:203, in HierarchicalReconciliation._prepare_fit(self, Y_hat_nw, S_nw, Y_nw, tags, level, intervals_method, id_col, time_col, target_col)
197 raise ValueError(
198 f"The bottom {S_nw.shape[1]}x{S_nw.shape[1]} part of S must be an identity matrix."
199 )
201 # Check Y_hat_df\S_df series difference
202 # TODO: this logic should be method specific
--> 203 S_diff = set(S_nw[id_col]) - set(Y_hat_nw[id_col])
204 Y_hat_diff = set(Y_hat_nw[id_col]) - set(S_nw[id_col])
205 if S_diff:

File /opt/conda/lib/python3.10/site-packages/narwhals/dataframe.py:1006, in DataFrame.__getitem__(self, item)
1003 return self._from_compliant_dataframe(self._compliant_frame[item])
1004 if isinstance(item, str) or (isinstance(item, tuple) and len(item) == 2):
1005 return self._series(
-> 1006 self._compliant_frame[item],
1007 level=self._level,
1008 )
1010 elif (
1011 is_sequence_but_not_str(item)
1012 or isinstance(item, slice)
1013 or (is_numpy_array(item) and item.ndim == 1)
1014 ):
1015 return self._from_compliant_dataframe(self._compliant_frame[item])

File /opt/conda/lib/python3.10/site-packages/narwhals/_pandas_like/dataframe.py:186, in PandasLikeDataFrame.__getitem__(self, item)
182 if isinstance(item, str):
183 from narwhals._pandas_like.series import PandasLikeSeries
185 return PandasLikeSeries(
--> 186 self._native_frame[item],
187 implementation=self._implementation,
188 backend_version=self._backend_version,
189 version=self._version,
190 )
192 elif (
193 isinstance(item, tuple)
194 and len(item) == 2
195 and is_sequence_but_not_str(item[1])
196 ):
197 if len(item[1]) == 0:
198 # Return empty dataframe

File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:3807, in DataFrame.__getitem__(self, key)
3805 if self.columns.nlevels > 1:
3806 return self._getitem_multilevel(key)
-> 3807 indexer = self.columns.get_loc(key)
3808 if is_integer(indexer):
3809 indexer = [indexer]

File /opt/conda/lib/python3.10/site-packages/pandas/core/indexes/base.py:3804, in Index.get_loc(self, key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
-> 3804 raise KeyError(key) from err
3805 except TypeError:
3806 # If we have a listlike key, _check_indexing_error will raise
3807 # InvalidIndexError. Otherwise we fall through and re-raise
3808 # the TypeError.
3809 self._check_indexing_error(key)

KeyError: 'unique_id'

Versions / Dependencies

Python 3.10

Reproduction script

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os

data = pd.read_csv('lacity.org-website-traffic.csv',
parse_dates=['Date']).loc[:, ['Date', 'Device Category', 'Browser', 'Sessions']]
data['Website'] = 'lacity.org'
selected = ['Chrome', 'Firefox', 'Internet Explorer', 'Opera', 'Safari',
'Android Browser', 'Safari (in-app)']
data = data.loc[data['Browser'].isin(selected), :]
data = data.rename(columns={'Date': 'ds', 'Sessions': 'y'})

train = data[['ds','Device Category','y','Website']].loc[data['ds'] < '2019-01-01']
valid = data[['ds','Device Category','y','Website']].loc[(data['ds'] >= '2019-01-01') & (data['ds'] < '2019-02-01')]
h = valid['ds'].nunique()

spec = [['Website'],
['Website', 'Device Category']]

from hierarchicalforecast.utils import aggregate

train_agg, S_train, tags = aggregate(train, spec)
valid_agg, _, _ = aggregate(valid, spec)

from statsforecast import StatsForecast
from statsforecast.models import HoltWinters
model = StatsForecast(models=[HoltWinters(season_length=7, error_type='A')],
freq='D', n_jobs=-1)
model.fit(train_agg)

p = model.forecast(h=h, fitted=True)
p_fitted = model.forecast_fitted_values()

from hierarchicalforecast.methods import BottomUp, TopDown, MinTrace, ERM, OptimalCombination
from hierarchicalforecast.core import HierarchicalReconciliation

reconcilers = [
TopDown(method='forecast_proportions'),
]

rec_model = HierarchicalReconciliation(reconcilers=reconcilers)

p_rec = rec_model.reconcile(Y_hat_df=p, Y_df=p_fitted, S=S_train, tags=tags)

Image

Issue Severity

High: It blocks me from completing my task.

@ag2789 ag2789 added the bug label Jan 16, 2025
@elephaint elephaint self-assigned this Jan 16, 2025
@elephaint
Copy link
Contributor

Unfortunate that your code doesn't work anymore.

We did have a breaking change two months ago, so I'm expecting that to be the issue here. The error message also seems to imply so.

Can you retry with these steps:

  • Upgrade hierarchicalforecast and statsforecast to the latest version (e.g. pip install hierarchicalforecast --upgrade for both packages)
  • Your forecasts p need to be a DataFrame (can be Polars or Pandas) with columns unique_id, ds and y. So, make sure unique_id is a column, not the index.

If this doesn't solve the issue, let me know.

@ag2789
Copy link
Author

ag2789 commented Jan 16, 2025

As per your suggestion, I upgraded hierarchicalforecast and statsforecast to the latest version. I also changed p to have 'unique_id' as a column instead of the index.

I still get KeyError: 'unique_id'. The issue persists.

@elephaint
Copy link
Contributor

elephaint commented Jan 16, 2025

Your code runs fine on my machine, not giving any errors. Below is a rewrite to make the code cleaner + more idiomatic, but I couldn't reproduce the error when I use this dataset from Kaggle.

Can you check whether you still get an error when you download the dataset from Kaggle and run my snippet below? If so, your Python environment is probably faulty, as it must then be a package version issue.

Here's the rewritten code (gives same results as your code):

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os

data = pd.read_csv('lacity.org-website-traffic.csv',
parse_dates=['Date'], usecols=['Date', 'Device Category', 'Browser', 'Sessions'])

data['Website'] = 'lacity.org'

selected = ['Chrome', 'Firefox', 'Internet Explorer', 'Opera', 'Safari',
'Android Browser', 'Safari (in-app)']

data = data.query("Browser.isin(@selected)")

# Each timeseries needs a unique_id. We create that based on the browser and device category, which defines a unique timeseries in this dataset.
data["unique_id"] = data["Browser"] + "_" + data["Device Category"]

train = data.query("Date < '2019-01-01'")
valid = data.query("Date >= '2019-01-01' & Date < '2019-02-01'")

h = valid['Date'].nunique()

# The lowest granularity of the data is on browser-device category level. This is an example of a hierarchical structure that could be used for reconciliation.
spec = [['Website'],
        ['Website', 'Device Category']]

from hierarchicalforecast.utils import aggregate

# We can specify the time_col and target_cols in aggregate, so we don't have to rename them.
train_agg, S_train, tags = aggregate(train, spec, time_col='Date', target_cols=["Sessions"])

from statsforecast import StatsForecast
from statsforecast.models import HoltWinters

model = StatsForecast(models=[HoltWinters(season_length=7, error_type='A')],
freq='D')

# Model fitting isn't required in StatsForecast, we can just call .forecast directly. Also, we can specify the time_col and target_col in forecast, so we don't have to rename them.
p = model.forecast(df = train_agg, h=h, fitted=True, time_col='Date', target_col="Sessions")
p_fitted = model.forecast_fitted_values()

from hierarchicalforecast.methods import BottomUp, TopDown, MinTrace, ERM, OptimalCombination
from hierarchicalforecast.core import HierarchicalReconciliation

reconcilers = [
TopDown(method='forecast_proportions'),
]

rec_model = HierarchicalReconciliation(reconcilers=reconcilers)

p_rec = rec_model.reconcile(Y_hat_df=p, Y_df=p_fitted, S=S_train, tags=tags, time_col='Date', target_col='Sessions')

@ag2789
Copy link
Author

ag2789 commented Jan 17, 2025

The issue is resolved. Thanks for your help!

@ag2789 ag2789 closed this as completed Jan 17, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

2 participants