Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename TSDataset.index -> TSDataset.timestamps, TSDataset.add_columns_from_pandas -> TSDataset.add_features_from_pandas, TSDataset.update_columns_from_pandas -> TSDataset.update_features_from_pandas #593

Merged
merged 10 commits into from
Feb 3, 2025
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed
- Fix possibility of silent handling of duplicate features when updating dataset with `TSDataset.update_columns_from_pandas` ([#552](https://github.com/etna-team/etna/pull/552))
-
-
-
- **Breaking:** Rename `TSDataset.index` to `TSDataset.timestamps` ([#593](https://github.com/etna-team/etna/pull/593))
- **Breaking:** Rename `TSDataset.add_columns_from_pandas` to `TSDataset.add_features_from_pandas` ([#593](https://github.com/etna-team/etna/pull/593))
- **Breaking:** Rename `TSDataset.update_columns_from_pandas` to `TSDataset.update_features_from_pandas` ([#593](https://github.com/etna-team/etna/pull/593))
-
-
-
Expand Down
6 changes: 3 additions & 3 deletions etna/analysis/eda/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@
if segments is None:
segments = sorted(ts.segments)

holidays_df = _create_holidays_df(holidays, index=ts.index, as_is=as_is)
holidays_df = _create_holidays_df(holidays, index=ts.timestamps, as_is=as_is)

Check warning on line 336 in etna/analysis/eda/plots.py

View check run for this annotation

Codecov / codecov/patch

etna/analysis/eda/plots.py#L336

Added line #L336 was not covered by tests

_, ax = _prepare_axes(num_plots=len(segments), columns_num=columns_num, figsize=figsize)

Expand Down Expand Up @@ -633,7 +633,7 @@
default value is "1M"

* integer for data with integer timestamp, groups are formed by ``timestamp // freq``,
default value is ``ts.index.max() + 1``
default value is ``ts.timestamps.max() + 1``

n_rows:
maximum number of rows to plot
Expand All @@ -657,7 +657,7 @@
if ts.freq is None:
# make only one group
if freq is None:
freq = ts.index.max() + 1
freq = ts.timestamps.max() + 1

Check warning on line 660 in etna/analysis/eda/plots.py

View check run for this annotation

Codecov / codecov/patch

etna/analysis/eda/plots.py#L660

Added line #L660 was not covered by tests
grouped_data = df_full.groupby(df_full.timestamp // freq)
else:
if freq is None:
Expand Down
2 changes: 1 addition & 1 deletion etna/clustering/distances/euclidean_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def _get_average(self, ts: "TSDataset") -> pd.DataFrame:
pd.DataFrame:
dataframe with columns "timestamp" and "target" that contains the series
"""
centroid = pd.DataFrame({"timestamp": ts.index.values, "target": ts.df.mean(axis=1).values})
centroid = pd.DataFrame({"timestamp": ts.timestamps.values, "target": ts.df.mean(axis=1).values})
return centroid


Expand Down
2 changes: 1 addition & 1 deletion etna/commands/forecast_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def compute_horizon(horizon: int, forecast_params: Dict[str, Any], tsdataset: TS
forecast_start_timestamp = _check_timestamp_param(
param=forecast_params["start_timestamp"], param_name="start_timestamp", freq=tsdataset.freq
)
train_end_timestamp = tsdataset.index.max()
train_end_timestamp = tsdataset.timestamps.max()

if forecast_start_timestamp <= train_end_timestamp:
raise ValueError("Parameter `start_timestamp` should greater than end of training dataset!")
Expand Down
4 changes: 2 additions & 2 deletions etna/commands/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _max_n_folds_forecast(pipeline: Pipeline, context_size: int, ts: Optional[TS
else:
ts = pipeline.ts

num_points = len(ts.index)
num_points = len(ts.timestamps)
horizon = pipeline.horizon

return _estimate_n_folds(num_points=num_points, horizon=horizon, stride=horizon, context_size=context_size)
Expand All @@ -65,7 +65,7 @@ def _max_n_folds_backtest(pipeline: Pipeline, context_size: int, ts: TSDataset,
if backtest_with_intervals:
raise NotImplementedError("Number of folds estimation for backtest with intervals is not implemented!")

num_points = len(ts.index)
num_points = len(ts.timestamps)

horizon = pipeline.horizon
stride = method_kwargs.get("stride", horizon)
Expand Down
12 changes: 6 additions & 6 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ def make_future(
# check if we have enough values in regressors
# TODO: check performance
if self.regressors:
future_index = df.index.difference(self.index)
future_index = df.index.difference(self.timestamps)
for segment in self.segments:
regressors_index = self.df_exog.loc[:, pd.IndexSlice[segment, self.regressors]].index
if not np.all(future_index.isin(regressors_index)):
Expand Down Expand Up @@ -1258,7 +1258,7 @@ def train_test_split(

return train, test

def update_columns_from_pandas(self, df_update: pd.DataFrame):
def update_features_from_pandas(self, df_update: pd.DataFrame):
"""Update the existing columns in the dataset with the new values from pandas dataframe.

Before updating columns in ``df``, columns of ``df_update`` will be cropped by the last timestamp in ``df``.
Expand Down Expand Up @@ -1298,7 +1298,7 @@ def update_columns_from_pandas(self, df_update: pd.DataFrame):

self.df.iloc[:, column_idx] = df

def add_columns_from_pandas(
def add_features_from_pandas(
self, df_update: pd.DataFrame, update_exog: bool = False, regressors: Optional[List[str]] = None
):
"""Update the dataset with the new columns from pandas dataframe.
Expand Down Expand Up @@ -1374,15 +1374,15 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
self._regressors = list(set(self._regressors) - features_set)

@property
def index(self) -> pd.Index:
def timestamps(self) -> pd.Index:
"""Return TSDataset timestamp index.

Returns
-------
:
timestamp index of TSDataset
"""
return self.df.index
return self.df.index.copy()

def level_names(self) -> Optional[List[str]]:
"""Return names of the levels in the hierarchical structure."""
Expand Down Expand Up @@ -1911,4 +1911,4 @@ def size(self) -> Tuple[int, int, Optional[int]]:
:
Tuple of TSDataset sizes
"""
return len(self.index), len(self.segments), len(self.features)
return len(self.timestamps), len(self.segments), len(self.features)
2 changes: 1 addition & 1 deletion etna/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def _validate_index(y_true: TSDataset, y_pred: TSDataset):
ValueError:
If there are mismatches in ``y_true`` and ``y_pred`` timestamps
"""
if not y_true.index.equals(y_pred.index):
if not y_true.timestamps.equals(y_pred.timestamps):
raise ValueError("y_true and y_pred have different timestamps")

def _validate_nans(self, y_true: TSDataset, y_pred: TSDataset):
Expand Down
4 changes: 2 additions & 2 deletions etna/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,7 @@ def forecast(self, ts: "TSDataset", prediction_size: int, return_components: boo
raise NotImplementedError("This mode isn't currently implemented!")

expected_length = prediction_size + self.encoder_length
if len(ts.index) < expected_length:
if len(ts.timestamps) < expected_length:
raise ValueError(
"Given context isn't big enough, try to decrease context_size, prediction_size or increase length of given dataset!"
)
Expand All @@ -682,7 +682,7 @@ def forecast(self, ts: "TSDataset", prediction_size: int, return_components: boo
dropna=False,
)
predictions = self.raw_predict(test_dataset)
end_idx = len(ts.index)
end_idx = len(ts.timestamps)
future_ts = ts.tsdataset_idx_slice(start_idx=end_idx - prediction_size, end_idx=end_idx)
for (segment, feature_nm), value in predictions.items():
# we don't want to change dtype after assignment, but there can happen cast to float32
Expand Down
4 changes: 2 additions & 2 deletions etna/models/nn/chronos/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def _forecast(
if return_components:
raise NotImplementedError("This mode isn't currently implemented!")

max_context_size = len(ts.index) - prediction_size
max_context_size = len(ts.timestamps) - prediction_size
if max_context_size <= 0:
raise ValueError("Dataset doesn't have any context timestamps.")

Expand All @@ -220,7 +220,7 @@ def _forecast(
**predict_kwargs,
) # shape [n_segments, prediction_length, n_quantiles], [n_segments, prediction_length]

end_idx = len(ts.index)
end_idx = len(ts.timestamps)
future_ts = ts.tsdataset_idx_slice(start_idx=end_idx - prediction_size, end_idx=end_idx)

if prediction_interval:
Expand Down
6 changes: 3 additions & 3 deletions etna/models/nn/timesfm.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def forecast(
if return_components:
raise NotImplementedError("This mode isn't currently implemented!")

max_context_size = len(ts.index) - prediction_size
max_context_size = len(ts.timestamps) - prediction_size
if max_context_size <= 0:
raise ValueError("Dataset doesn't have any context timestamps.")

Expand All @@ -235,15 +235,15 @@ def forecast(

self.tfm._set_horizon(prediction_size)

end_idx = len(ts.index)
end_idx = len(ts.timestamps)

all_exog = self._exog_columns()
df_slice = ts.df.loc[:, pd.IndexSlice[:, all_exog + ["target"]]]
first_valid_index = (
df_slice.isna().any(axis=1).idxmin()
) # If all timestamps contains NaNs, idxmin() returns the first timestamp

target_df = df_slice.loc[first_valid_index : ts.index[-prediction_size - 1], pd.IndexSlice[:, "target"]]
target_df = df_slice.loc[first_valid_index : ts.timestamps[-prediction_size - 1], pd.IndexSlice[:, "target"]]

nan_segment_mask = target_df.isna().any()
if nan_segment_mask.any():
Expand Down
2 changes: 1 addition & 1 deletion etna/pipeline/autoregressive_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def _forecast(self, ts: TSDataset, return_components: bool) -> TSDataset:
target_components_dfs = []
for idx_start in range(0, self.horizon, self.step):
current_step = min(self.step, self.horizon - idx_start)
current_idx_border = ts.index.shape[0] + idx_start
current_idx_border = ts.timestamps.shape[0] + idx_start
current_ts = TSDataset(
df=prediction_df.iloc[:current_idx_border],
freq=ts.freq,
Expand Down
16 changes: 8 additions & 8 deletions etna/pipeline/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def validate_on_dataset(self, ts: TSDataset, horizon: int):
ValueError:
Last target timestamp should be not later than horizon steps after last train timestamp
"""
timestamps = ts.index.to_list()
timestamps = ts.timestamps.to_list()

if self.first_train_timestamp is not None and self.first_train_timestamp not in timestamps:
raise ValueError("First train timestamp isn't present in a given dataset!")
Expand Down Expand Up @@ -559,7 +559,7 @@ def _make_predict_timestamps(
end_timestamp = _check_timestamp_param(param=end_timestamp, param_name="end_timestamp", freq=ts.freq)

min_timestamp = ts.describe()["start_timestamp"].max()
max_timestamp = ts.index[-1]
max_timestamp = ts.timestamps[-1]

if start_timestamp is None:
start_timestamp = min_timestamp
Expand Down Expand Up @@ -719,7 +719,7 @@ def _generate_masks_from_n_folds(
assert_never(mode)

masks = []
dataset_timestamps = list(ts.index)
dataset_timestamps = list(ts.timestamps)
min_timestamp_idx, max_timestamp_idx = 0, len(dataset_timestamps)
for offset in range(n_folds, 0, -1):
min_train_idx = min_timestamp_idx + (n_folds - offset) * stride * constant_history_length
Expand Down Expand Up @@ -756,7 +756,7 @@ def _generate_folds_datasets(
ts: TSDataset, masks: List[FoldMask], horizon: int
) -> Generator[Tuple[TSDataset, TSDataset], None, None]:
"""Generate folds."""
timestamps = list(ts.index)
timestamps = list(ts.timestamps)
for mask in masks:
min_train_idx = timestamps.index(mask.first_train_timestamp)
max_train_idx = timestamps.index(mask.last_train_timestamp)
Expand Down Expand Up @@ -824,10 +824,10 @@ def _process_fold_forecast(
logger.start_experiment(job_type="crossval", group=str(fold_number))

fold: Dict[str, Any] = {}
for stage_name, stage_df in zip(("train", "test"), (train, test)):
for stage_name, stage_ts in zip(("train", "test"), (train, test)):
fold[f"{stage_name}_timerange"] = {}
fold[f"{stage_name}_timerange"]["start"] = stage_df.index.min()
fold[f"{stage_name}_timerange"]["end"] = stage_df.index.max()
fold[f"{stage_name}_timerange"]["start"] = stage_ts.timestamps.min()
fold[f"{stage_name}_timerange"]["end"] = stage_ts.timestamps.max()

forecast.df = forecast.df.loc[mask.target_timestamps]
test.df = test.df.loc[mask.target_timestamps]
Expand Down Expand Up @@ -906,7 +906,7 @@ def _prepare_fold_masks(
ts=ts, n_folds=masks, horizon=self.horizon, mode=mode, stride=stride
)
for i, mask in enumerate(masks):
mask.first_train_timestamp = mask.first_train_timestamp if mask.first_train_timestamp else ts.index[0]
mask.first_train_timestamp = mask.first_train_timestamp if mask.first_train_timestamp else ts.timestamps[0]
masks[i] = mask
for mask in masks:
mask.validate_on_dataset(ts=ts, horizon=self.horizon)
Expand Down
2 changes: 1 addition & 1 deletion etna/pipeline/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _create_ts(
def _determine_prediction_size(
self, ts: TSDataset, start_timestamp: Union[pd.Timestamp, int], end_timestamp: Union[pd.Timestamp, int]
) -> int:
timestamp_indices = pd.Series(np.arange(len(ts.index)), index=ts.index)
timestamp_indices = pd.Series(np.arange(len(ts.timestamps)), index=ts.timestamps)
timestamps = timestamp_indices.loc[start_timestamp:end_timestamp]
return len(timestamps)

Expand Down
4 changes: 2 additions & 2 deletions etna/transforms/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,13 @@ def _update_dataset(self, ts: TSDataset, columns_before: Set[str], df_transforme
ts.drop_features(features=columns_to_remove, drop_from_exog=False)
if len(columns_to_add) != 0:
new_regressors = self.get_regressors_info()
ts.add_columns_from_pandas(
ts.add_features_from_pandas(
df_update=df_transformed.loc[pd.IndexSlice[:], pd.IndexSlice[:, columns_to_add]],
update_exog=False,
regressors=new_regressors,
)
if len(columns_to_update) != 0:
ts.update_columns_from_pandas(
ts.update_features_from_pandas(
df_update=df_transformed.loc[pd.IndexSlice[:], pd.IndexSlice[:, columns_to_update]]
)
return ts
Expand Down
12 changes: 6 additions & 6 deletions etna/transforms/decomposition/dft_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ def fit(self, ts: TSDataset) -> "FourierDecomposeTransform":
:
the fitted transform instance.
"""
self._first_timestamp = ts.index.min()
self._last_timestamp = ts.index.max()
self._first_timestamp = ts.timestamps.min()
self._last_timestamp = ts.timestamps.max()

self._check_segments(df=ts[..., self.in_column].droplevel("feature", axis=1))

Expand All @@ -159,19 +159,19 @@ def transform(self, ts: TSDataset) -> TSDataset:
if self._first_timestamp is None:
raise ValueError("Transform is not fitted!")

if ts.index.min() < self._first_timestamp:
if ts.timestamps.min() < self._first_timestamp:
raise ValueError(
f"First index of the dataset to be transformed must be larger or equal than {self._first_timestamp}!"
)

if ts.index.min() > self._last_timestamp:
if ts.timestamps.min() > self._last_timestamp:
raise ValueError(
f"Dataset to be transformed must contain historical observations in range {self._first_timestamp} - {self._last_timestamp}"
)

segment_df = ts[..., self.in_column].droplevel("feature", axis=1)

ts_max_timestamp = ts.index.max()
ts_max_timestamp = ts.timestamps.max()
if ts_max_timestamp > self._last_timestamp:
future_steps = determine_num_steps(self._last_timestamp, ts_max_timestamp, freq=ts.freq)
segment_df.iloc[-future_steps:] = np.nan
Expand All @@ -192,7 +192,7 @@ def transform(self, ts: TSDataset) -> TSDataset:

segment_components = pd.concat(segment_components, axis=1)

ts.add_columns_from_pandas(segment_components)
ts.add_features_from_pandas(segment_components)

return ts

Expand Down
10 changes: 5 additions & 5 deletions etna/transforms/decomposition/model_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ def fit(self, ts: TSDataset) -> "ModelDecomposeTransform":
:
the fitted transform instance.
"""
self._first_timestamp = ts.index.min()
self._last_timestamp = ts.index.max()
self._first_timestamp = ts.timestamps.min()
self._last_timestamp = ts.timestamps.max()

ts = self._prepare_ts(ts=ts)

Expand All @@ -149,20 +149,20 @@ def transform(self, ts: TSDataset) -> TSDataset:
if self._first_timestamp is None:
raise ValueError("Transform is not fitted!")

if ts.index.min() < self._first_timestamp:
if ts.timestamps.min() < self._first_timestamp:
raise ValueError(
f"First index of the dataset to be transformed must be larger or equal than {self._first_timestamp}!"
)

if ts.index.min() > self._last_timestamp:
if ts.timestamps.min() > self._last_timestamp:
raise ValueError(
f"Dataset to be transformed must contain historical observations in range {self._first_timestamp} - {self._last_timestamp}"
)

decompose_ts = self._prepare_ts(ts=ts)

future_steps = 0
ts_max_timestamp = decompose_ts.index.max()
ts_max_timestamp = decompose_ts.timestamps.max()
if ts_max_timestamp > self._last_timestamp:
future_steps = determine_num_steps(self._last_timestamp, ts_max_timestamp, freq=decompose_ts.freq)
decompose_ts.df = decompose_ts.df.loc[: self._last_timestamp]
Expand Down
4 changes: 2 additions & 2 deletions examples/102-backtest.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1254,7 +1254,7 @@
"source": [
"# 2 With specific mask\n",
"window_size = 85\n",
"first_train_timestamp = ts.index.min() + np.timedelta64(100, \"D\")\n",
"first_train_timestamp = ts.timestamps.min() + np.timedelta64(100, \"D\")\n",
"last_train_timestamp = first_train_timestamp + np.timedelta64(window_size, \"D\")\n",
"target_timestamps = pd.date_range(start=last_train_timestamp + np.timedelta64(1, \"D\"), periods=horizon)\n",
"mask = FoldMask(\n",
Expand Down Expand Up @@ -1381,7 +1381,7 @@
"def sliding_window_masks(window_size, n_folds):\n",
" masks = []\n",
" for n in range(n_folds):\n",
" first_train_timestamp = ts.index.min() + np.timedelta64(100, \"D\") + np.timedelta64(n, \"D\")\n",
" first_train_timestamp = ts.timestamps.min() + np.timedelta64(100, \"D\") + np.timedelta64(n, \"D\")\n",
" last_train_timestamp = first_train_timestamp + np.timedelta64(window_size, \"D\")\n",
" target_timestamps = pd.date_range(start=last_train_timestamp + np.timedelta64(1, \"D\"), periods=horizon)\n",
" mask = FoldMask(\n",
Expand Down
14 changes: 7 additions & 7 deletions examples/206-clustering.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ def const_ts_anomal() -> TSDataset:
@pytest.fixture
def ts_diff_endings(example_reg_tsds):
ts = deepcopy(example_reg_tsds)
ts.loc[ts.index[-5] :, pd.IndexSlice["segment_1", "target"]] = np.NAN
ts.loc[ts.timestamps[-5] :, pd.IndexSlice["segment_1", "target"]] = np.NAN
return ts


Expand Down
Loading