Skip to content

Commit

Permalink
Merge pull request #291 from etna-team/issue-1850009043
Browse files Browse the repository at this point in the history
Teach OutliersTransform to ignore holidays
  • Loading branch information
Polzovat123 authored Apr 9, 2024
2 parents ede40e5 + 95e5129 commit ba63d88
Show file tree
Hide file tree
Showing 5 changed files with 225 additions and 11 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
-
-
-
- Add `ignore_flag_column` parameter to `OutliersTransform` to skip flagged values during outlier detection ([#291](https://github.com/etna-team/etna/pull/291))

### Changed
-
Expand Down Expand Up @@ -98,7 +98,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
-
-
-
- Fix `PredictionIntervalOutliersTransform` failing to work with created columns ([#291](https://github.com/etna-team/etna/pull/291))
- Prohibit empty list value and duplication of `target_timestamps` parameter in `FoldMask` ([#226](https://github.com/etna-team/etna/pull/226))
-
-
Expand Down
6 changes: 5 additions & 1 deletion etna/analysis/outliers/prediction_interval_outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,17 @@ def _select_segments_subset(ts: TSDataset, segments: List[str]) -> TSDataset:
result: TSDataset
dataset with selected column.
"""
df = ts.raw_df.loc[:, pd.IndexSlice[segments, :]].copy()
df = ts.df.loc[:, pd.IndexSlice[segments, :]].copy()
df = df.dropna()
df_exog = ts.df_exog
if df_exog is not None:
df_exog = df_exog.loc[:, pd.IndexSlice[segments, :]].copy()
known_future = ts.known_future
freq = ts.freq

if df_exog is not None:
df = df.drop(df_exog.columns.get_level_values("feature").values.tolist(), axis=1, level=1)

subset_ts = TSDataset(df=df, df_exog=df_exog, known_future=known_future, freq=freq)
return subset_ts

Expand Down
30 changes: 27 additions & 3 deletions etna/transforms/outliers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,24 @@
class OutliersTransform(ReversibleTransform, ABC):
"""Finds outliers in specific columns of DataFrame and replaces it with NaNs."""

def __init__(self, in_column: str):
def __init__(self, in_column: str, ignore_flag_column: Optional[str] = None):
"""
Create instance of OutliersTransform.
Parameters
----------
in_column:
name of processed column
ignore_flag_column:
column name for skipping values from outlier check
"""
super().__init__(required_features=[in_column])
required_features = [in_column]
if ignore_flag_column:
required_features.append(ignore_flag_column)

super().__init__(required_features=required_features)
self.in_column = in_column
self.ignore_flag_column = ignore_flag_column

self.segment_outliers: Optional[Dict[str, pd.Series]] = None

Expand Down Expand Up @@ -78,6 +85,15 @@ def fit(self, ts: TSDataset) -> "OutliersTransform":
:
The fitted transform instance.
"""
if self.ignore_flag_column is not None:
if self.ignore_flag_column not in ts.columns.get_level_values("feature"):
raise ValueError(f'Name ignore_flag_column="{self.ignore_flag_column}" not find.')
types_ignore_flag = ts[..., self.ignore_flag_column].isin([0, 1]).all(axis=0)
if not all(types_ignore_flag):
raise ValueError(
f'Columns ignore_flag contain non binary value: columns: "{self.ignore_flag_column}" in segment: {types_ignore_flag[~types_ignore_flag].index.get_level_values("segment").tolist()}'
)

self.segment_outliers = self.detect_outliers(ts)
self._fit_segments = ts.segments
super().fit(ts=ts)
Expand Down Expand Up @@ -131,8 +147,16 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
if segment not in segments:
continue
# to locate only present indices
segment_outliers_timestamps = list(index_set.intersection(self.segment_outliers[segment].index.values))
if self.ignore_flag_column:
available_points = set(df[df[segment, self.ignore_flag_column] == 0].index.values)
else:
available_points = index_set
segment_outliers_timestamps = list(
available_points.intersection(self.segment_outliers[segment].index.values)
)

df.loc[segment_outliers_timestamps, pd.IndexSlice[segment, self.in_column]] = np.NaN

return df

def _inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
Expand Down
24 changes: 19 additions & 5 deletions etna/transforms/outliers/point_outliers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Callable
from typing import Dict
from typing import List
from typing import Optional
from typing import Type
from typing import Union

Expand Down Expand Up @@ -32,7 +33,13 @@ class MedianOutliersTransform(OutliersTransform):
it uses information from the whole train part.
"""

def __init__(self, in_column: str, window_size: int = 10, alpha: float = 3):
def __init__(
self,
in_column: str,
window_size: int = 10,
alpha: float = 3,
ignore_flag_column: Optional[str] = None,
):
"""Create instance of MedianOutliersTransform.
Parameters
Expand All @@ -43,10 +50,12 @@ def __init__(self, in_column: str, window_size: int = 10, alpha: float = 3):
number of points in the window
alpha:
coefficient for determining the threshold
ignore_flag_column:
column name for skipping values from outlier check
"""
self.window_size = window_size
self.alpha = alpha
super().__init__(in_column=in_column)
super().__init__(in_column=in_column, ignore_flag_column=ignore_flag_column)

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call :py:func:`~etna.analysis.outliers.median_outliers.get_anomalies_median` function with self parameters.
Expand Down Expand Up @@ -97,6 +106,7 @@ def __init__(
distance_coef: float = 3,
n_neighbors: int = 3,
distance_func: Union[Literal["absolute_difference"], Callable[[float, float], float]] = "absolute_difference",
ignore_flag_column: Optional[str] = None,
):
"""Create instance of DensityOutliersTransform.
Expand All @@ -113,12 +123,14 @@ def __init__(
distance_func:
distance function. If a string is specified, a corresponding vectorized implementation will be used.
Custom callable will be used as a scalar function, which will result in worse performance.
ignore_flag_column:
column name for skipping values from outlier check
"""
self.window_size = window_size
self.distance_coef = distance_coef
self.n_neighbors = n_neighbors
self.distance_func = distance_func
super().__init__(in_column=in_column)
super().__init__(in_column=in_column, ignore_flag_column=ignore_flag_column)

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call :py:func:`~etna.analysis.outliers.density_outliers.get_anomalies_density` function with self parameters.
Expand Down Expand Up @@ -169,6 +181,7 @@ def __init__(
in_column: str,
model: Union[Literal["prophet"], Literal["sarimax"], Type["ProphetModel"], Type["SARIMAXModel"]],
interval_width: float = 0.95,
ignore_flag_column: Optional[str] = None,
**model_kwargs,
):
"""Create instance of PredictionIntervalOutliersTransform.
Expand All @@ -181,7 +194,8 @@ def __init__(
model for prediction interval estimation
interval_width:
width of the prediction interval
ignore_flag_column:
column name for skipping values from outlier check
Notes
-----
For not "target" column only column data will be used for learning.
Expand All @@ -190,7 +204,7 @@ def __init__(
self.interval_width = interval_width
self.model_kwargs = model_kwargs
self._model_type = self._get_model_type(model)
super().__init__(in_column=in_column)
super().__init__(in_column=in_column, ignore_flag_column=ignore_flag_column)

@staticmethod
def _get_model_type(
Expand Down
172 changes: 172 additions & 0 deletions tests/test_transforms/test_outliers/test_outliers_transform.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import re
from copy import deepcopy

import numpy as np
import pandas as pd
import pytest
Expand All @@ -6,16 +9,50 @@
from etna.analysis import get_anomalies_median
from etna.analysis import get_anomalies_prediction_interval
from etna.datasets.tsdataset import TSDataset
from etna.models import NaiveModel
from etna.models import ProphetModel
from etna.models import SARIMAXModel
from etna.pipeline import Pipeline
from etna.transforms import DensityOutliersTransform
from etna.transforms import HolidayTransform
from etna.transforms import MedianOutliersTransform
from etna.transforms import PredictionIntervalOutliersTransform
from tests.test_transforms.utils import assert_column_changes
from tests.test_transforms.utils import assert_sampling_is_valid
from tests.test_transforms.utils import assert_transformation_equals_loaded_original
from tests.test_transforms.utils import find_columns_diff
from tests.utils import select_segments_subset


def insert_column(ts, info_col, timestamp, segment):
    """Attach ``info_col`` to ``ts`` as an ``is_holiday`` column for one segment."""
    flag_df = pd.DataFrame(
        {
            "is_holiday": info_col,
            "timestamp": timestamp,
            "segment": segment,
        }
    )
    wide_df = TSDataset.to_dataset(flag_df)
    return ts.add_columns_from_pandas(wide_df)


def made_specific_ds(ts, add_error=True):
    """Add a binary ``is_holiday`` column to both segments of ``ts``.

    When ``add_error`` is True, one point per segment is replaced with a
    non-binary value so that validation of the ignore column can be exercised.
    """
    timestamp = pd.date_range("2021-01-01", end="2021-02-20", freq="D")
    n_points = len(timestamp)
    flags_first = [1 if np.sin(i) > 0.5 else 0 for i in range(n_points)]
    flags_second = [1 if np.sin(i) > 0 else 0 for i in range(n_points)]

    if add_error:
        # corrupt a single point in each segment with a non-binary value
        flags_first[9] = 4
        flags_second[10] = 14

    insert_column(ts, flags_first, timestamp, "1")
    insert_column(ts, flags_second, timestamp, "2")

    return ts


@pytest.fixture()
def outliers_solid_tsds():
"""Create TSDataset with outliers and same last date."""
Expand All @@ -42,6 +79,27 @@ def outliers_solid_tsds():
return ts


@pytest.fixture()
def outliers_solid_tsds_with_holidays(outliers_solid_tsds):
    """TSDataset with outliers enriched with a binary ``is_holiday`` column."""
    transform = HolidayTransform(iso_code="RUS", mode="binary", out_column="is_holiday")
    return transform.fit_transform(outliers_solid_tsds)


@pytest.fixture()
def outliers_solid_tsds_with_error(outliers_solid_tsds):
    """TSDataset with outliers whose ``is_holiday`` column contains non-binary values."""
    return made_specific_ds(outliers_solid_tsds, add_error=True)


@pytest.fixture()
def outliers_solid_tsds_non_regressor_holiday(outliers_solid_tsds):
    """TSDataset with outliers and a non-regressor binary ``is_holiday`` column."""
    return made_specific_ds(outliers_solid_tsds, add_error=False)


@pytest.mark.parametrize("attribute_name,value_type", (("outliers_timestamps", list), ("original_values", pd.Series)))
def test_density_outliers_deprecated_store_attributes(outliers_solid_tsds, attribute_name, value_type):
transform = DensityOutliersTransform(in_column="target")
Expand Down Expand Up @@ -255,3 +313,117 @@ def test_params_to_tune(transform, outliers_solid_tsds):
ts = outliers_solid_tsds
assert len(transform.params_to_tune()) > 0
assert_sampling_is_valid(transform=transform, ts=ts)


@pytest.mark.parametrize(
    "transform",
    (
        MedianOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        DensityOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        PredictionIntervalOutliersTransform(in_column="target", model="sarimax", ignore_flag_column="is_holiday"),
    ),
)
def test_correct_ignore_flag(transform, outliers_solid_tsds_with_holidays):
    """Points flagged by the ignore column must not be replaced with NaN."""
    ts = outliers_solid_tsds_with_holidays
    transform.fit(ts)
    transformed = transform.transform(ts)
    # 2021-01-06 is a holiday in segment "1"; its target must stay untouched
    holiday_target = transformed["2021-01-06":"2021-01-06", "1", "target"]
    assert not holiday_target.isna().any()


@pytest.mark.parametrize(
    "transform",
    (
        MedianOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        DensityOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        PredictionIntervalOutliersTransform(in_column="target", model="sarimax", ignore_flag_column="is_holiday"),
    ),
)
def test_incorrect_not_exists_column(transform, outliers_solid_tsds):
    """Fitting must fail when the ignore column is absent from the dataset."""
    ts = outliers_solid_tsds
    with pytest.raises(ValueError, match='Name ignore_flag_column="is_holiday" not find.'):
        # fit raises before transform could ever run, so no further calls belong here
        transform.fit(ts)


@pytest.mark.parametrize(
    "transform",
    (
        MedianOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        DensityOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        PredictionIntervalOutliersTransform(in_column="target", model="sarimax", ignore_flag_column="is_holiday"),
    ),
)
def test_incorrect_type_ignore_flag(transform, outliers_solid_tsds_with_error):
    """Fitting must fail when the ignore column contains non-binary values."""
    ts = outliers_solid_tsds_with_error
    with pytest.raises(
        ValueError,
        match=re.escape("Columns ignore_flag contain non binary value: columns: \"is_holiday\" in segment: ['1', '2']"),
    ):
        # fit raises before transform could ever run, so no further calls belong here
        transform.fit(ts)


@pytest.mark.parametrize(
    "transform, expected_changes",
    [
        (MedianOutliersTransform(in_column="target", ignore_flag_column="is_holiday"), {"change": {"target"}}),
        (DensityOutliersTransform(in_column="target", ignore_flag_column="is_holiday"), {"change": {"target"}}),
        (
            PredictionIntervalOutliersTransform(in_column="target", model="sarimax", ignore_flag_column="is_holiday"),
            {"change": {"target"}},
        ),
    ],
)
def test_full_train_with_outliers(transform, expected_changes, outliers_solid_tsds_with_holidays):
    """Check that transform + inverse_transform changes only ``target`` and restores its values."""
    ts = outliers_solid_tsds_with_holidays

    train_ts = deepcopy(ts)
    test_ts = deepcopy(ts)

    transform.fit(train_ts)

    transformed_test_ts = transform.transform(deepcopy(test_ts))

    inverse_transformed_test_ts = transform.inverse_transform(deepcopy(transformed_test_ts))

    # only the target column is expected to differ between transform and inverse_transform
    assert_column_changes(ts_1=transformed_test_ts, ts_2=inverse_transformed_test_ts, expected_changes=expected_changes)
    flat_test_df = test_ts.to_pandas(flatten=True)
    flat_transformed_test_df = transformed_test_ts.to_pandas(flatten=True)
    flat_inverse_transformed_test_df = inverse_transformed_test_ts.to_pandas(flatten=True)
    # created/removed columns are irrelevant here; only the changed ones are compared
    _, _, changed_columns = find_columns_diff(flat_transformed_test_df, flat_inverse_transformed_test_df)
    pd.testing.assert_frame_equal(
        flat_test_df[list(changed_columns)], flat_inverse_transformed_test_df[list(changed_columns)]
    )


@pytest.mark.parametrize(
    "transform",
    [
        MedianOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        DensityOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        PredictionIntervalOutliersTransform(in_column="target", model="sarimax", ignore_flag_column="is_holiday"),
    ],
)
def test_full_pipeline(transform, outliers_solid_tsds):
    """The outlier transform must fit inside a pipeline after HolidayTransform creates its column."""
    holiday_transform = HolidayTransform(iso_code="RUS", mode="binary", out_column="is_holiday")
    pipeline = Pipeline(NaiveModel(lag=1), transforms=[holiday_transform, transform], horizon=3)
    pipeline.fit(outliers_solid_tsds)


@pytest.mark.parametrize(
    "transform",
    [
        MedianOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        DensityOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        PredictionIntervalOutliersTransform(in_column="target", model="sarimax", ignore_flag_column="is_holiday"),
    ],
)
def test_advance_usage_data_in_transform_nonregressor(transform, outliers_solid_tsds_non_regressor_holiday):
    """The ignore column may be a non-regressor feature and must still be usable."""
    ts = outliers_solid_tsds_non_regressor_holiday
    transform.fit(ts)
    transform.transform(ts)

0 comments on commit ba63d88

Please sign in to comment.