Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Teach OutliersTransform to ignore holidays #291

Merged
merged 32 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7c33aed
add feature
Polzovat123 Mar 28, 2024
8faf807
issue 9043
Polzovat123 Mar 29, 2024
94561b8
fix lint error
Polzovat123 Apr 2, 2024
371aef1
clean code
Polzovat123 Apr 3, 2024
9d62d81
more informative docs
Polzovat123 Apr 3, 2024
e29fc47
add tests
Polzovat123 Apr 3, 2024
cfb7cbd
write CHANGELOG.md
Polzovat123 Apr 3, 2024
ec2d517
clear code
Polzovat123 Apr 3, 2024
3e745df
fix bug
Polzovat123 Apr 3, 2024
6840c75
fix style
Polzovat123 Apr 3, 2024
b28cbc1
fix
Polzovat123 Apr 3, 2024
15f5736
fix CHANGELOG.md
Polzovat123 Apr 4, 2024
b2a0f62
fix use level name instead of index to improve readability
Polzovat123 Apr 4, 2024
f40e0a9
more descriptive name
Polzovat123 Apr 4, 2024
68ffccc
use isna
Polzovat123 Apr 4, 2024
0a2d78b
separate test for error
Polzovat123 Apr 4, 2024
d65f202
clear duplication
Polzovat123 Apr 4, 2024
68a5deb
clear after lint
Polzovat123 Apr 4, 2024
28d7221
clear add test pipeline
Polzovat123 Apr 4, 2024
f73f9bf
fix typo mistake
Polzovat123 Apr 5, 2024
d24e3fa
add link in CHANGELOG.md
Polzovat123 Apr 5, 2024
c9738ab
add link in CHANGELOG.md
Polzovat123 Apr 5, 2024
4eb344a
remove duplicate code
Polzovat123 Apr 5, 2024
51b5044
rewrite pipeline test, that shows real usage example
Polzovat123 Apr 5, 2024
211b681
all -> any
Polzovat123 Apr 5, 2024
8927a7a
test behaviour if ignore_column is regressor/non-regressor
Polzovat123 Apr 5, 2024
9445f50
clear code
Polzovat123 Apr 5, 2024
3b5f167
fix pull, not issue
Polzovat123 Apr 5, 2024
4a00daf
remove useless test
Polzovat123 Apr 5, 2024
6c83b12
remove useless tune_params check
Polzovat123 Apr 5, 2024
d925f0a
Merge branch 'master' into issue-1850009043
Polzovat123 Apr 8, 2024
95e5129
Merge branch 'master' into issue-1850009043
Polzovat123 Apr 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
-
-
-
- Add `ignore_flag_column` parameter to `OutliersTransform` to skip flagged values when handling outliers ([#291](https://github.com/etna-team/etna/pull/291))
brsnw250 marked this conversation as resolved.
Show resolved Hide resolved

### Changed
-
Expand Down Expand Up @@ -98,7 +98,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
-
-
-
- Fix `PredictionIntervalOutliersTransform` failing to work with created columns ([#293](https://github.com/etna-team/etna/issues/293))
brsnw250 marked this conversation as resolved.
Show resolved Hide resolved
- Prohibit empty list value and duplication of `target_timestamps` parameter in `FoldMask` ([#226](https://github.com/etna-team/etna/pull/226))
-
-
Expand Down
6 changes: 5 additions & 1 deletion etna/analysis/outliers/prediction_interval_outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,17 @@ def _select_segments_subset(ts: TSDataset, segments: List[str]) -> TSDataset:
result: TSDataset
dataset with selected column.
"""
df = ts.raw_df.loc[:, pd.IndexSlice[segments, :]].copy()
df = ts.df.loc[:, pd.IndexSlice[segments, :]].copy()
df = df.dropna()
df_exog = ts.df_exog
if df_exog is not None:
df_exog = df_exog.loc[:, pd.IndexSlice[segments, :]].copy()
known_future = ts.known_future
freq = ts.freq

if df_exog is not None:
df = df.drop(df_exog.columns.get_level_values("feature").values.tolist(), axis=1, level=1)

subset_ts = TSDataset(df=df, df_exog=df_exog, known_future=known_future, freq=freq)
return subset_ts

Expand Down
30 changes: 27 additions & 3 deletions etna/transforms/outliers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,24 @@
class OutliersTransform(ReversibleTransform, ABC):
"""Finds outliers in specific columns of DataFrame and replaces it with NaNs."""

def __init__(self, in_column: str):
def __init__(self, in_column: str, ignore_flag_column: Optional[str] = None):
"""
Create instance of OutliersTransform.

Parameters
----------
in_column:
name of processed column
ignore_flag_column:
column name for skipping values from outlier check
"""
super().__init__(required_features=[in_column])
required_features = [in_column]
if ignore_flag_column:
required_features.append(ignore_flag_column)

super().__init__(required_features=required_features)
self.in_column = in_column
self.ignore_flag_column = ignore_flag_column

self.segment_outliers: Optional[Dict[str, pd.Series]] = None

Expand Down Expand Up @@ -78,6 +85,15 @@
:
The fitted transform instance.
"""
if self.ignore_flag_column is not None:
if self.ignore_flag_column not in ts.columns.get_level_values("feature"):
raise ValueError(f'Name ignore_flag_column="{self.ignore_flag_column}" not find.')
types_ignore_flag = ts[..., self.ignore_flag_column].isin([0, 1]).all(axis=0)
if not all(types_ignore_flag):
raise ValueError(

Check warning on line 93 in etna/transforms/outliers/base.py

View check run for this annotation

Codecov / codecov/patch

etna/transforms/outliers/base.py#L93

Added line #L93 was not covered by tests
f'Columns ignore_flag contain non binary value: columns: "{self.ignore_flag_column}" in segment: {types_ignore_flag[~types_ignore_flag].index.get_level_values("segment").tolist()}'
)

self.segment_outliers = self.detect_outliers(ts)
self._fit_segments = ts.segments
super().fit(ts=ts)
Expand Down Expand Up @@ -131,8 +147,16 @@
if segment not in segments:
continue
# to locate only present indices
segment_outliers_timestamps = list(index_set.intersection(self.segment_outliers[segment].index.values))
if self.ignore_flag_column:
available_points = set(df[df[segment, self.ignore_flag_column] == 0].index.values)
else:
available_points = index_set
segment_outliers_timestamps = list(
available_points.intersection(self.segment_outliers[segment].index.values)
)

df.loc[segment_outliers_timestamps, pd.IndexSlice[segment, self.in_column]] = np.NaN

return df

def _inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
Expand Down
24 changes: 19 additions & 5 deletions etna/transforms/outliers/point_outliers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Callable
from typing import Dict
from typing import List
from typing import Optional
from typing import Type
from typing import Union

Expand Down Expand Up @@ -32,7 +33,13 @@ class MedianOutliersTransform(OutliersTransform):
it uses information from the whole train part.
"""

def __init__(self, in_column: str, window_size: int = 10, alpha: float = 3):
def __init__(
self,
in_column: str,
window_size: int = 10,
alpha: float = 3,
ignore_flag_column: Optional[str] = None,
):
"""Create instance of MedianOutliersTransform.

Parameters
Expand All @@ -43,10 +50,12 @@ def __init__(self, in_column: str, window_size: int = 10, alpha: float = 3):
number of points in the window
alpha:
coefficient for determining the threshold
ignore_flag_column:
column name for skipping values from outlier check
"""
self.window_size = window_size
self.alpha = alpha
super().__init__(in_column=in_column)
super().__init__(in_column=in_column, ignore_flag_column=ignore_flag_column)

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call :py:func:`~etna.analysis.outliers.median_outliers.get_anomalies_median` function with self parameters.
Expand Down Expand Up @@ -97,6 +106,7 @@ def __init__(
distance_coef: float = 3,
n_neighbors: int = 3,
distance_func: Union[Literal["absolute_difference"], Callable[[float, float], float]] = "absolute_difference",
ignore_flag_column: Optional[str] = None,
):
"""Create instance of DensityOutliersTransform.

Expand All @@ -113,12 +123,14 @@ def __init__(
distance_func:
distance function. If a string is specified, a corresponding vectorized implementation will be used.
Custom callable will be used as a scalar function, which will result in worse performance.
ignore_flag_column:
column name for skipping values from outlier check
"""
self.window_size = window_size
self.distance_coef = distance_coef
self.n_neighbors = n_neighbors
self.distance_func = distance_func
super().__init__(in_column=in_column)
super().__init__(in_column=in_column, ignore_flag_column=ignore_flag_column)

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
"""Call :py:func:`~etna.analysis.outliers.density_outliers.get_anomalies_density` function with self parameters.
Expand Down Expand Up @@ -169,6 +181,7 @@ def __init__(
in_column: str,
model: Union[Literal["prophet"], Literal["sarimax"], Type["ProphetModel"], Type["SARIMAXModel"]],
interval_width: float = 0.95,
ignore_flag_column: Optional[str] = None,
**model_kwargs,
):
"""Create instance of PredictionIntervalOutliersTransform.
Expand All @@ -181,7 +194,8 @@ def __init__(
model for prediction interval estimation
interval_width:
width of the prediction interval

ignore_flag_column:
column name for skipping values from outlier check
Notes
-----
For not "target" column only column data will be used for learning.
Expand All @@ -190,7 +204,7 @@ def __init__(
self.interval_width = interval_width
self.model_kwargs = model_kwargs
self._model_type = self._get_model_type(model)
super().__init__(in_column=in_column)
super().__init__(in_column=in_column, ignore_flag_column=ignore_flag_column)

@staticmethod
def _get_model_type(
Expand Down
Loading
Loading