Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Teach OutliersTransform to ignore holidays #291

Merged
merged 32 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7c33aed
add feature
Polzovat123 Mar 28, 2024
8faf807
issue 9043
Polzovat123 Mar 29, 2024
94561b8
fix lint error
Polzovat123 Apr 2, 2024
371aef1
clean code
Polzovat123 Apr 3, 2024
9d62d81
more informative docs
Polzovat123 Apr 3, 2024
e29fc47
add tests
Polzovat123 Apr 3, 2024
cfb7cbd
write CHANGELOG.md
Polzovat123 Apr 3, 2024
ec2d517
clear code
Polzovat123 Apr 3, 2024
3e745df
fix bug
Polzovat123 Apr 3, 2024
6840c75
fix style
Polzovat123 Apr 3, 2024
b28cbc1
fix
Polzovat123 Apr 3, 2024
15f5736
fix CHANGELOG.md
Polzovat123 Apr 4, 2024
b2a0f62
fix use level name instead of index to improve readability
Polzovat123 Apr 4, 2024
f40e0a9
more descriptive name
Polzovat123 Apr 4, 2024
68ffccc
use isna
Polzovat123 Apr 4, 2024
0a2d78b
separate test for error
Polzovat123 Apr 4, 2024
d65f202
clear duplication
Polzovat123 Apr 4, 2024
68a5deb
clear after lint
Polzovat123 Apr 4, 2024
28d7221
clear add test pipeline
Polzovat123 Apr 4, 2024
f73f9bf
fix typo mistake
Polzovat123 Apr 5, 2024
d24e3fa
add link in CHANGELOG.md
Polzovat123 Apr 5, 2024
c9738ab
add link in CHANGELOG.md
Polzovat123 Apr 5, 2024
4eb344a
remove duplicate code
Polzovat123 Apr 5, 2024
51b5044
rewrite pipeline test, that shows real usage example
Polzovat123 Apr 5, 2024
211b681
all -> any
Polzovat123 Apr 5, 2024
8927a7a
test behaviour if ignore_column is regressor/non-regressor
Polzovat123 Apr 5, 2024
9445f50
clear code
Polzovat123 Apr 5, 2024
3b5f167
fix pull, not issue
Polzovat123 Apr 5, 2024
4a00daf
remove useless test
Polzovat123 Apr 5, 2024
6c83b12
remove useless tune_params check
Polzovat123 Apr 5, 2024
d925f0a
Merge branch 'master' into issue-1850009043
Polzovat123 Apr 8, 2024
95e5129
Merge branch 'master' into issue-1850009043
Polzovat123 Apr 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
-
-
- Fix `PredictionIntervalOutliersTransform` fails to work with created columns
- Fix `PredictionIntervalOutliersTransform` fails to work with created columns ([#293](https://github.com/etna-team/etna/issues/293))
brsnw250 marked this conversation as resolved.
Show resolved Hide resolved
- Prohibit empty list value and duplication of `target_timestamps` parameter in `FoldMask` ([#226](https://github.com/etna-team/etna/pull/226))
-
-
Expand Down
131 changes: 73 additions & 58 deletions tests/test_transforms/test_outliers/test_outliers_transform.py
brsnw250 marked this conversation as resolved.
Show resolved Hide resolved
brsnw250 marked this conversation as resolved.
Show resolved Hide resolved
brsnw250 marked this conversation as resolved.
Show resolved Hide resolved
brsnw250 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,35 @@
from tests.utils import select_segments_subset


def insert_column(ts, info_col, timestamp, segment):
    """Attach an ``is_holiday`` column for a single segment to ``ts``.

    A long-format frame with the columns ``is_holiday``, ``timestamp`` and
    ``segment`` is built from the arguments, converted with
    ``TSDataset.to_dataset`` and handed to ``ts.add_columns_from_pandas``.

    Parameters
    ----------
    ts:
        Dataset object exposing ``add_columns_from_pandas``.
    info_col:
        Values to store in the ``is_holiday`` column.
    timestamp:
        Timestamps matching ``info_col``.
    segment:
        Segment name the values belong to.

    Returns
    -------
    :
        Whatever ``ts.add_columns_from_pandas`` returns.
    """
    long_frame = pd.DataFrame(
        {
            "is_holiday": info_col,
            "timestamp": timestamp,
            "segment": segment,
        }
    )
    wide_frame = TSDataset.to_dataset(long_frame)
    return ts.add_columns_from_pandas(wide_frame)


def made_specific_ds(ts, add_error=True):
    """Add a per-segment ``is_holiday`` flag column to ``ts`` and return it.

    The flags cover the daily range 2021-01-01 .. 2021-02-20. For segment
    ``"1"`` a flag is 1 where ``sin(i) > 0.5``; for segment ``"2"`` it is 1
    where ``sin(i) > 0``; otherwise 0.

    Parameters
    ----------
    ts:
        Dataset passed to ``insert_column`` once per segment.
    add_error:
        When true, one value per segment is overwritten with a non-binary
        number (4 at position 9 for segment ``"1"``, 14 at position 10 for
        segment ``"2"``).

    Returns
    -------
    :
        The same ``ts`` object that was passed in.
    """
    timestamp = pd.date_range("2021-01-01", end="2021-02-20", freq="D")
    sines = [np.sin(i) for i in range(len(timestamp))]

    # Segment name -> binary flag list, kept in insertion order ("1" then "2").
    flags_by_segment = {
        "1": [1 if value > 0.5 else 0 for value in sines],
        "2": [1 if value > 0 else 0 for value in sines],
    }

    if add_error:
        # Deliberately plant values outside {0, 1} in the flag columns.
        flags_by_segment["1"][9] = 4
        flags_by_segment["2"][10] = 14

    for segment, flags in flags_by_segment.items():
        insert_column(ts, flags, timestamp, segment)

    return ts


@pytest.fixture()
def outliers_solid_tsds():
"""Create TSDataset with outliers and same last date."""
Expand Down Expand Up @@ -60,61 +89,15 @@ def outliers_solid_tsds_with_holidays(outliers_solid_tsds):


@pytest.fixture()
def outliers_solid_tsds_with_error():
def outliers_solid_tsds_with_error(outliers_solid_tsds):
"""Create TSDataset with outliers error inside ts, incorrect type column"""
timestamp = pd.date_range("2021-01-01", end="2021-02-20", freq="D")
target1 = [np.sin(i) for i in range(len(timestamp))]
target1[5] += 10
info_col1 = [1 if np.sin(i) > 0.5 else 0 for i in range(len(timestamp))]
info_col1[9] = 4

target2 = [np.sin(i) for i in range(len(timestamp))]
target2[8] += 8
target2[15] = 2
target2[26] -= 12
info_col2 = [1 if np.sin(i) > 0 else 0 for i in range(len(timestamp))]
info_col2[10] = 14

df1 = pd.DataFrame({"timestamp": timestamp, "target": target1, "segment": "1", "is_holiday": info_col1})
df2 = pd.DataFrame({"timestamp": timestamp, "target": target2, "segment": "2", "is_holiday": info_col2})
df = pd.concat([df1, df2], ignore_index=True)
df_exog = df.copy()
df_exog.columns = ["timestamp", "regressor_1", "segment", "is_holiday"]
ts = TSDataset(
df=TSDataset.to_dataset(df[["timestamp", "target", "segment"]]).iloc[:-10],
df_exog=TSDataset.to_dataset(df_exog),
freq="D",
known_future="all",
)
return ts
return made_specific_ds(outliers_solid_tsds, add_error=True)


@pytest.fixture()
def outliers_solid_tsds_for_pipeline():
"""Create TSDataset with outliers error inside ts, incorrect type column"""
timestamp = pd.date_range("2021-01-01", end="2021-02-20", freq="D")
target1 = [np.sin(i) for i in range(len(timestamp))]
target1[5] += 10
info_col1 = [1 if np.sin(i) > 0.5 else 0 for i in range(len(timestamp))]

target2 = [np.sin(i) for i in range(len(timestamp))]
target2[8] += 8
target2[15] = 2
target2[26] -= 12
info_col2 = [1 if np.sin(i) > 0 else 0 for i in range(len(timestamp))]

df1 = pd.DataFrame({"timestamp": timestamp, "target": target1, "segment": "1", "is_holiday": info_col1})
df2 = pd.DataFrame({"timestamp": timestamp, "target": target2, "segment": "2", "is_holiday": info_col2})
df = pd.concat([df1, df2], ignore_index=True)
df_exog = df.copy()
df_exog.columns = ["timestamp", "regressor_1", "segment", "is_holiday"]
ts = TSDataset(
df=TSDataset.to_dataset(df[["timestamp", "target", "segment"]]).iloc[:-10],
df_exog=TSDataset.to_dataset(df_exog),
freq="D",
known_future="all",
)
return ts
def outliers_solid_tsds_non_regressor_holiday(outliers_solid_tsds):
"""Create TSDataset with outliers inside ts non regressor"""
return made_specific_ds(outliers_solid_tsds, add_error=False)


@pytest.mark.parametrize("attribute_name,value_type", (("outliers_timestamps", list), ("original_values", pd.Series)))
Expand Down Expand Up @@ -345,7 +328,7 @@ def test_correct_ignore_flag(transform, outliers_solid_tsds_with_holidays):
assert len(transform.params_to_tune()) > 0
transform.fit(ts)
ts_output = transform.transform(ts)
assert not all(ts_output["2021-01-06":"2021-01-06", "1", "target"].isna())
assert not any(ts_output["2021-01-06":"2021-01-06", "1", "target"].isna())


@pytest.mark.parametrize(
Expand Down Expand Up @@ -393,8 +376,8 @@ def test_incorrect_type_ignore_flag(transform, outliers_solid_tsds_with_error):
),
],
)
def test_full_train_with_outliner(transform, expected_changes, outliers_solid_tsds_for_pipeline):
ts = outliers_solid_tsds_for_pipeline
def test_full_train_with_outliers(transform, expected_changes, outliers_solid_tsds_with_holidays):
ts = outliers_solid_tsds_with_holidays

train_ts = deepcopy(ts)
test_ts = deepcopy(ts)
Expand Down Expand Up @@ -426,7 +409,39 @@ def test_full_train_with_outliner(transform, expected_changes, outliers_solid_ts
(PredictionIntervalOutliersTransform(in_column="target", model="sarimax", ignore_flag_column="is_holiday")),
],
)
def test_full_pipeline(transform):
model = NaiveModel(lag=1)
pipeline = Pipeline(model, transforms=[transform], horizon=3)
pipeline.set_params(**{"model.lag": 3})
def test_full_pipeline(transform, outliers_solid_tsds):
ts = outliers_solid_tsds

holiday_transform = HolidayTransform(iso_code="RUS", mode="binary", out_column="is_holiday")
pipeline = Pipeline(NaiveModel(lag=1), transforms=[holiday_transform, transform], horizon=3)
pipeline.fit(ts)


@pytest.mark.parametrize(
"transform",
[
(MedianOutliersTransform(in_column="target", ignore_flag_column="is_holiday")),
(DensityOutliersTransform(in_column="target", ignore_flag_column="is_holiday")),
(PredictionIntervalOutliersTransform(in_column="target", model="sarimax", ignore_flag_column="is_holiday")),
],
)
def test_advance_usage_data_in_transform_regressor(transform, outliers_solid_tsds_with_holidays):
ts = outliers_solid_tsds_with_holidays
assert len(transform.params_to_tune()) > 0
brsnw250 marked this conversation as resolved.
Show resolved Hide resolved
transform.fit(ts)
_ = transform.transform(ts)


@pytest.mark.parametrize(
    "transform",
    [
        MedianOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        DensityOutliersTransform(in_column="target", ignore_flag_column="is_holiday"),
        PredictionIntervalOutliersTransform(in_column="target", model="sarimax", ignore_flag_column="is_holiday"),
    ],
)
def test_advance_usage_data_in_transform_nonregressor(transform, outliers_solid_tsds_non_regressor_holiday):
    """Check that each outliers transform fits and transforms the non-regressor holiday dataset."""
    dataset = outliers_solid_tsds_non_regressor_holiday

    tunable_params = transform.params_to_tune()
    assert len(tunable_params) > 0

    # Smoke check only: fit and transform must complete; the output is not inspected.
    transform.fit(dataset)
    transform.transform(dataset)
Loading