Skip to content

Commit

Permalink
handle scaling of lists, update tests
Browse files Browse the repository at this point in the history
Signed-off-by: Wesley M. Gifford <[email protected]>
  • Loading branch information
wgifford committed Apr 3, 2024
1 parent 2c147ec commit f10005c
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 134 deletions.
6 changes: 2 additions & 4 deletions tests/toolkit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ def ts_data():
{
"id": nreps(["A", "B", "C"], 50),
"id2": nreps(["XX", "YY", "ZZ"], 50),
"timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)]
* 3,
"timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)] * 3,
"value1": range(150),
"value2": np.arange(150) ** 2 / 3 + 10,
}
Expand All @@ -44,8 +43,7 @@ def ts_data_runs():
{
"run_id": nreps(["1", "2", "3", "4"], 50),
"asset_id": nreps(["foo", "bar", "foo", "bar"], 50),
"timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)]
* 4,
"timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)] * 4,
"value1": range(200),
}
)
Expand Down
50 changes: 16 additions & 34 deletions tests/toolkit/test_time_series_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ def test_standard_scaler(sample_data):
# check shape preserved
result = scaler.fit_transform(sample_data[columns])
assert result.shape == sample_data[columns].shape
expected = (
sample_data[columns].values - np.mean(sample_data[columns].values, axis=0)
) / np.std(sample_data[columns].values, axis=0)
expected = (sample_data[columns].values - np.mean(sample_data[columns].values, axis=0)) / np.std(
sample_data[columns].values, axis=0
)
np.testing.assert_allclose(result, expected)

# check serialization
Expand Down Expand Up @@ -100,12 +100,8 @@ def test_time_series_preprocessor_scales(ts_data):

# check scaled result
out = tsp.preprocess(df)
assert np.allclose(
out.groupby(tsp.id_columns)[tsp.target_columns].apply(lambda x: np.mean(x)), 0.0
)
assert np.allclose(
out.groupby(tsp.id_columns)[tsp.target_columns].apply(lambda x: np.std(x)), 1.0
)
assert np.allclose(out.groupby(tsp.id_columns)[tsp.target_columns].apply(lambda x: np.mean(x)), 0.0)
assert np.allclose(out.groupby(tsp.id_columns)[tsp.target_columns].apply(lambda x: np.std(x)), 1.0)

# check inverse scale result
out_inv = tsp.inverse_scale_targets(out)
Expand All @@ -122,9 +118,7 @@ def test_time_series_preprocessor_scales(ts_data):

suffix = "_foo"
targets_suffix = [f"{c}{suffix}" for c in tsp.target_columns]
out.columns = [
f"{c}{suffix}" if c in tsp.target_columns else c for c in out.columns
]
out.columns = [f"{c}{suffix}" if c in tsp.target_columns else c for c in out.columns]
out_inv = tsp.inverse_scale_targets(out, suffix=suffix)
assert np.all(
out_inv.groupby(tsp.id_columns)[targets_suffix].apply(lambda x: np.mean(x))
Expand All @@ -150,19 +144,18 @@ def test_time_series_preprocessor_inv_scales_lists(ts_data):
out = tsp.preprocess(df)

# construct artificial result
out["value1"] = out["value1"].apply(lambda x: np.array([x, x]))
out["value2"] = out["value2"].apply(lambda x: np.array([x, x]))
out["value1"] = out["value1"].apply(lambda x: np.array([x] * 3))
out["value2"] = out["value2"].apply(lambda x: np.array([x] * 3))

out_inv = tsp.inverse_scale_targets(out)

assert out_inv["value1"].mean()[0] == df["value1"].mean()
assert out_inv["value2"].mean()[0] == df["value2"].mean()


def test_augment_time_series(ts_data):
periods = 5
a = extend_time_series(
ts_data, timestamp_column="timestamp", grouping_columns=["id"], periods=periods
)
a = extend_time_series(ts_data, timestamp_column="timestamp", grouping_columns=["id"], periods=periods)

# check that length increases by periods for each id
assert a.shape[0] == ts_data.shape[0] + 3 * periods
Expand Down Expand Up @@ -249,9 +242,7 @@ def test_get_datasets(ts_data):
)

# 3 time series of length 50
assert len(train) == 3 * (
int((1 / 3) * 50) - (tsp.context_length + tsp.prediction_length) + 1
)
assert len(train) == 3 * (int((1 / 3) * 50) - (tsp.context_length + tsp.prediction_length) + 1)

assert len(valid) == len(test)

Expand All @@ -276,10 +267,7 @@ def test_get_datasets(ts_data):

# new train length should be 20% of 100, minus the usual for context length and prediction length
fewshot_train_size = (
int((100 - tsp.context_length) * 0.2)
+ tsp.context_length
- (tsp.context_length + tsp.prediction_length)
+ 1
int((100 - tsp.context_length) * 0.2) + tsp.context_length - (tsp.context_length + tsp.prediction_length) + 1
)
assert len(train) == fewshot_train_size

Expand Down Expand Up @@ -326,15 +314,11 @@ def test_get_datasets(ts_data):
},
)

assert (
len(train) == int(150 * 0.7) - (tsp.context_length + tsp.prediction_length) + 1
)
assert len(train) == int(150 * 0.7) - (tsp.context_length + tsp.prediction_length) + 1

assert len(test) == int(150 * 0.2) - tsp.prediction_length + 1

assert (
len(valid) == 150 - int(150 * 0.2) - int(150 * 0.7) - tsp.prediction_length + 1
)
assert len(valid) == 150 - int(150 * 0.2) - int(150 * 0.7) - tsp.prediction_length + 1


def test_train_without_targets(ts_data):
Expand Down Expand Up @@ -397,9 +381,7 @@ def test_id_columns_and_scaling_id_columns(ts_data_runs):
scaling=True,
)

ds_train, ds_valid, ds_test = tsp.get_datasets(
df, split_config={"train": 0.7, "test": 0.2}
)
ds_train, ds_valid, ds_test = tsp.get_datasets(df, split_config={"train": 0.7, "test": 0.2})

assert len(tsp.target_scaler_dict) == 2
assert len(ds_train.datasets) == 4
Loading

0 comments on commit f10005c

Please sign in to comment.