Commit bd35ed2
fix quality
Signed-off-by: Wesley M. Gifford <[email protected]>
wgifford committed Mar 11, 2024
1 parent e6fe588 commit bd35ed2
Showing 11 changed files with 74 additions and 227 deletions.
3 changes: 1 addition & 2 deletions tests/toolkit/conftest.py
@@ -16,8 +16,7 @@ def ts_data():
         {
             "id": nreps(["A", "B", "C"], 50),
             "id2": nreps(["XX", "YY", "ZZ"], 50),
-            "timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)]
-            * 3,
+            "timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)] * 3,
             "value1": range(150),
             "value2": np.arange(150) / 3 + 10,
         }
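For reference, a runnable sketch of what this fixture produces; the `nreps` helper is assumed to repeat each list element n times consecutively (an assumption inferred from the 150-row shape, not code shown in this diff):

from datetime import datetime, timedelta

import numpy as np
import pandas as pd


def nreps(values, n):
    # hypothetical reconstruction: repeat each element n times, in order
    return [v for v in values for _ in range(n)]


ts_data = pd.DataFrame(
    {
        "id": nreps(["A", "B", "C"], 50),
        "id2": nreps(["XX", "YY", "ZZ"], 50),
        "timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)] * 3,
        "value1": range(150),
        "value2": np.arange(150) / 3 + 10,
    }
)
assert ts_data.shape == (150, 5)  # three ids, 50 daily rows each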
18 changes: 6 additions & 12 deletions tests/toolkit/test_dataset.py
@@ -40,8 +40,7 @@ def ts_data_with_categorical():
     return pd.DataFrame(
         {
             "id": nreps(["A", "B", "C"], 50),
-            "timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)]
-            * 3,
+            "timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)] * 3,
             "value1": range(150),
             "value2": np.arange(150) / 3 + 10,
             "value3": np.arange(150) / 50 - 6,
@@ -74,9 +73,7 @@ def test_ts_padding(ts_data):
 
     # test date handled
     # integer
-    assert df_padded.iloc[0]["time_int"] == df.iloc[0]["time_int"] - (
-        context_length - df.shape[0]
-    )
+    assert df_padded.iloc[0]["time_int"] == df.iloc[0]["time_int"] - (context_length - df.shape[0])
 
     # date
     df_padded = ts_padding(
@@ -86,9 +83,9 @@
         context_length=context_length,
     )
 
-    assert df_padded.iloc[0]["time_date"] == df.iloc[0]["time_date"] - (
-        context_length - df.shape[0]
-    ) * timedelta(days=1)
+    assert df_padded.iloc[0]["time_date"] == df.iloc[0]["time_date"] - (context_length - df.shape[0]) * timedelta(
+        days=1
+    )
 
 
 def test_pretrain_df_dataset(ts_data):
@@ -106,7 +103,6 @@ def test_pretrain_df_dataset(ts_data):
 
 
 def test_forecasting_df_dataset(ts_data_with_categorical):
-
     prediction_length = 2
     static_categorical_columns = ["color", "material"]
     target_columns = ["value1"]
@@ -141,9 +137,7 @@ def test_forecasting_df_dataset(ts_data_with_categorical):
 
     # check that we produce outputs for static categorical
     assert "static_categorical_values" in ds[0]
-    assert ds[0]["static_categorical_values"].shape == (
-        len(static_categorical_columns),
-    )
+    assert ds[0]["static_categorical_values"].shape == (len(static_categorical_columns),)
 
     # check that frequency token is present
     assert "freq_token" in ds[0]
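The assertions above imply the following usage pattern for ForecastDFDataset; the constructor keywords below mirror the test's variable names and are assumptions, not the verified signature:

# hedged sketch; keyword names inferred from the test body
from tsfm_public.toolkit.dataset import ForecastDFDataset

ds = ForecastDFDataset(
    ts_data_with_categorical,  # fixture defined at the top of this file
    timestamp_column="timestamp",
    id_columns=["id"],
    target_columns=["value1"],
    static_categorical_columns=["color", "material"],
    prediction_length=2,
)
sample = ds[0]
# each sample exposes the encoded static categoricals and a frequency token
assert sample["static_categorical_values"].shape == (2,)
assert "freq_token" in sample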
9 changes: 2 additions & 7 deletions tests/toolkit/test_time_series_forecasting_pipeline.py
@@ -29,9 +29,7 @@ def test_forecasting_pipeline_forecasts():
         freq="1h",
     )
 
-    dataset_path = (
-        "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh2.csv"
-    )
+    dataset_path = "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh2.csv"
     test_end_index = 12 * 30 * 24 + 8 * 30 * 24
     test_start_index = test_end_index - context_length
 
@@ -67,10 +65,7 @@ def test_forecasting_pipeline_forecasts():
     assert forecasts_no_future.shape == (1, 2 * len(target_columns) + 1)
 
     # check forecasts match
-    assert (
-        forecasts_no_future.iloc[0]["OT_prediction"]
-        == forecasts.iloc[0]["OT_prediction"]
-    )
+    assert forecasts_no_future.iloc[0]["OT_prediction"] == forecasts.iloc[0]["OT_prediction"]
 
     # test that forecasts are properly exploded
     forecast_pipeline = TimeSeriesForecastingPipeline(
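A minimal sketch of the pipeline call this test exercises on the ETTh2 data; the `model` variable and the exact keyword names are assumptions based on the test body, not a verified API:

import pandas as pd

from tsfm_public.toolkit.time_series_forecasting_pipeline import TimeSeriesForecastingPipeline

dataset_path = "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh2.csv"
data = pd.read_csv(dataset_path, parse_dates=["date"])

# `model` is a pretrained forecasting model loaded elsewhere (assumption)
forecast_pipeline = TimeSeriesForecastingPipeline(
    model,
    timestamp_column="date",
    id_columns=[],
    target_columns=["OT"],
    freq="1h",
)
forecasts = forecast_pipeline(data)  # columns like "OT_prediction", per the assertions above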
12 changes: 4 additions & 8 deletions tests/toolkit/test_time_series_preprocessor.py
@@ -26,9 +26,9 @@ def test_standard_scaler(sample_data):
     # check shape preserved
     result = scaler.fit_transform(sample_data[columns])
     assert result.shape == sample_data[columns].shape
-    expected = (
-        sample_data[columns].values - np.mean(sample_data[columns].values, axis=0)
-    ) / np.std(sample_data[columns].values, axis=0)
+    expected = (sample_data[columns].values - np.mean(sample_data[columns].values, axis=0)) / np.std(
+        sample_data[columns].values, axis=0
+    )
     np.testing.assert_allclose(result, expected)
 
     # check serialization
@@ -69,7 +69,6 @@ def test_ordinal_encoder(sample_data):
 
 
 def test_time_series_preprocessor_encodes(sample_data):
-
     static_categorical_columns = ["cat", "cat2"]
 
     tsp = TimeSeriesPreprocessor(
@@ -85,11 +84,8 @@
 
 
 def test_augment_time_series(ts_data):
-
     periods = 5
-    a = extend_time_series(
-        ts_data, timestamp_column="timestamp", grouping_columns=["id"], periods=periods
-    )
+    a = extend_time_series(ts_data, timestamp_column="timestamp", grouping_columns=["id"], periods=periods)
 
     # check that length increases by periods for each id
     assert a.shape[0] == ts_data.shape[0] + 3 * periods
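The final assertion pins down the semantics of extend_time_series: each of the three ids in the fixture gains `periods` future rows. A hedged sketch (the import path is an assumption):

from tsfm_public.toolkit.time_series_preprocessor import extend_time_series

periods = 5
extended = extend_time_series(
    ts_data,
    timestamp_column="timestamp",
    grouping_columns=["id"],
    periods=periods,
)
# 3 ids in the fixture, each extended by `periods` rows
assert extended.shape[0] == ts_data.shape[0] + 3 * periods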
54 changes: 13 additions & 41 deletions tsfm_public/toolkit/dataset.py
@@ -50,22 +50,16 @@ def __init__(
             y_cols = [y_cols]
 
         if len(x_cols) > 0:
-            assert is_cols_in_df(
-                data_df, x_cols
-            ), f"one or more {x_cols} is not in the list of data_df columns"
+            assert is_cols_in_df(data_df, x_cols), f"one or more {x_cols} is not in the list of data_df columns"
 
         if len(y_cols) > 0:
-            assert is_cols_in_df(
-                data_df, y_cols
-            ), f"one or more {y_cols} is not in the list of data_df columns"
+            assert is_cols_in_df(data_df, y_cols), f"one or more {y_cols} is not in the list of data_df columns"
 
         if timestamp_column:
             assert timestamp_column in list(
                 data_df.columns
             ), f"{timestamp_column} is not in the list of data_df columns"
-            assert (
-                timestamp_column not in x_cols
-            ), f"{timestamp_column} should not be in the list of x_cols"
+            assert timestamp_column not in x_cols, f"{timestamp_column} should not be in the list of x_cols"
 
         self.data_df = data_df
         self.datetime_col = timestamp_column
@@ -162,9 +156,7 @@ def __init__(
         **kwargs,
     ):
         if len(id_columns) > 0:
-            assert is_cols_in_df(
-                data_df, id_columns
-            ), f"{id_columns} is not in the data_df columns"
+            assert is_cols_in_df(data_df, id_columns), f"{id_columns} is not in the data_df columns"
 
         self.timestamp_column = timestamp_column
         self.id_columns = id_columns
@@ -424,9 +416,7 @@ def __init__(
         )
 
         # masking for conditional values which are not observed during future period
-        self.y_mask_conditional = np.array(
-            [(c in conditional_columns) for c in y_cols]
-        )
+        self.y_mask_conditional = np.array([(c in conditional_columns) for c in y_cols])
 
         # create a mask of x which masks targets
         self.x_mask_targets = np.array([(c in target_columns) for c in x_cols])
@@ -451,10 +441,7 @@ def __getitem__(self, time_id):
 
         # seq_y: batch_size x pred_len x num_x_cols
         seq_y = self.y[
-            time_id
-            + self.context_length : time_id
-            + self.context_length
-            + self.prediction_length
+            time_id + self.context_length : time_id + self.context_length + self.prediction_length
         ].values
 
         seq_y[:, self.y_mask_conditional] = 0
@@ -473,9 +460,7 @@ def __getitem__(self, time_id):
             ret["freq_token"] = torch.tensor(self.frequency_token, dtype=torch.int)
 
         if self.static_categorical_columns:
-            categorical_values = self.data_df[
-                self.static_categorical_columns
-            ].values[0, :]
+            categorical_values = self.data_df[self.static_categorical_columns].values[0, :]
             ret["static_categorical_values"] = np_to_torch(categorical_values)
 
         return ret
@@ -543,7 +528,6 @@ def __init__(
         input_columns: List[str] = [],
         static_categorical_columns: List[str] = [],
     ):
-
        self.target_columns = target_columns
        self.input_columns = input_columns
        self.static_categorical_columns = static_categorical_columns
@@ -566,9 +550,7 @@ def __init__(
     def __getitem__(self, time_id):
         # seq_x: batch_size x seq_len x num_x_cols
         seq_x = self.X[time_id : time_id + self.context_length].values
-        seq_y = self.y[
-            time_id + self.context_length - 1 : time_id + self.context_length
-        ].values.ravel()
+        seq_y = self.y[time_id + self.context_length - 1 : time_id + self.context_length].values.ravel()
         # return _torch(seq_x, seq_y)
 
         ret = {
@@ -582,9 +564,7 @@ def __getitem__(self, time_id):
             ret["id"] = self.group_id
 
         if self.static_categorical_columns:
-            categorical_values = self.data_df[
-                self.static_categorical_columns
-            ].values[0, :]
+            categorical_values = self.data_df[self.static_categorical_columns].values[0, :]
             ret["static_categorical_values"] = np_to_torch(categorical_values)
 
         return ret
@@ -661,21 +641,15 @@ def ts_padding(
         pad_df[c] = pad_df[c].astype(df.dtypes[c], copy=False)
 
     if timestamp_column:
-        if (df[timestamp_column].dtype.type == np.datetime64) or (
-            df[timestamp_column].dtype == int
-        ):
+        if (df[timestamp_column].dtype.type == np.datetime64) or (df[timestamp_column].dtype == int):
             last_timestamp = df.iloc[0][timestamp_column]
             period = df.iloc[1][timestamp_column] - df.iloc[0][timestamp_column]
-            prepended_timestamps = [
-                last_timestamp + offset * period for offset in range(-fill_length, 0)
-            ]
+            prepended_timestamps = [last_timestamp + offset * period for offset in range(-fill_length, 0)]
             pad_df[timestamp_column] = prepended_timestamps
         else:
             pad_df[timestamp_column] = None
         # Ensure same type
-        pad_df[timestamp_column] = pad_df[timestamp_column].astype(
-            df[timestamp_column].dtype
-        )
+        pad_df[timestamp_column] = pad_df[timestamp_column].astype(df[timestamp_column].dtype)
 
     if id_columns:
         id_values = df.iloc[0][id_columns].to_list()
@@ -716,6 +690,4 @@ def is_cols_in_df(df: pd.DataFrame, cols: List[str]) -> bool:
     d6 = PretrainDFDataset(data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2)
     print(f"d6: {d6}")
 
-    d7 = ForecastDFDataset(
-        data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2, pred_len=2
-    )
+    d7 = ForecastDFDataset(data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2, pred_len=2)
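Taken together, the ts_padding hunks describe left-padding a short series up to context_length, extrapolating timestamps backwards from the first observed period. A hedged illustration, assuming the call form used in the tests above suffices for a single series:

import pandas as pd

from tsfm_public.toolkit.dataset import ts_padding

df = pd.DataFrame(
    {
        "time_date": pd.date_range("2021-01-01", periods=5, freq="D"),
        "value1": range(5),
    }
)
padded = ts_padding(df, timestamp_column="time_date", context_length=10)
# 5 synthetic rows are prepended; their timestamps run backwards at the observed 1-day period
assert len(padded) == 10
assert padded.iloc[0]["time_date"] == df.iloc[0]["time_date"] - 5 * pd.Timedelta(days=1)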
36 changes: 8 additions & 28 deletions tsfm_public/toolkit/time_series_forecasting_pipeline.py
@@ -32,9 +32,7 @@
 
 
 @add_end_docstrings(
-    build_pipeline_init_args(
-        has_tokenizer=False, has_feature_extractor=True, has_image_processor=False
-    )
+    build_pipeline_init_args(has_tokenizer=False, has_feature_extractor=True, has_image_processor=False)
 )
 class TimeSeriesForecastingPipeline(Pipeline):
     """Hugging Face Pipeline for Time Series Forecasting"""
@@ -64,9 +62,7 @@ def _sanitize_parameters(self, **kwargs):
         """
 
         context_length = kwargs.get("context_length", self.model.config.context_length)
-        prediction_length = kwargs.get(
-            "prediction_length", self.model.config.prediction_length
-        )
+        prediction_length = kwargs.get("prediction_length", self.model.config.prediction_length)
 
         preprocess_kwargs = {
             "prediction_length": prediction_length,
@@ -174,9 +170,7 @@ def __call__(
 
         return super().__call__(time_series, **kwargs)
 
-    def preprocess(
-        self, time_series, **kwargs
-    ) -> Dict[str, Union[GenericTensor, List[Any]]]:
+    def preprocess(self, time_series, **kwargs) -> Dict[str, Union[GenericTensor, List[Any]]]:
         """Preprocess step
         Load the data, if not already loaded, and then generate a pytorch dataset.
         """
@@ -204,16 +198,12 @@ def preprocess(
                 # do we need to check the timestamp column?
                 pass
             else:
-                raise ValueError(
-                    f"`future_time_series` of type {type(future_time_series)} is not supported."
-                )
+                raise ValueError(f"`future_time_series` of type {type(future_time_series)} is not supported.")
 
             # stack the time series
             for c in future_time_series.columns:
                 if c not in time_series.columns:
-                    raise ValueError(
-                        f"Future time series input contains an unknown column {c}."
-                    )
+                    raise ValueError(f"Future time series input contains an unknown column {c}.")
 
             time_series = pd.concat((time_series, future_time_series), axis=0)
         else:
@@ -274,11 +264,7 @@ def _forward(self, model_inputs, **kwargs):
 
         # copy the other inputs
         copy_inputs = True
-        for k in [
-            akey
-            for akey in model_inputs.keys()
-            if (akey not in model_input_keys) or copy_inputs
-        ]:
+        for k in [akey for akey in model_inputs.keys() if (akey not in model_input_keys) or copy_inputs]:
             model_outputs[k] = model_inputs[k]
 
         return model_outputs
@@ -290,20 +276,14 @@ def postprocess(self, input, **kwargs):
         """
         out = {}
 
-        model_output_key = (
-            "prediction_outputs"
-            if "prediction_outputs" in input.keys()
-            else "prediction_logits"
-        )
+        model_output_key = "prediction_outputs" if "prediction_outputs" in input.keys() else "prediction_logits"
 
         # name the predictions of target columns
         # outputs should only have size equal to target columns
         prediction_columns = []
         for i, c in enumerate(kwargs["target_columns"]):
             prediction_columns.append(f"{c}_prediction")
-            out[prediction_columns[-1]] = (
-                input[model_output_key][:, :, i].numpy().tolist()
-            )
+            out[prediction_columns[-1]] = input[model_output_key][:, :, i].numpy().tolist()
         # provide the ground truth values for the targets
         # when future is unknown, we will have augmented the provided dataframe with NaN values to cover the future
         for i, c in enumerate(kwargs["target_columns"]):
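The postprocess hunks establish the output naming convention: each target column c yields a list-valued "{c}_prediction" column. A self-contained sketch of that step (the helper name is hypothetical):

import torch


def name_predictions(model_output: torch.Tensor, target_columns):
    # hypothetical helper mirroring the loop in postprocess:
    # model_output has shape (batch, prediction_length, num_targets)
    out = {}
    for i, c in enumerate(target_columns):
        out[f"{c}_prediction"] = model_output[:, :, i].numpy().tolist()
    return out


preds = name_predictions(torch.zeros(1, 96, 1), ["OT"])
assert list(preds) == ["OT_prediction"]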