Commit bd35ed2
fix quality
Signed-off-by: Wesley M. Gifford <[email protected]>
wgifford committed Mar 11, 2024
1 parent e6fe588 commit bd35ed2
Showing 11 changed files with 74 additions and 227 deletions.
3 changes: 1 addition & 2 deletions tests/toolkit/conftest.py
@@ -16,8 +16,7 @@ def ts_data():
         {
             "id": nreps(["A", "B", "C"], 50),
             "id2": nreps(["XX", "YY", "ZZ"], 50),
-            "timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)]
-            * 3,
+            "timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)] * 3,
             "value1": range(150),
             "value2": np.arange(150) / 3 + 10,
         }
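For reference, a runnable sketch of what this fixture produces; the `nreps` helper is assumed to repeat each list element n times consecutively (an assumption inferred from the 150-row shape, not code shown in this diff):

from datetime import datetime, timedelta

import numpy as np
import pandas as pd


def nreps(values, n):
    # hypothetical reconstruction: repeat each element n times, in order
    return [v for v in values for _ in range(n)]


ts_data = pd.DataFrame(
    {
        "id": nreps(["A", "B", "C"], 50),
        "id2": nreps(["XX", "YY", "ZZ"], 50),
        "timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)] * 3,
        "value1": range(150),
        "value2": np.arange(150) / 3 + 10,
    }
)
assert ts_data.shape == (150, 5)  # three ids, 50 daily rows each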
18 changes: 6 additions & 12 deletions tests/toolkit/test_dataset.py
@@ -40,8 +40,7 @@ def ts_data_with_categorical():
     return pd.DataFrame(
         {
             "id": nreps(["A", "B", "C"], 50),
-            "timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)]
-            * 3,
+            "timestamp": [datetime(2021, 1, 1) + timedelta(days=i) for i in range(50)] * 3,
             "value1": range(150),
             "value2": np.arange(150) / 3 + 10,
             "value3": np.arange(150) / 50 - 6,
@@ -74,9 +73,7 @@ def test_ts_padding(ts_data):
 
     # test date handled
     # integer
-    assert df_padded.iloc[0]["time_int"] == df.iloc[0]["time_int"] - (
-        context_length - df.shape[0]
-    )
+    assert df_padded.iloc[0]["time_int"] == df.iloc[0]["time_int"] - (context_length - df.shape[0])
 
     # date
     df_padded = ts_padding(
@@ -86,9 +83,9 @@
         context_length=context_length,
     )
 
-    assert df_padded.iloc[0]["time_date"] == df.iloc[0]["time_date"] - (
-        context_length - df.shape[0]
-    ) * timedelta(days=1)
+    assert df_padded.iloc[0]["time_date"] == df.iloc[0]["time_date"] - (context_length - df.shape[0]) * timedelta(
+        days=1
+    )
 
 
 def test_pretrain_df_dataset(ts_data):
@@ -106,7 +103,6 @@ def test_pretrain_df_dataset(ts_data):
 
 
 def test_forecasting_df_dataset(ts_data_with_categorical):
-
     prediction_length = 2
     static_categorical_columns = ["color", "material"]
     target_columns = ["value1"]
@@ -141,9 +137,7 @@ def test_forecasting_df_dataset(ts_data_with_categorical):
 
     # check that we produce outputs for static categorical
     assert "static_categorical_values" in ds[0]
-    assert ds[0]["static_categorical_values"].shape == (
-        len(static_categorical_columns),
-    )
+    assert ds[0]["static_categorical_values"].shape == (len(static_categorical_columns),)
 
     # check that frequency token is present
     assert "freq_token" in ds[0]
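The assertions above imply the following usage pattern for ForecastDFDataset; the constructor keywords below mirror the test's variable names and are assumptions, not the verified signature:

# hedged sketch; keyword names inferred from the test body
from tsfm_public.toolkit.dataset import ForecastDFDataset

ds = ForecastDFDataset(
    ts_data_with_categorical,  # fixture defined at the top of this file
    timestamp_column="timestamp",
    id_columns=["id"],
    target_columns=["value1"],
    static_categorical_columns=["color", "material"],
    prediction_length=2,
)
sample = ds[0]
# each sample exposes the encoded static categoricals and a frequency token
assert sample["static_categorical_values"].shape == (2,)
assert "freq_token" in sample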
9 changes: 2 additions & 7 deletions tests/toolkit/test_time_series_forecasting_pipeline.py
@@ -29,9 +29,7 @@ def test_forecasting_pipeline_forecasts():
         freq="1h",
     )
 
-    dataset_path = (
-        "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh2.csv"
-    )
+    dataset_path = "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh2.csv"
     test_end_index = 12 * 30 * 24 + 8 * 30 * 24
     test_start_index = test_end_index - context_length
 
@@ -67,10 +65,7 @@ def test_forecasting_pipeline_forecasts():
     assert forecasts_no_future.shape == (1, 2 * len(target_columns) + 1)
 
     # check forecasts match
-    assert (
-        forecasts_no_future.iloc[0]["OT_prediction"]
-        == forecasts.iloc[0]["OT_prediction"]
-    )
+    assert forecasts_no_future.iloc[0]["OT_prediction"] == forecasts.iloc[0]["OT_prediction"]
 
     # test that forecasts are properly exploded
     forecast_pipeline = TimeSeriesForecastingPipeline(
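A minimal sketch of the pipeline call this test exercises on the ETTh2 data; the `model` variable and the exact keyword names are assumptions based on the test body, not a verified API:

import pandas as pd

from tsfm_public.toolkit.time_series_forecasting_pipeline import TimeSeriesForecastingPipeline

dataset_path = "https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTh2.csv"
data = pd.read_csv(dataset_path, parse_dates=["date"])

# `model` is a pretrained forecasting model loaded elsewhere (assumption)
forecast_pipeline = TimeSeriesForecastingPipeline(
    model,
    timestamp_column="date",
    id_columns=[],
    target_columns=["OT"],
    freq="1h",
)
forecasts = forecast_pipeline(data)  # columns like "OT_prediction", per the assertions above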
12 changes: 4 additions & 8 deletions tests/toolkit/test_time_series_preprocessor.py
@@ -26,9 +26,9 @@ def test_standard_scaler(sample_data):
     # check shape preserved
     result = scaler.fit_transform(sample_data[columns])
     assert result.shape == sample_data[columns].shape
-    expected = (
-        sample_data[columns].values - np.mean(sample_data[columns].values, axis=0)
-    ) / np.std(sample_data[columns].values, axis=0)
+    expected = (sample_data[columns].values - np.mean(sample_data[columns].values, axis=0)) / np.std(
+        sample_data[columns].values, axis=0
+    )
     np.testing.assert_allclose(result, expected)
 
     # check serialization
@@ -69,7 +69,6 @@ def test_ordinal_encoder(sample_data):
 
 
 def test_time_series_preprocessor_encodes(sample_data):
-
     static_categorical_columns = ["cat", "cat2"]
 
     tsp = TimeSeriesPreprocessor(
@@ -85,11 +84,8 @@
 
 
 def test_augment_time_series(ts_data):
-
     periods = 5
-    a = extend_time_series(
-        ts_data, timestamp_column="timestamp", grouping_columns=["id"], periods=periods
-    )
+    a = extend_time_series(ts_data, timestamp_column="timestamp", grouping_columns=["id"], periods=periods)
 
     # check that length increases by periods for each id
     assert a.shape[0] == ts_data.shape[0] + 3 * periods
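The final assertion pins down the semantics of extend_time_series: each of the three ids in the fixture gains `periods` future rows. A hedged sketch (the import path is an assumption):

from tsfm_public.toolkit.time_series_preprocessor import extend_time_series

periods = 5
extended = extend_time_series(
    ts_data,
    timestamp_column="timestamp",
    grouping_columns=["id"],
    periods=periods,
)
# 3 ids in the fixture, each extended by `periods` rows
assert extended.shape[0] == ts_data.shape[0] + 3 * periods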
54 changes: 13 additions & 41 deletions tsfm_public/toolkit/dataset.py
@@ -50,22 +50,16 @@ def __init__(
             y_cols = [y_cols]
 
         if len(x_cols) > 0:
-            assert is_cols_in_df(
-                data_df, x_cols
-            ), f"one or more {x_cols} is not in the list of data_df columns"
+            assert is_cols_in_df(data_df, x_cols), f"one or more {x_cols} is not in the list of data_df columns"
 
         if len(y_cols) > 0:
-            assert is_cols_in_df(
-                data_df, y_cols
-            ), f"one or more {y_cols} is not in the list of data_df columns"
+            assert is_cols_in_df(data_df, y_cols), f"one or more {y_cols} is not in the list of data_df columns"
 
         if timestamp_column:
             assert timestamp_column in list(
                 data_df.columns
             ), f"{timestamp_column} is not in the list of data_df columns"
-            assert (
-                timestamp_column not in x_cols
-            ), f"{timestamp_column} should not be in the list of x_cols"
+            assert timestamp_column not in x_cols, f"{timestamp_column} should not be in the list of x_cols"
 
         self.data_df = data_df
         self.datetime_col = timestamp_column
@@ -162,9 +156,7 @@ def __init__(
         **kwargs,
     ):
         if len(id_columns) > 0:
-            assert is_cols_in_df(
-                data_df, id_columns
-            ), f"{id_columns} is not in the data_df columns"
+            assert is_cols_in_df(data_df, id_columns), f"{id_columns} is not in the data_df columns"
 
         self.timestamp_column = timestamp_column
         self.id_columns = id_columns
@@ -424,9 +416,7 @@ def __init__(
         )
 
         # masking for conditional values which are not observed during future period
-        self.y_mask_conditional = np.array(
-            [(c in conditional_columns) for c in y_cols]
-        )
+        self.y_mask_conditional = np.array([(c in conditional_columns) for c in y_cols])
 
         # create a mask of x which masks targets
         self.x_mask_targets = np.array([(c in target_columns) for c in x_cols])
@@ -451,10 +441,7 @@ def __getitem__(self, time_id):
 
         # seq_y: batch_size x pred_len x num_x_cols
         seq_y = self.y[
-            time_id
-            + self.context_length : time_id
-            + self.context_length
-            + self.prediction_length
+            time_id + self.context_length : time_id + self.context_length + self.prediction_length
         ].values
 
         seq_y[:, self.y_mask_conditional] = 0
@@ -473,9 +460,7 @@ def __getitem__(self, time_id):
             ret["freq_token"] = torch.tensor(self.frequency_token, dtype=torch.int)
 
         if self.static_categorical_columns:
-            categorical_values = self.data_df[
-                self.static_categorical_columns
-            ].values[0, :]
+            categorical_values = self.data_df[self.static_categorical_columns].values[0, :]
             ret["static_categorical_values"] = np_to_torch(categorical_values)
 
         return ret
@@ -543,7 +528,6 @@ def __init__(
         input_columns: List[str] = [],
         static_categorical_columns: List[str] = [],
     ):
-
        self.target_columns = target_columns
        self.input_columns = input_columns
        self.static_categorical_columns = static_categorical_columns
@@ -566,9 +550,7 @@ def __init__(
     def __getitem__(self, time_id):
         # seq_x: batch_size x seq_len x num_x_cols
         seq_x = self.X[time_id : time_id + self.context_length].values
-        seq_y = self.y[
-            time_id + self.context_length - 1 : time_id + self.context_length
-        ].values.ravel()
+        seq_y = self.y[time_id + self.context_length - 1 : time_id + self.context_length].values.ravel()
         # return _torch(seq_x, seq_y)
 
         ret = {
@@ -582,9 +564,7 @@ def __getitem__(self, time_id):
             ret["id"] = self.group_id
 
         if self.static_categorical_columns:
-            categorical_values = self.data_df[
-                self.static_categorical_columns
-            ].values[0, :]
+            categorical_values = self.data_df[self.static_categorical_columns].values[0, :]
             ret["static_categorical_values"] = np_to_torch(categorical_values)
 
         return ret
@@ -661,21 +641,15 @@ def ts_padding(
         pad_df[c] = pad_df[c].astype(df.dtypes[c], copy=False)
 
     if timestamp_column:
-        if (df[timestamp_column].dtype.type == np.datetime64) or (
-            df[timestamp_column].dtype == int
-        ):
+        if (df[timestamp_column].dtype.type == np.datetime64) or (df[timestamp_column].dtype == int):
             last_timestamp = df.iloc[0][timestamp_column]
             period = df.iloc[1][timestamp_column] - df.iloc[0][timestamp_column]
-            prepended_timestamps = [
-                last_timestamp + offset * period for offset in range(-fill_length, 0)
-            ]
+            prepended_timestamps = [last_timestamp + offset * period for offset in range(-fill_length, 0)]
             pad_df[timestamp_column] = prepended_timestamps
         else:
             pad_df[timestamp_column] = None
         # Ensure same type
-        pad_df[timestamp_column] = pad_df[timestamp_column].astype(
-            df[timestamp_column].dtype
-        )
+        pad_df[timestamp_column] = pad_df[timestamp_column].astype(df[timestamp_column].dtype)
 
     if id_columns:
         id_values = df.iloc[0][id_columns].to_list()
@@ -716,6 +690,4 @@ def is_cols_in_df(df: pd.DataFrame, cols: List[str]) -> bool:
     d6 = PretrainDFDataset(data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2)
     print(f"d6: {d6}")
 
-    d7 = ForecastDFDataset(
-        data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2, pred_len=2
-    )
+    d7 = ForecastDFDataset(data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2, pred_len=2)
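Taken together, the ts_padding hunks describe left-padding a short series up to context_length, extrapolating timestamps backwards from the first observed period. A hedged illustration, assuming the call form used in the tests above suffices for a single series:

import pandas as pd

from tsfm_public.toolkit.dataset import ts_padding

df = pd.DataFrame(
    {
        "time_date": pd.date_range("2021-01-01", periods=5, freq="D"),
        "value1": range(5),
    }
)
padded = ts_padding(df, timestamp_column="time_date", context_length=10)
# 5 synthetic rows are prepended; their timestamps run backwards at the observed 1-day period
assert len(padded) == 10
assert padded.iloc[0]["time_date"] == df.iloc[0]["time_date"] - 5 * pd.Timedelta(days=1)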
36 changes: 8 additions & 28 deletions tsfm_public/toolkit/time_series_forecasting_pipeline.py
@@ -32,9 +32,7 @@
 
 
 @add_end_docstrings(
-    build_pipeline_init_args(
-        has_tokenizer=False, has_feature_extractor=True, has_image_processor=False
-    )
+    build_pipeline_init_args(has_tokenizer=False, has_feature_extractor=True, has_image_processor=False)
 )
 class TimeSeriesForecastingPipeline(Pipeline):
     """Hugging Face Pipeline for Time Series Forecasting"""
@@ -64,9 +62,7 @@ def _sanitize_parameters(self, **kwargs):
         """
 
         context_length = kwargs.get("context_length", self.model.config.context_length)
-        prediction_length = kwargs.get(
-            "prediction_length", self.model.config.prediction_length
-        )
+        prediction_length = kwargs.get("prediction_length", self.model.config.prediction_length)
 
         preprocess_kwargs = {
             "prediction_length": prediction_length,
@@ -174,9 +170,7 @@ def __call__(
 
         return super().__call__(time_series, **kwargs)
 
-    def preprocess(
-        self, time_series, **kwargs
-    ) -> Dict[str, Union[GenericTensor, List[Any]]]:
+    def preprocess(self, time_series, **kwargs) -> Dict[str, Union[GenericTensor, List[Any]]]:
         """Preprocess step
         Load the data, if not already loaded, and then generate a pytorch dataset.
         """
@@ -204,16 +198,12 @@ def preprocess(
                 # do we need to check the timestamp column?
                 pass
             else:
-                raise ValueError(
-                    f"`future_time_series` of type {type(future_time_series)} is not supported."
-                )
+                raise ValueError(f"`future_time_series` of type {type(future_time_series)} is not supported.")
 
             # stack the time series
             for c in future_time_series.columns:
                 if c not in time_series.columns:
-                    raise ValueError(
-                        f"Future time series input contains an unknown column {c}."
-                    )
+                    raise ValueError(f"Future time series input contains an unknown column {c}.")
 
             time_series = pd.concat((time_series, future_time_series), axis=0)
         else:
@@ -274,11 +264,7 @@ def _forward(self, model_inputs, **kwargs):
 
         # copy the other inputs
         copy_inputs = True
-        for k in [
-            akey
-            for akey in model_inputs.keys()
-            if (akey not in model_input_keys) or copy_inputs
-        ]:
+        for k in [akey for akey in model_inputs.keys() if (akey not in model_input_keys) or copy_inputs]:
             model_outputs[k] = model_inputs[k]
 
         return model_outputs
@@ -290,20 +276,14 @@ def postprocess(self, input, **kwargs):
         """
         out = {}
 
-        model_output_key = (
-            "prediction_outputs"
-            if "prediction_outputs" in input.keys()
-            else "prediction_logits"
-        )
+        model_output_key = "prediction_outputs" if "prediction_outputs" in input.keys() else "prediction_logits"
 
         # name the predictions of target columns
         # outputs should only have size equal to target columns
         prediction_columns = []
         for i, c in enumerate(kwargs["target_columns"]):
             prediction_columns.append(f"{c}_prediction")
-            out[prediction_columns[-1]] = (
-                input[model_output_key][:, :, i].numpy().tolist()
-            )
+            out[prediction_columns[-1]] = input[model_output_key][:, :, i].numpy().tolist()
         # provide the ground truth values for the targets
         # when future is unknown, we will have augmented the provided dataframe with NaN values to cover the future
         for i, c in enumerate(kwargs["target_columns"]):
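The postprocess hunks establish the output naming convention: each target column c yields a list-valued "{c}_prediction" column. A self-contained sketch of that step (the helper name is hypothetical):

import torch


def name_predictions(model_output: torch.Tensor, target_columns):
    # hypothetical helper mirroring the loop in postprocess:
    # model_output has shape (batch, prediction_length, num_targets)
    out = {}
    for i, c in enumerate(target_columns):
        out[f"{c}_prediction"] = model_output[:, :, i].numpy().tolist()
    return out


preds = name_predictions(torch.zeros(1, 96, 1), ["OT"])
assert list(preds) == ["OT_prediction"]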