WorldCereal · gabrieltseng · Feb 12, 2024 · Feb 12, 2024 · Feb 12, 2024 · Feb 12, 2024
diff --git a/config/default.json b/config/default.json
@@ -7,5 +7,7 @@
     "encoder_num_heads": 8,
     "decoder_embedding_size": 128,
     "decoder_depth": 2,
-    "decoder_num_heads": 8
+    "decoder_num_heads": 8,
+    "valid_month_as_token": false,
+    "valid_month_size": 32
 }
diff --git a/presto/dataset.py b/presto/dataset.py
@@ -68,13 +68,14 @@ def target_crop(row_d: Dict) -> int:
     @classmethod
     def row_to_arrays(
         cls, row: pd.Series, target_function: Callable[[Dict], int]
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, float, int]:
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, float, int, int]:
         # https://stackoverflow.com/questions/45783891/is-there-a-way-to-speed-up-the-pandas-getitem-getitem-axis-and-get-label
         # This is faster than indexing the series every time!
         row_d = pd.Series.to_dict(row)
 
         latlon = np.array([row_d["lat"], row_d["lon"]], dtype=np.float32)
         month = datetime.strptime(row_d["start_date"], "%Y-%m-%d").month - 1
+        valid_month = datetime.strptime(row_d["valid_date"], "%Y-%m-%d").month - 1
 
         eo_data = np.zeros((cls.NUM_TIMESTEPS, len(BANDS)))
         # an assumption we make here is that all timesteps for a token
@@ -109,6 +110,7 @@ def row_to_arrays(
             mask.astype(bool),
             latlon,
             month,
+            valid_month,
             target_function(row_d),
         )
 
@@ -154,7 +156,7 @@ def __init__(self, dataframe: pd.DataFrame, mask_params: MaskParamsNoDw):
     def __getitem__(self, idx):
         # Get the sample
         row = self.df.iloc[idx, :]
-        eo, real_mask_per_token, latlon, month, _ = self.row_to_arrays(row, self.target_crop)
+        eo, real_mask_per_token, latlon, month, _, _ = self.row_to_arrays(row, self.target_crop)
         mask_eo, x_eo, y_eo, strat = self.mask_params.mask_data(
             self.normalize_and_mask(eo), real_mask_per_token
         )
@@ -188,6 +190,10 @@ def target_maize(row_d) -> int:
     return int(row_d["CROPTYPE_LABEL"] == 1200)
 
 
+def target_croptype(row_d) -> int:  # target function: the row's CROPTYPE_LABEL cast to int
+    return int(row_d["CROPTYPE_LABEL"])
+
+
 class WorldCerealLabelledDataset(WorldCerealBase):
     # 0: no information, 10: could be both annual or perennial
     FILTER_LABELS = [0, 10]
@@ -246,14 +252,17 @@ def __getitem__(self, idx):
         # Get the sample
         df_index = self.indices[idx]
         row = self.df.iloc[df_index, :]
-        eo, mask_per_token, latlon, month, target = self.row_to_arrays(row, self.target_function)
+        eo, mask_per_token, latlon, month, valid_month, target = self.row_to_arrays(
+            row, self.target_function
+        )
         mask_per_variable = np.repeat(mask_per_token, BAND_EXPANSION, axis=1)
         return (
             self.normalize_and_mask(eo),
             target,
             np.ones(self.NUM_TIMESTEPS) * (DynamicWorld2020_2021.class_amount),
             latlon,
             month,
+            valid_month,
             mask_per_variable,
         )