IBM · Joao-L-S-Almeida · Aug 19, 2024 · Aug 19, 2024 · Aug 21, 2024 · Aug 21, 2024
diff --git a/terratorch/datasets/generic_pixel_wise_dataset.py b/terratorch/datasets/generic_pixel_wise_dataset.py
@@ -178,6 +178,22 @@ def _load_file(self, path, nan_replace: int | float | None = None) -> xr.DataArr
             data = data.fillna(nan_replace)
         return data
 
+    def _generate_bands_intervals(self, bands_intervals: list[int | str | HLSBands | tuple[int]] | None = None):  # flattens a band spec; never reads self
+        if bands_intervals is None:  # None is passed through unchanged
+            return None
+        bands = []
+        for element in bands_intervals:
+            # A tuple is treated as a (start, end) interval and expanded; any other element is appended unchanged.
+            if isinstance(element, tuple):
+                if len(element) != 2:  # noqa: PLR2004
+                    msg = "When defining an interval, a tuple of two integers should be passed, defining start and end indices inclusive"
+                    raise Exception(msg)
+                expanded_element = list(range(element[0], element[1]))  # NOTE(review): range() stops before element[1], yet the message above says "inclusive" - confirm which is intended
+                bands.extend(expanded_element)
+            else:
+                bands.append(element)
+        return bands
+
 
 class GenericNonGeoSegmentationDataset(GenericPixelWiseDataset):
     """GenericNonGeoSegmentationDataset"""

diff --git a/terratorch/datasets/utils.py b/terratorch/datasets/utils.py
@@ -110,7 +110,7 @@ def generate_bands_intervals(bands_intervals: list[int | str | HLSBands | tuple[
                     msg = "When defining an interval, a tuple of two integers should be passed,\
                         defining start and end indices inclusive"
                     raise Exception(msg)
-                expanded_element = list(range(element[0], element[1] + 1))
+                expanded_element = list(range(element[0], element[1]))
                 bands.extend(expanded_element)
             else:
                 bands.append(element)

diff --git a/terratorch/models/backbones/prithvi_mae.py b/terratorch/models/backbones/prithvi_mae.py
@@ -136,20 +136,35 @@ def __init__(
             input_size: tuple[int, int, int] = (1, 224, 224),
             patch_size: tuple[int, int, int] = (1, 16, 16),
             in_chans: int = 3,
+            tub_size: int = 1,
             embed_dim: int = 768,
+            band_patch_size: int = None,
             norm_layer: nn.Module | None = None,
             flatten: bool = True,
             bias: bool = True,
     ):
         super().__init__()
         self.input_size = input_size
         self.patch_size = patch_size
+        self.tub_size = tub_size
+        self.in_chans = in_chans
+        self.band_patch_size = band_patch_size
         self.grid_size = [s // p for s, p in zip(self.input_size, self.patch_size)]
         assert self.grid_size >= [1,1,1], "Patch size is bigger than input size."
         self.num_patches = self.grid_size[0] * self.grid_size[1] * self.grid_size[2]
         self.flatten = flatten
 
-        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
+        # When spectral patching is used, some adaptations are required
+        if self.band_patch_size:
+            kernel_size = (self.band_patch_size, self.patch_size[1], self.patch_size[2])
+            first_conv_dim = tub_size
+            self.dim_transposer = lambda x: x.transpose(2, 1)
+        else:
+            kernel_size = self.patch_size
+            first_conv_dim = in_chans
+            self.dim_transposer = lambda x: x
+
+        self.proj = nn.Conv3d(first_conv_dim, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=bias)
         self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
 
     def forward(self, x):
@@ -159,6 +174,9 @@ def forward(self, x):
             warnings.warn(f"Input {x.shape[-3:]} is not divisible by patch size {self.patch_size}."
                           f"The border will be ignored, add backbone_padding for pixel-wise tasks.")
 
+        # When spectral patching is used the tensor must be transposed in order
+        # to operate over the proper dimension. 
+        x = self.dim_transposer(x)
         x = self.proj(x)
         if self.flatten:
             x = x.flatten(2).transpose(1, 2)  # B,C,T,H,W -> B,C,L -> B,L,C
@@ -234,6 +252,7 @@ class PrithviViT(nn.Module):
     def __init__(self,
                  img_size: int | tuple[int, int] = 224,
                  patch_size: int | tuple[int, int, int] = (1, 16, 16),
+                 band_patch_size: int = None, 
                  num_frames: int = 1,
                  in_chans: int = 3,
                  embed_dim: int = 1024,
@@ -256,10 +275,20 @@ def __init__(self,
         if isinstance(patch_size, int):
             patch_size = (1, patch_size, patch_size)
 
+        self.band_patch_size = band_patch_size
+
+        # If spectral patching is being used, we need a way to evaluate the
+        # extra number of patches.
+        if self.band_patch_size:
+            self.eval_c_patches = lambda c: c // self.patch_embed.band_patch_size
+        else:
+            self.eval_c_patches = lambda c: 1
+
         # 3D patch embedding
         self.patch_embed = PatchEmbed(
             input_size=(num_frames,) + self.img_size,
             patch_size=patch_size,
+            band_patch_size=band_patch_size,
             in_chans=in_chans,
             embed_dim=embed_dim,
         )
@@ -336,7 +365,7 @@ def random_masking(self, sequence, mask_ratio, noise=None):
 
         return sequence_unmasked, mask, ids_restore
 
-    def interpolate_pos_encoding(self, x, t, w, h):
+    def interpolate_pos_encoding(self, x, t, c, w, h):
         """
         Adapted from:
         - transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding,
@@ -348,6 +377,7 @@ def interpolate_pos_encoding(self, x, t, w, h):
 
         class_pos_embed = self.pos_embed[:, :1]
         patch_pos_embed = self.pos_embed[:, 1:]
+        c_patches = self.eval_c_patches(c)
         t_patches = t // self.patch_embed.patch_size[0]
         w_patches = w // self.patch_embed.patch_size[1]
         h_patches = h // self.patch_embed.patch_size[2]
@@ -357,10 +387,11 @@ def interpolate_pos_encoding(self, x, t, w, h):
 
         patch_pos_embed = nn.functional.interpolate(
             patch_pos_embed,
-            size=(h_patches, w_patches),
+            size=(c_patches*h_patches, w_patches), # Accounting the extra patches produced by the spectral patching
             mode='bicubic',
             align_corners=True,
         )
+
         patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, self.embed_dim)
         return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
 
@@ -373,12 +404,12 @@ def forward(
         if len(x.shape) == 4 and self.patch_embed.input_size[0] == 1:
             # add time dim
             x = x.unsqueeze(2)
-        t, h, w = x.shape[-3:]
+        t, c, h, w = x.shape[-4:]
 
         # embed patches
         x = self.patch_embed(x)
 
-        pos_embed = self.interpolate_pos_encoding(x, t, h, w)
+        pos_embed = self.interpolate_pos_encoding(x, t, c, h, w)
         # add pos embed w/o cls token
         x = x + pos_embed[:, 1:, :]
 
@@ -414,12 +445,13 @@ def forward_features(
         if len(x.shape) == 4 and self.patch_embed.input_size[0] == 1:
             # add time dim
             x = x.unsqueeze(2)
-        t, h, w = x.shape[-3:]
+        c, t, h, w = x.shape[-4:]
 
         # embed patches
+
         x = self.patch_embed(x)
 
-        pos_embed = self.interpolate_pos_encoding(x, t, h, w)
+        pos_embed = self.interpolate_pos_encoding(x, t, c, h, w)
         # add pos embed w/o cls token
         x = x + pos_embed[:, 1:, :]
 
@@ -444,22 +476,27 @@ def forward_features(
 
         x = self.norm(x)
         out[-1] = x
+
         return out
 
     def prepare_features_for_image_model(self, features: list[torch.Tensor]) -> list[torch.Tensor]:
         out = []
         effective_time_dim = self.patch_embed.input_size[0] // self.patch_embed.patch_size[0]
+        c = self.eval_c_patches(self.patch_embed.in_chans)
+
         for x in features:
             x_no_token = x[:, 1:, :]
             number_of_tokens = x_no_token.shape[1]
-            tokens_per_timestep = number_of_tokens // effective_time_dim
+            tokens_per_timestep = number_of_tokens // effective_time_dim // c
             h = int(np.sqrt(tokens_per_timestep))
+
             encoded = rearrange(
                 x_no_token,
-                "batch (t h w) e -> batch (t e) h w",
+                "batch (t h w c) e -> batch (t e) (c h) w",
                 e=self.embed_dim,
                 t=effective_time_dim,
                 h=h,
+                c=c,
             )
             out.append(encoded)
         return out

diff --git a/terratorch/models/backbones/prithvi_vit.py b/terratorch/models/backbones/prithvi_vit.py
@@ -27,6 +27,14 @@
 PRITHVI_V2_MEAN = [1087.0, 1342.0, 1433.0, 2734.0, 1958.0, 1363.0]
 PRITHVI_V2_STD = [2248.0, 2179.0, 2178.0, 1850.0, 1242.0, 1049.0]
 
+# TODO: This helper is probably a workaround. For some reason the argument "model_bands"
+# is passed twice (explicitly and again inside **kwargs); the root cause still needs to be investigated.
+def _overwrite_with_kwargs(extra_kwargs, kwargs):
+    # For every key of extra_kwargs that also occurs in kwargs, the kwargs value replaces the extra_kwargs one.
+    for k in extra_kwargs.keys():
+        if k in kwargs.keys():
+            extra_kwargs[k] = kwargs.pop(k)  # pop() also removes the key from kwargs, so it stays in only one dict
+    return extra_kwargs, kwargs  # the same dict objects that were passed in, both mutated in place
 
 def _cfg(**kwargs):
     return {
@@ -255,7 +263,9 @@ def prithvi_eo_tiny(
     bands: list[HLSBands] | None = None,
     **kwargs,
 ) -> PrithviViT:
-
+    vars_updated, kwargs = _overwrite_with_kwargs({"pretrained": pretrained, "model_bands": bands}, kwargs)
+    pretrained = vars_updated["pretrained"]
+    bands = vars_updated["model_bands"]
     return _create_prithvi("prithvi_eo_tiny", pretrained=pretrained, model_bands=bands, **kwargs)
 
 
@@ -265,7 +275,9 @@ def prithvi_eo_v1_100(
     bands: list[HLSBands] | None = None,
     **kwargs,
 ) -> PrithviViT:
-
+    vars_updated, kwargs = _overwrite_with_kwargs({"pretrained": pretrained, "model_bands": bands}, kwargs)
+    pretrained = vars_updated["pretrained"]
+    bands = vars_updated["model_bands"]
     return _create_prithvi("prithvi_eo_v1_100", pretrained=pretrained, model_bands=bands, **kwargs)
 
 
@@ -275,7 +287,9 @@ def prithvi_eo_v2_300(
     bands: list[HLSBands] | None = None,
     **kwargs,
 ) -> PrithviViT:
-
+    vars_updated, kwargs = _overwrite_with_kwargs({"pretrained": pretrained, "model_bands": bands}, kwargs)
+    pretrained = vars_updated["pretrained"]
+    bands = vars_updated["model_bands"]
     return _create_prithvi("prithvi_eo_v2_300", pretrained=pretrained, model_bands=bands, **kwargs)
 
 
@@ -285,7 +299,9 @@ def prithvi_eo_v2_600(
     bands: list[HLSBands] | None = None,
     **kwargs,
 ) -> PrithviViT:
-
+    vars_updated, kwargs = _overwrite_with_kwargs({"pretrained": pretrained, "model_bands": bands}, kwargs)
+    pretrained = vars_updated["pretrained"]
+    bands = vars_updated["model_bands"]
     return _create_prithvi("prithvi_eo_v2_600", pretrained=pretrained, model_bands=bands, **kwargs)
 
 
@@ -295,7 +311,9 @@ def prithvi_eo_v2_300_tl(
     bands: list[HLSBands] | None = None,
     **kwargs,
 ) -> PrithviViT:
-
+    vars_updated, kwargs = _overwrite_with_kwargs({"pretrained": pretrained, "model_bands": bands}, kwargs)
+    pretrained = vars_updated["pretrained"]
+    bands = vars_updated["model_bands"]
     return _create_prithvi("prithvi_eo_v2_300_tl", pretrained=pretrained, model_bands=bands, **kwargs)
 
 
@@ -305,7 +323,9 @@ def prithvi_eo_v2_600_tl(
     bands: list[HLSBands] | None = None,
     **kwargs,
 ) -> PrithviViT:
-
+    vars_updated, kwargs = _overwrite_with_kwargs({"pretrained": pretrained, "model_bands": bands}, kwargs)
+    pretrained = vars_updated["pretrained"]
+    bands = vars_updated["model_bands"]
     return _create_prithvi("prithvi_eo_v2_600_tl", pretrained=pretrained, model_bands=bands, **kwargs)
 
 
@@ -319,7 +339,9 @@ def prithvi_vit_tiny(
 
     warnings.warn(f"The model prithvi_vit_tiny was renamed to prithvi_eo_tiny. "
                   f"prithvi_vit_tiny will be removed in a future version.", FutureWarning)
-
+    vars_updated, kwargs = _overwrite_with_kwargs({"pretrained": pretrained, "model_bands": model_bands}, kwargs)
+    pretrained = vars_updated["pretrained"]
+    bands = vars_updated["model_bands"]
     return prithvi_eo_tiny(pretrained=pretrained, model_bands=bands, **kwargs)
 
 

diff --git a/terratorch/models/decoders/mlp_decoder.py b/terratorch/models/decoders/mlp_decoder.py
@@ -5,27 +5,29 @@
 
 from torch import Tensor, nn
 import torch
+
 from terratorch.registry import TERRATORCH_DECODER_REGISTRY
 
 
 @TERRATORCH_DECODER_REGISTRY.register
+
 class MLPDecoder(nn.Module):
     """Identity decoder. Useful to pass the feature straight to the head."""
 
     def __init__(self, embed_dim: int, channels: int = 100, out_dim:int = 100, activation: str = "ReLU", out_index=-1) -> None:
         """Constructor
-
         Args:
             embed_dim (int): Input embedding dimension
             out_index (int, optional): Index of the input list to take.. Defaults to -1.
         """
-        
+
         super().__init__()
         self.embed_dim = embed_dim
         self.channels = channels
         self.dim = out_index
         self.n_inputs = len(self.embed_dim)
         self.out_channels = self.embed_dim[self.dim]
+        self.output_embed_dim = self.out_channels
         self.hidden_layer = torch.nn.Linear(self.out_channels*self.n_inputs, self.out_channels)
         self.activation = getattr(nn, activation)()
 

diff --git a/terratorch/models/prithvi_model_factory.py b/terratorch/models/prithvi_model_factory.py
@@ -21,8 +21,27 @@
 
 @MODEL_FACTORY_REGISTRY.register
 class PrithviModelFactory(ModelFactory):
+
+    @staticmethod
+    def _generate_bands_intervals(bands_intervals: list[int | str | HLSBands | tuple[int]] | None = None):  # flattens a band spec into a plain list
+        if bands_intervals is None:  # None is passed through unchanged
+            return None
+        bands = []
+        for element in bands_intervals:
+            # A list or tuple is treated as a (start, end) interval and expanded; any other element is appended unchanged.
+            if isinstance(element, list) or isinstance(element, tuple):
+                if len(element) != 2:  # noqa: PLR2004
+                    msg = "When defining an interval, a tuple of two integers should be passed, defining start and end indices inclusive"
+                    raise Exception(msg)
+                expanded_element = list(range(element[0], element[1]))  # NOTE(review): range() stops before element[1], yet the message above says "inclusive" - confirm which is intended
+                bands.extend(expanded_element)
+            else:
+                bands.append(element)
+        return bands
+
     def __init__(self) -> None:
         self._factory: EncoderDecoderFactory = EncoderDecoderFactory()
+
     def build_model(
         self,
         task: str,
@@ -72,7 +91,11 @@ def build_model(
         Returns:
             nn.Module: Full model with encoder, decoder and head.
         """
+        bands = self._generate_bands_intervals(bands)
+        print(bands)
+
         warnings.warn("PrithviModelFactory is deprecated. Please switch to EncoderDecoderFactory.", stacklevel=1)
+
         if in_channels is None:
             in_channels = len(bands)
         # TODO: support auxiliary heads

diff --git a/terratorch/tasks/classification_tasks.py b/terratorch/tasks/classification_tasks.py
@@ -289,6 +289,12 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> T
         rest = {k: batch[k] for k in other_keys}
         model_output: ModelOutput = self(x, **rest)
 
+        # Avoiding GPU memory overloading
+        # Removing GPU cache
+        torch.cuda.empty_cache()
+        # Forcing the Python garbage collector
+        gc.collect()
+
         y_hat = self(x).output
         y_hat = y_hat.argmax(dim=1)
         return y_hat, file_names
diff --git a/terratorch/tasks/regression_tasks.py b/terratorch/tasks/regression_tasks.py
@@ -3,6 +3,7 @@
 from collections.abc import Sequence
 from functools import partial
 from typing import Any
+import gc
 
 import logging
 import lightning
@@ -386,10 +387,12 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> T
 
         def model_forward(x):
             return self(x).output
-
+        
         if self.tiled_inference_parameters:
             # TODO: tiled inference does not work with additional input data (**rest)
             y_hat: Tensor = tiled_inference(model_forward, x, 1, self.tiled_inference_parameters)
         else:
+
             y_hat: Tensor = self(x, **rest).output
+
         return y_hat, file_names