src/models/deploying_t5.py

"""
T5: https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L19
"""
from typing import Optional, Tuple, Union, List, Callable
import time
import copy
import datetime
import warnings
import numpy as np
import torch
import math
from einops import rearrange
import torch.distributed as dist
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.utils.checkpoint import checkpoint
from transformers import T5Tokenizer

from transformers.modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions, 
    Seq2SeqLMOutput
)
from transformers.models.t5.modeling_t5 import (
    T5LayerNorm,
    T5Attention,
    T5LayerSelfAttention,
    T5LayerCrossAttention, 
    T5LayerFF,
    T5Block, 
    T5Stack, 
    T5ForConditionalGeneration, 
)
from transformers.models.t5.configuration_t5 import T5Config
from transformers.generation.utils import GreedySearchDecoderOnlyOutput, GreedySearchEncoderDecoderOutput
from transformers.generation.logits_process import LogitsProcessorList
from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria
from transformers.utils import logging

from util import (
    get_skip_mask,
    BetaMixture1D,
) 
from util.skip_conf import get_skip_mask_cd


# Module-level logger obtained through the transformers logging utilities.
logger = logging.get_logger(__name__)
# Deprecation notice for the legacy single `head_mask` argument.
# NOTE(review): no use of this constant is visible in this part of the file.
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
"""
# Type alias covering both greedy-search output containers imported above.
GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput]


class DeployT5Attention(T5Attention):
    """T5 attention layer with extra hooks for the deployment-time decoding paths.

    On top of the inherited ``T5Attention`` this variant:

    - can skip the attention computation while still producing key/value
      states (``skip_mask``);
    - can return only the projected key/value states
      (``gen_cross_attn_key_value``);
    - can prepend extra hidden states to the key/value source of decoder
      self-attention (``stack_hidden_states``);
    - records wall-clock timings in ``key_value_gen_time`` and
      ``attn_ffn_time`` when ``config.is_decoder`` is set.
    """

    def __init__(self, config: T5Config, has_relative_attention_bias=False):
        """
        Build the attention layer.

        Args:
            config (T5Config): Model configuration. Besides the standard T5 fields,
                ``forward`` reads ``config.use_synchronize``.
            has_relative_attention_bias (bool): When True, a learned
                ``relative_attention_bias`` embedding table is created.

        The parent constructor is called first; the attributes and the q/k/v/o
        projections are then (re-)assigned here explicitly.
        """
        super().__init__(config, has_relative_attention_bias)
        self.config = config
        ####### Configuration and model parameters #######
        self.is_decoder = config.is_decoder  # Boolean indicating if this is a decoder module
        self.has_relative_attention_bias = has_relative_attention_bias  # Boolean for relative attention bias usage
        self.relative_attention_num_buckets = config.relative_attention_num_buckets  # Number of buckets for relative attention
        self.relative_attention_max_distance = config.relative_attention_max_distance  # Maximum distance for relative attention
        self.d_model = config.d_model  # Dimension of the model
        self.key_value_proj_dim = config.d_kv  # Dimension for key/value projections
        self.n_heads = config.num_heads  # Number of attention heads
        self.dropout = config.dropout_rate  # Dropout rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim  # Total width across all heads

        ####### Mesh TensorFlow initialization to avoid scaling before softmax #######
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)  # Query projection
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)  # Key projection
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)  # Value projection
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)  # Output projection applied to the merged heads

        ####### Relative attention bias table (only when enabled) #######
        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    def _maybe_synchronize(self):
        """Call ``torch.cuda.synchronize()`` when ``config.use_synchronize`` is set.

        It is invoked right before every timestamp taken in ``forward``.
        """
        if self.config.use_synchronize:
            torch.cuda.synchronize()

    def _project(self, hidden_states, proj_layer, key_value_states, past_key_value):
        """Return key or value states shaped (batch_size, n_heads, length, dim_per_head).

        Self-attention (``key_value_states is None``) projects ``hidden_states``
        and appends the result to ``past_key_value`` when one is given.
        Cross-attention reuses ``past_key_value`` when its length matches
        ``key_value_states`` and otherwise projects ``key_value_states``.
        """
        def split_heads(states):
            return rearrange(proj_layer(states), 'b l (h d) -> b h l d', h=self.n_heads)

        if key_value_states is None:
            # Self-attention: project the current states, then extend the cache if there is one.
            projected = split_heads(hidden_states)
            if past_key_value is not None:
                # (batch_size, n_heads, key_length, dim_per_head)
                projected = torch.cat([past_key_value, projected], dim=2)
            return projected

        # Cross-attention: recompute only when nothing is cached or the cached length is stale.
        if past_key_value is None or past_key_value.shape[2] != key_value_states.shape[1]:
            return split_heads(key_value_states)
        return past_key_value

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        skip_mask=False,
        gen_cross_attn_key_value=False,
        stack_hidden_states=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).

        Args:
            hidden_states: Input of shape (batch_size, seq_length, dim).
            mask: Additive mask, (batch_size, key_length) (non-causal) or
                (batch_size, key_length, key_length). Only applied when
                ``position_bias`` is computed here.
            key_value_states: Source states for cross-attention, or None for self-attention.
            position_bias: Precomputed bias; when None it is built in this call.
            past_key_value: Pair of cached key and value states; either entry may be
                None. ``past_key_value[0]`` is (batch_size, n_heads, q_len - 1, dim_per_head).
            layer_head_mask: Optional multiplier applied to the attention weights.
            query_length: When given together with ``past_key_value``, it is added to
                the sequence length instead of the cached/stacked lengths.
            use_cache: Return the key/value states (decoder only).
            output_attentions: Append the attention weights to the outputs.
            skip_mask: Skip the attention computation; ``attn_output`` is then None.
            gen_cross_attn_key_value: Return ``[key_states, value_states]`` right
                after the projection and do nothing else.
            stack_hidden_states: Extra states concatenated in front of
                ``hidden_states`` (dim 1) as key/value source for decoder self-attention.

        Returns:
            ``[key_states, value_states]`` when ``gen_cross_attn_key_value`` is set,
            otherwise the tuple ``(attn_output, present_key_value_state, position_bias)``
            followed by the attention weights when ``output_attentions`` is set
            (None if the attention was skipped).
        """
        self._maybe_synchronize()
        start = datetime.datetime.now()
        seq_length = hidden_states.shape[1]
        real_seq_length = seq_length

        ####### Account for cached / stacked positions in the effective sequence length #######
        if past_key_value is not None:
            assert (
                len(past_key_value) == 2
            ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
            if query_length is None:
                if past_key_value[0] is not None:
                    real_seq_length += past_key_value[0].shape[2]
                if stack_hidden_states is not None:
                    real_seq_length += stack_hidden_states.shape[1]
            else:
                real_seq_length += query_length

        key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]

        ####### Compute key and value states #######
        if self.is_decoder and key_value_states is None and stack_hidden_states is not None:
            # Decoder self-attention also attends over the stacked states.
            kv_source = torch.cat((stack_hidden_states, hidden_states), dim=1)
        else:
            kv_source = hidden_states

        past_key = past_key_value[0] if past_key_value is not None else None
        past_value = past_key_value[1] if past_key_value is not None else None
        key_states = self._project(kv_source, self.k, key_value_states, past_key)
        value_states = self._project(kv_source, self.v, key_value_states, past_value)

        self._maybe_synchronize()
        if self.is_decoder:
            self.key_value_gen_time = (datetime.datetime.now() - start)

        ####### Only the key/value states were requested #######
        if gen_cross_attn_key_value:
            return [key_states, value_states]

        self._maybe_synchronize()
        start = datetime.datetime.now()

        ####### Initialize or use provided position bias #######
        if position_bias is None:
            if not self.has_relative_attention_bias:
                # No learned table on this layer: start from an all-zero bias.
                position_bias = torch.zeros(
                    (1, self.n_heads, real_seq_length, key_length), device=hidden_states.device, dtype=hidden_states.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                # Compute position bias using the learned embedding table
                position_bias = self.compute_bias(real_seq_length, key_length, device=hidden_states.device)

            # If keys and values are already cached, keep only the bias rows
            # that belong to the current query positions.
            if past_key_value is not None:
                position_bias = position_bias[:, :, -hidden_states.size(1):, :]

            if mask is not None:
                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)

        ####### Attention itself (skipped entirely when skip_mask is set) #######
        if skip_mask:
            attn_output = None
            # Bind the name so that `output_attentions=True` cannot hit a NameError below.
            attn_weights = None
        else:
            query_states = rearrange(self.q(hidden_states), 'b l (h d) -> b h l d', h=self.n_heads)

            # compute scores
            scores = torch.einsum("bhid,bhjd->bhij", query_states, key_states)

            if self.pruned_heads:
                # Separate local so the `mask` argument is not overwritten.
                head_keep = torch.ones(position_bias.shape[1])
                head_keep[list(self.pruned_heads)] = 0
                position_bias_masked = position_bias[:, head_keep.bool()]
            else:
                position_bias_masked = position_bias
            scores += position_bias_masked

            # Softmax runs in float32; cast back so the weights share the dtype of
            # `scores` (and therefore of `value_states`), as upstream T5Attention does.
            attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
            attn_weights = nn.functional.dropout(
                attn_weights, p=self.dropout, training=self.training
            )  # (batch_size, n_heads, seq_length, key_length)

            # Mask heads if we want to
            if layer_head_mask is not None:
                attn_weights = attn_weights * layer_head_mask

            attn_output = torch.einsum("bhij,bhjd->bhid", attn_weights, value_states)
            attn_output = rearrange(attn_output, 'b h s d -> b s (h d)', h=self.n_heads, d=self.key_value_proj_dim)
            attn_output = self.o(attn_output)

        self._maybe_synchronize()
        if self.is_decoder:
            self.attn_ffn_time = (datetime.datetime.now() - start)

        ####### Assemble outputs; key/value states are returned only for a caching decoder #######
        present_key_value_state = [key_states, value_states] if (self.is_decoder and use_cache) else None
        outputs = (attn_output, present_key_value_state, position_bias)

        if output_attentions:
            outputs = outputs + (attn_weights,)

        return outputs


class DeployT5LayerSelfAttention(T5LayerSelfAttention):
    """Pre-norm self-attention sub-layer built around ``DeployT5Attention``.

    Applies layer norm, runs self-attention, and adds the (dropped-out)
    attention output back onto the input unless ``skip_mask`` is set.
    When ``config.is_decoder`` is set, timing attributes are refreshed
    on every forward call.
    """

    def __init__(self, config, has_relative_attention_bias=False):
        super().__init__(config, has_relative_attention_bias)
        self.config = config
        self.SelfAttention = DeployT5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        skip_mask=False,
        stack_hidden_states=None,
    ):
        """Run the sub-layer and return ``(hidden_states,) + attention_output[1:]``.

        ``stack_hidden_states``, when given, goes through the same layer norm
        and is forwarded to the attention module.
        """
        # --- layer norm (timed) ---
        if self.config.use_synchronize:
            torch.cuda.synchronize()
        tic = datetime.datetime.now()
        normed_hidden_states = self.layer_norm(hidden_states)
        normed_stack_hidden_states = None
        if stack_hidden_states is not None:
            normed_stack_hidden_states = self.layer_norm(stack_hidden_states)
        if self.config.use_synchronize:
            torch.cuda.synchronize()
        norm_time = datetime.datetime.now() - tic

        # --- attention ---
        attention_output = self.SelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            skip_mask=skip_mask,
            stack_hidden_states=normed_stack_hidden_states,
        )

        # --- residual connection (timed); left out when the attention was skipped ---
        if self.config.use_synchronize:
            torch.cuda.synchronize()
        tic = datetime.datetime.now()
        if not skip_mask:
            hidden_states = hidden_states + self.dropout(attention_output[0])
        # Everything after the attention output is passed through unchanged.
        outputs = (hidden_states,) + attention_output[1:]

        if self.config.use_synchronize:
            torch.cuda.synchronize()
        if self.config.is_decoder:
            self.attn_ffn_time = self.SelfAttention.attn_ffn_time + norm_time + (datetime.datetime.now() - tic)
            self.key_value_gen_time = self.SelfAttention.key_value_gen_time
        return outputs


class DeployT5LayerCrossAttention(T5LayerCrossAttention):
    """Pre-norm cross-attention sub-layer built around ``DeployT5Attention``.

    Besides the regular forward pass it can be asked to return only the
    cross-attention key/value states (``gen_cross_attn_key_value``).
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.EncDecAttention = DeployT5Attention(config, has_relative_attention_bias=False)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
        skip_mask=False,
        parallel_mask=False,
        gen_cross_attn_key_value=False,
    ):
        """Run cross-attention over ``key_value_states``.

        Returns the attention module's raw result when
        ``gen_cross_attn_key_value`` is set, otherwise
        ``(hidden_states,) + attention_output[1:]``.
        """
        # --- layer norm (timed) ---
        if self.config.use_synchronize:
            torch.cuda.synchronize()
        tic = datetime.datetime.now()
        # The norm is skipped when the queries are not used (skipped attention or
        # key/value-only generation), unless parallel_mask forces it.
        if not (skip_mask or gen_cross_attn_key_value) or parallel_mask:
            normed_hidden_states = self.layer_norm(hidden_states)
        else:
            normed_hidden_states = hidden_states
        if self.config.use_synchronize:
            torch.cuda.synchronize()
        norm_time = datetime.datetime.now() - tic

        # --- attention ---
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
            skip_mask=skip_mask,
            gen_cross_attn_key_value=gen_cross_attn_key_value,
        )
        if gen_cross_attn_key_value:
            # Key/value-only mode: hand back the non-autoregressively generated states.
            self.key_value_gen_time = self.EncDecAttention.key_value_gen_time
            return attention_output

        # --- residual connection (timed); left out when the attention was skipped ---
        if self.config.use_synchronize:
            torch.cuda.synchronize()
        tic = datetime.datetime.now()
        if not skip_mask:
            hidden_states = hidden_states + self.dropout(attention_output[0])
        # Everything after the attention output is passed through unchanged.
        outputs = (hidden_states,) + attention_output[1:]

        if self.config.use_synchronize:
            torch.cuda.synchronize()
        if self.config.is_decoder:
            self.attn_ffn_time = self.EncDecAttention.attn_ffn_time + norm_time + (datetime.datetime.now() - tic)
            self.key_value_gen_time = self.EncDecAttention.key_value_gen_time
        return outputs


class DeployT5Block(T5Block):
    """
    One transformer block built from the Deploy* sub-layers.

    ``self.layer`` holds, in order: self-attention, cross-attention (decoder
    only) and the feed-forward layer. When ``is_decoder`` is set, ``forward``
    also stores timing information in ``ffn_time``, ``key_value_gen_time``
    and ``attn_time``.
    """
    def __init__(self, config, has_relative_attention_bias=False):
        """
        Args:
            config: Model configuration; ``is_decoder`` selects whether a
                cross-attention sub-layer is added, ``use_synchronize`` is read in ``forward``.
            has_relative_attention_bias (bool): Forwarded to the self-attention sub-layer.
        """
        super().__init__(config, has_relative_attention_bias)
        self.config = config
        self.is_decoder = config.is_decoder  # Flag to check if this block is part of the decoder

        # Rebuild the sub-layer list with the Deploy* variants
        self.layer = nn.ModuleList()
        self.layer.append(DeployT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
        
        ######## decoder ########
        if self.is_decoder:
            # Add cross-attention layer only if it's a decoder
            self.layer.append(DeployT5LayerCrossAttention(config))

        ######## common ########
        # Always add the feed-forward layer (so it is always reachable as self.layer[-1])
        self.layer.append(T5LayerFF(config))
    
    ######## decoder ########
    def get_shallow_logits(self, hidden_states):
        """
        Produce logits from intermediate ("shallow") hidden states.

        The states go through the self-attention sub-layer's layer norm, then
        ``self.dropout`` and ``self.lm_head``.

        NOTE(review): neither ``self.dropout`` nor ``self.lm_head`` is assigned
        by this class in the visible source; presumably they are attached from
        outside before this method is used - confirm against the caller.
        """
        shallow_hidden_states = self.layer[0].layer_norm(hidden_states)
        shallow_hidden_states = self.dropout(shallow_hidden_states)
        shallow_logits = self.lm_head(shallow_hidden_states)
        return shallow_logits
    
    ######## decoder ########
    def gen_cross_attn_key_value(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
    ):
        r""" 
        In the Shallow-Deep framework, if all previous tokens, including the <start> token, have skipped the
        Deep decoder, generate the cross-attention key/values only ONCE because they are shared by the whole sequence.

        Only the cross-attention sub-layer (``self.layer[1]``) is called, with
        ``gen_cross_attn_key_value=True``. ``attention_mask``, ``position_bias``
        and ``layer_head_mask`` are accepted but not used here.

        Returns:
            A list ``[None, None, cross_attn_key, cross_attn_value]``: the first
            two slots are the (absent) self-attention states.
        """

        # Requires a decoder block and encoder states to attend over
        assert self.is_decoder and encoder_hidden_states is not None
        cross_attn_past_key_value = self.layer[1](
            hidden_states,
            key_value_states=encoder_hidden_states,
            attention_mask=encoder_attention_mask,
            position_bias=encoder_decoder_position_bias,
            layer_head_mask=cross_attn_layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            gen_cross_attn_key_value=True,
        )
        # Expose the projection time measured by the cross-attention sub-layer
        self.key_value_gen_time = self.layer[1].key_value_gen_time

        # Prepend empty self-attention slots so the list has the 4-entry cache layout
        past_key_value = [None, None,] + cross_attn_past_key_value
        return past_key_value

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
        skip_mask=False,
        parallel_mask=False,
        stack_hidden_states=None,
    ):
        """
        Run self-attention, optional cross-attention and the feed-forward layer.

        Args:
            past_key_value: Cached states; 2 entries without encoder states,
                4 entries (self key/value + cross key/value) with them.
            skip_mask: Forwarded to both attention sub-layers; when set the
                feed-forward layer is not applied either.
            parallel_mask: Forwarded to the cross-attention sub-layer.
            stack_hidden_states: Forwarded to the self-attention sub-layer.
            return_dict: Accepted but not used in this method.

        Returns:
            Tuple starting with the hidden states, followed by the present
            key/value states when ``use_cache`` is set, followed by the
            remaining attention outputs (position biases and, if requested,
            attention weights).

        Raises:
            ValueError: if ``past_key_value`` does not have the expected number of entries.
        """
        # Process input through the block, handling both self-attention and cross-attention if applicable

        ######## common ########
        # Split the cache into its self-attention and cross-attention halves
        if past_key_value is not None:
            if not self.is_decoder:
                logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.")
            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4

            if len(past_key_value) != expected_num_past_key_values:
                raise ValueError(
                    f"There should be {expected_num_past_key_values} past states. "
                    f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}"
                    f"Got {len(past_key_value)} past key / value states"
                )

            self_attn_past_key_value = past_key_value[:2]
            cross_attn_past_key_value = past_key_value[2:]
        else:
            self_attn_past_key_value, cross_attn_past_key_value = None, None

        ######## common ########
        # Process self-attention
        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=self_attn_past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            skip_mask=skip_mask,
            stack_hidden_states=stack_hidden_states,
        )
        hidden_states, present_key_value_state = self_attention_outputs[:2]
        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights
        
        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            # the actual query length is unknown for cross attention
            # if using past key value states. Need to inject it here
            if present_key_value_state is not None:
                query_length = present_key_value_state[0].shape[2]
            else:
                query_length = None
            
            cross_attention_outputs = self.layer[1](
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                query_length=query_length,
                use_cache=use_cache,
                output_attentions=output_attentions,
                skip_mask=skip_mask,
                parallel_mask=parallel_mask,
            )
            hidden_states = cross_attention_outputs[0]

            # Combine self attn and cross attn key value states (list concatenation -> 4 entries)
            if present_key_value_state is not None:
                present_key_value_state = present_key_value_state + cross_attention_outputs[1]

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[2:]
        
        if self.config.use_synchronize: torch.cuda.synchronize()
        start = datetime.datetime.now()
        # Apply Feed Forward layer (skipped together with the attention when skip_mask is set)
        if not skip_mask:
            hidden_states = self.layer[-1](hidden_states)
        
        if self.config.use_synchronize: torch.cuda.synchronize()
        if self.is_decoder:
            # Collect timings as (self-attention, cross-attention) pairs.
            # NOTE(review): layer[1]'s timers are read even when cross-attention did not
            # run in this call (encoder_hidden_states is None) - confirm that cannot happen.
            self.ffn_time = datetime.datetime.now() - start
            self.key_value_gen_time = (self.layer[0].key_value_gen_time, self.layer[1].key_value_gen_time)
            self.attn_time = (self.layer[0].attn_ffn_time, self.layer[1].attn_ffn_time)

        outputs = (hidden_states,)

        if use_cache:
            outputs = outputs + (present_key_value_state,) + attention_outputs
        else:
            outputs = outputs + attention_outputs

        return outputs  # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)


class DeployT5Stack(T5Stack):
    def __init__(self, config, embed_tokens=None):
        """Build the stack of ``DeployT5Block`` layers and the decoding state.

        Args:
            config: Model configuration. Read here: ``is_decoder``, ``num_layers``,
                ``d_model``, ``layer_norm_epsilon``, ``dropout_rate``,
                ``use_early_exit``, ``exit_min_layer``, ``use_shallow_deep``,
                ``shallow_exit_layer`` and ``render_jsds``.
            embed_tokens: Token embedding module, stored as ``self.embed_tokens``.
        """
        super().__init__(config, embed_tokens)

        # Containers that are only initialised here; they are filled elsewhere.
        self.graph_top_k_list = []
        self.graph_top_k_confidence = []
        self.top_k_indices = None

        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder
        self.flop_counter = 0.0

        # Only the first block owns a relative attention bias table.
        blocks = [
            DeployT5Block(config, has_relative_attention_bias=(layer_idx == 0))
            for layer_idx in range(config.num_layers)
        ]
        self.block = nn.ModuleList(blocks)
        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        # Initialize weights and apply final processing
        self.post_init()
        self.device_map = None
        self.gradient_checkpointing = False

        # Early-Exit framework
        self.use_early_exit = config.use_early_exit
        self.exit_min_layer = config.exit_min_layer

        # Shallow-Deep Module
        self.use_shallow_deep = config.use_shallow_deep
        self.shallow_exit_layer = config.shallow_exit_layer
        if self.is_decoder and config.use_shallow_deep:
            # The shallow exit must sit strictly inside the stack.
            assert 0 < config.shallow_exit_layer < len(self.block)

        # Synchronized Parallel Decoding
        self.block_op = [0] * config.num_layers  # to calculate the average number of forward block layers
        self.parallel_tokens_shallow = 0  # how many tokens are used in parallel decoding as stack_hidden_states
        self.parallel_tokens_deep = 0  # how many tokens are used in parallel decoding with skip_mask = False
        self.stack_hidden_states = ()  # store hidden_states that do not forward Deep decoder

        # Adaptive Threshold Estimator
        self.bmm_model = BetaMixture1D()
        self.bmm_threshold = None
        self.stack_conf = ()
        self.stack_pred = ()
        self.stack_conf_all = ()
        self.stack_ident_all = ()

        # Timing counters exist only on the decoder stack.
        if self.is_decoder:
            self._reset_time_measure()
        else:
            self.deploy_time = None

        self.render = config.render_jsds
        if self.render:
            self.tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-large")
        
    def _reset_time_measure(self):
        """Reset ``self.deploy_time`` to a fresh dictionary of zeroed timers.

        Entries for key/value generation and attention are two-element lists;
        every other entry is a single ``datetime.timedelta``.
        """
        def zero():
            return datetime.timedelta()

        def pair():
            # A new list per entry so the counters never share storage.
            return [zero(), zero()]

        timers = {}
        timers['time_key_value_gen'] = pair()
        timers['time_attn'] = pair()
        timers['time_ffn'] = zero()
        timers['time_confidence'] = zero()
        timers['time_exit_key_value_gen'] = pair()
        timers['time_exit_attn'] = pair()
        timers['time_exit_ffn'] = zero()
        timers['time_parallel_key_value_gen'] = pair()
        timers['time_parallel_attn'] = pair()
        timers['time_parallel_ffn'] = zero()
        timers['time_others'] = zero()
        self.deploy_time = timers

    def parallel_gen_token(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_extended_attention_mask=None,
        encoder_decoder_position_bias=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        present_key_value_states=None,
        use_cache=None,
        output_attentions=None,
        layer_idx=None,
    ):
        r""" 
        if pask_key_values is not defined, it implies that all previous tokens have skipped Deep decoder.
            Because all sequences share key_value of cross-attn layer,
            we need to generate key_value of cross-attn layer only once for <start> token.
        else:
            key_values of cross-attn are already stored in 'past_key_values'.

        Then, generate the next token in a non-autoregressive manner.
        if copy_skipped_hidden_states is True,
            copy previous skipped hidden_states for Deep decoder blocks.
        else:
            attention calculate for stack_hidden_states as well.
            thus, we can utilize them in RollBack policy.
        """
        
        if not self.config.copy_skipped_hidden_states:
            hidden_states = torch.cat(self.stack_hidden_states + (hidden_states,), dim=1)            
            # reset and re-calculate based on the length of hidden_states
            extended_attention_mask, position_bias = None, None
        else:
            self.stack_hidden_states = torch.cat(self.stack_hidden_states, dim=1)
            extended_attention_mask = attention_mask

        previous_hidden_states = []
        for j in range(layer_idx, len(self.block)):
        
            past_key_value = past_key_values[j]
            if past_key_value is None:
                # if pask_key_values is not defined, it implies that all previous tokens have skipped Deep decoder
                # need to generate key_value of cross-attn layer only once for <start> token
                past_key_value = self.block[j].gen_cross_attn_key_value(
                    hidden_states,  # dummy
                    attention_mask=extended_attention_mask,
                    position_bias=position_bias,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_extended_attention_mask,
                    encoder_decoder_position_bias=encoder_decoder_position_bias,
                    layer_head_mask=head_mask[j],
                    cross_attn_layer_head_mask=cross_attn_head_mask[j],
                    past_key_value=None,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )
                self.deploy_time['time_parallel_key_value_gen'][1] += self.block[j].key_value_gen_time
            
            if self.config.use_synchronize: torch.cuda.synchronize()
            start = datetime.datetime.now()
            if extended_attention_mask is None or position_bias is None:
                real_seq_length = hidden_states.shape[1]
                if past_key_value[0] is not None: real_seq_length += past_key_value[0].shape[2]
                key_length = real_seq_length
                
                if self.config.parallel_causal_mask and extended_attention_mask is None:
                    attention_mask = torch.ones(hidden_states.shape[0], real_seq_length, device=hidden_states.device)
                    extended_attention_mask = self.get_extended_attention_mask(attention_mask, torch.Size([hidden_states.shape[0], hidden_states.shape[1]]))
                
                if position_bias is None:      
                    position_bias = self.block[0].layer[0].SelfAttention.compute_bias(real_seq_length, key_length, device=hidden_states.device)

                    # if key and values are already calculated
                    # we want only the last query position bias
                    if past_key_value is not None:
                        position_bias = position_bias[:, :, -hidden_states.size(1):, :]

                    if extended_attention_mask is not None:
                        position_bias = position_bias + extended_attention_mask  # (batch_size, n_heads, seq_length, key_length)
            
            if self.config.use_synchronize: torch.cuda.synchronize()
            self.deploy_time['time_others'] += (datetime.datetime.now() - start)

            layer_outputs = self.block[j]( #### Block forward pass, should be outputting the logits indeed no?
                hidden_states,
                attention_mask=extended_attention_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                layer_head_mask=head_mask[j],
                cross_attn_layer_head_mask=cross_attn_head_mask[j],
                past_key_value=past_key_value,
                use_cache=use_cache,
                output_attentions=output_attentions,
                skip_mask=False,
                parallel_mask=True,
                stack_hidden_states=self.stack_hidden_states if self.config.copy_skipped_hidden_states else None,
            )

            for idx, t in enumerate(self.block[j].key_value_gen_time): self.deploy_time['time_parallel_key_value_gen'][idx] += t
            for idx, t in enumerate(self.block[j].attn_time): self.deploy_time['time_parallel_attn'][idx] += t
            self.deploy_time['time_parallel_ffn'] += self.block[j].ffn_time
            
            if self.config.use_synchronize: torch.cuda.synchronize()
            start = datetime.datetime.now()
            # layer_outputs is a tuple with:
            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]

            hidden_states, present_key_value_state = layer_outputs[:2]

            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
            # append next layer key value states
            if use_cache:
                present_key_value_states = present_key_value_states + [present_key_value_state,]
            
            if self.config.use_synchronize: torch.cuda.synchronize()
            self.deploy_time['time_others'] += (datetime.datetime.now() - start)
        
        if self.config.use_synchronize: torch.cuda.synchronize()
        start = datetime.datetime.now()
        self.stack_hidden_states = ()
        if self.config.use_synchronize: torch.cuda.synchronize()
        self.deploy_time['time_others'] += (datetime.datetime.now() - start)
        
        return hidden_states, present_key_value_states
    
    
    def func_inverse(self, i, k1, k2, num_layers):
        """
        Smoothed-pruning schedule: size of the retained vocabulary at layer `i`.

        The size follows an inverse curve that equals `k1` at `i == 0` and is never
        allowed to drop below `k2`.

        Args:
            i: index of the current layer.
            k1: size at layer 0 (upper bound of the schedule).
            k2: smallest size the schedule may return; must be non-zero.
            num_layers: number of layers used to normalise `i`; must be non-zero.

        Returns:
            The truncated size for layer `i`, or `k2` when the curve falls to or below it.
        """
        # Same left-to-right evaluation order as the closed-form expression.
        depth_ratio = (k1 - k2) / k2 * i / num_layers
        decayed_size = int(k1 / (1 + depth_ratio))
        if decayed_size > k2:
            return decayed_size
        return k2

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        lm_head=None,
        cm_head=None,
    ):
        r"""
        Run the blocks of this stack over the inputs, with optional decoder-side early exiting.

        We have implemented the following inference strategy:

        1) Normal framework: Forward all transformer layers.
        2) Static framework: Only forward the pre-defined number of early layers.
        3) Early-Exit framework: Each token can exit the forward path if confidence is higher than threshold.
        4) Shallow-Deep framework:
            While a few early layers are defined as 'Shallow' decoder, the entire network including Shallow is defined as 'Deep' decoder.
            Each token can skip the Deep decoder path if confidence at Shallow decoder is higher than threshold.

        The exit logic only runs for the decoder, for layers `i > 0`, and only when the
        input holds a single position (`hidden_states.shape[1] == 1`).

        Args:
            input_ids: token ids; mutually exclusive with `inputs_embeds`.
            attention_mask: self-attention mask; defaults to all ones over past + current length.
            encoder_hidden_states: encoder output used for cross-attention (decoder only).
            encoder_attention_mask: mask over `encoder_hidden_states`; defaults to all ones.
            inputs_embeds: pre-computed embeddings used instead of `input_ids`.
            head_mask / cross_attn_head_mask: converted with `self.get_head_mask` and indexed per layer.
            past_key_values: per-block cached states. When `None`, the per-sequence buffers
                (`stack_hidden_states`, `stack_conf`, `stack_pred`) are reset.
            use_cache / output_attentions / output_hidden_states / return_dict: fall back to
                the matching `self.config` values when `None`.
            lm_head: callable projecting hidden states to vocabulary logits; needed by the
                exit/confidence paths and by `plotting_logits`.
            cm_head: forwarded unchanged to `get_skip_mask` / `get_skip_mask_cd`.

        Returns:
            `BaseModelOutputWithPastAndCrossAttentions`, or a tuple of its non-`None` fields
            when `return_dict` is false. `hidden_states`, `attentions` and `cross_attentions`
            are always `None` here because this method never collects them.

        Raises:
            ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, or if
                `config.type_vocab_reduct` is set to an unsupported value.

        Side effects:
            Updates `self.lm_logits`, `self.shallow2deep`, `self.block_op`, `self.deploy_time`
            and, depending on the configuration, `self.top_k_indices`, `self.flop_counter`,
            the `stack_*` buffers and the `graph_top_k_*` lists.
        """
        if self.config.use_synchronize: torch.cuda.synchronize()
        start = datetime.datetime.now()
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        # required mask seq length can be calculated via length of past
        mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length

        if use_cache is True:
            assert self.is_decoder, f"`use_cache` can only be set to `True` if {self} is used as a decoder"

        if attention_mask is None:
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
            encoder_seq_length = encoder_hidden_states.shape[1]
            encoder_attention_mask = torch.ones(
                batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long
            )

        # initialize past_key_values with `None` if past does not exist
        if past_key_values is None:
            past_key_values = [None] * len(self.block)
            # No past means a new sequence: drop the buffers accumulated for the previous one.
            self.stack_hidden_states = ()
            self.stack_conf, self.stack_pred = (), ()

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        present_key_value_states = [] if use_cache else None
        all_hidden_states = None
        all_attentions = None
        all_cross_attentions = None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)
        if self.config.use_synchronize: torch.cuda.synchronize()
        if self.is_decoder: self.deploy_time['time_others'] += (datetime.datetime.now() - start)

        skip_mask = False  # False: forward, and True: skip
        self.shallow2deep = False  # False: skip, and True: forward
        self.lm_logits = None  # to prevent calculating logits twice

        prev_probits = {}      # layer index -> softmax over the vocabulary, for layers below exit_min_layer
        prev_confidences = {}  # layer index -> confidence, filled only for "adaptive" vocab reduction
        # Only the JSD confidence path binds `jsds`; pre-initialise it so the render
        # printout below cannot hit an unbound local with other confidence types.
        jsds = None
        if self.is_decoder and self.config.plotting_logits:
            previous_logits = []

        for i, layer_module in enumerate(self.block):
            if self.is_decoder and self.config.plotting_logits:
                # Record the logits the model would emit if it stopped before block i.
                _hidden_states = self._scale_for_lm_head(self.dropout(self.final_layer_norm(hidden_states)))
                lm_logits = lm_head(_hidden_states)
                previous_logits.append(lm_logits)

            # Static framework
            if self.is_decoder and self.config.static_exit_layer is not None:
                if i == self.config.static_exit_layer: break

            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]

            # check that tokens are generated once in a time
            auto_reg = hidden_states.shape[1] == 1
            if self.is_decoder and auto_reg and i == 0: self.block_op[i] += 1

            if self.is_decoder and auto_reg and i > 0:

                # Shallow-Deep framework
                if self.use_shallow_deep and i == self.shallow_exit_layer:
                    if self.config.use_synchronize: torch.cuda.synchronize()
                    start = datetime.datetime.now()
                    _hidden_states = self.dropout(self.final_layer_norm(hidden_states))
                    lm_logits = lm_head(self._scale_for_lm_head(_hidden_states))

                    skip_mask, conf = get_skip_mask(
                        lm_logits,
                        _hidden_states,
                        cm_head,
                        config=self.config,
                        adapt_threshold=self.bmm_threshold,
                        return_conf=True,
                    )
                    self.stack_conf = self.stack_conf + (conf,)
                    self.stack_pred = self.stack_pred + (lm_logits,)

                    if not skip_mask: self.block_op[i] += 1
                    if self.config.use_synchronize: torch.cuda.synchronize()
                    self.deploy_time['time_confidence'] += (datetime.datetime.now() - start)

                    # if skip Deep decoder, store hidden_states at self.shallow_exit_layer
                    if skip_mask:
                        if self.config.use_synchronize: torch.cuda.synchronize()
                        start = datetime.datetime.now()
                        self.lm_logits = lm_logits
                        if self.config.parallel_gen_token:
                            if use_cache:
                                # The deep blocks are not run for this token: carry their
                                # previous cache entries over unchanged.
                                for j in range(i, len(self.block)):
                                    present_key_value_states = present_key_value_states + [past_key_values[j],]
                            self.stack_hidden_states = self.stack_hidden_states + (hidden_states,)

                        if self.config.use_synchronize: torch.cuda.synchronize()
                        if self.is_decoder: self.deploy_time['time_others'] += (datetime.datetime.now() - start)
                        break

                    if not skip_mask:
                        self.shallow2deep = True
                        # if self.config.parallel_gen_token:
                        if self.config.parallel_gen_token and len(self.stack_hidden_states):
                            self.parallel_tokens_shallow += len(self.stack_hidden_states)
                            self.parallel_tokens_deep += 1

                            # in Shallow-Deep decoder, generate the next token in a non-autoregressive manner
                            hidden_states, present_key_value_states = self.parallel_gen_token(
                                hidden_states,
                                attention_mask=extended_attention_mask,
                                position_bias=position_bias,
                                encoder_hidden_states=encoder_hidden_states,
                                encoder_extended_attention_mask=encoder_extended_attention_mask,
                                encoder_decoder_position_bias=encoder_decoder_position_bias,
                                head_mask=head_mask,
                                cross_attn_head_mask=cross_attn_head_mask,
                                past_key_values=past_key_values,
                                present_key_value_states=present_key_value_states,
                                use_cache=use_cache,
                                output_attentions=output_attentions,
                                layer_idx=self.shallow_exit_layer,
                            )

                            # Adaptive Threshold Estimator
                            if self.config.use_adapt_threshold:
                                # Calibration Set Update
                                # NOTE(review): this projection uses `self.lm_head` and skips the
                                # tied-embedding rescaling applied everywhere else in this method;
                                # left as-is -- confirm whether that is intentional.
                                self.lm_logits = self.lm_head(self.dropout(self.final_layer_norm(hidden_states)))
                                deep_pred = self.lm_logits.argmax(-1)
                                shallow_pred = torch.cat(self.stack_pred).argmax(-1).view(-1)

                                self.stack_conf_all += self.stack_conf
                                self.stack_ident_all += ((deep_pred.view(-1) == shallow_pred.view(-1)).long().cpu().numpy(),)
                                self.stack_conf, self.stack_pred = (), ()

                            break

                # Early-Exit framework
                elif self.use_early_exit and not skip_mask:
                    if (self.exit_min_layer is not None and i < self.exit_min_layer):
                        # Below the minimum exit layer: no exit decision, only record the
                        # distribution so later layers can contrast against it.
                        if self.config.use_synchronize: torch.cuda.synchronize()
                        # start = datetime.datetime.now()
                        _hidden_states = self.dropout(self.final_layer_norm(hidden_states))
                        lm_logits = lm_head(self._scale_for_lm_head(_hidden_states))

                        probits = lm_logits.softmax(dim=-1).squeeze()
                        prev_probits[i] = probits
                        self.block_op[i] += 1

                    else:
                        if self.config.use_synchronize: torch.cuda.synchronize()
                        start = datetime.datetime.now()
                        _hidden_states = self.dropout(self.final_layer_norm(hidden_states))

                        # SHRINKING VOCAB PART:
                        if not self.config.type_vocab_reduct:  # If we are not using any vocab reduction
                            lm_logits = lm_head(self._scale_for_lm_head(_hidden_states))

                            if self.config.count_flops:
                                self.flop_counter += (self.config.d_model**2) * self.config.vocab_size * 1  # Seq length is always one

                        else:
                            # First layer that reaches this branch: exit_min_layer when it is set
                            # and larger than 1, otherwise layer 1. A `None` exit_min_layer is
                            # accepted by the guard above, so it must not be compared directly.
                            exit_min_layer = self.config.exit_min_layer
                            starting_layer = exit_min_layer if exit_min_layer is not None and exit_min_layer > 1 else 1
                            if i == starting_layer:  # if it is the first layer where we compute the logits
                                lm_logits = lm_head(self._scale_for_lm_head(_hidden_states))
                                maximum_k_size = 35000  # largest number of vocabulary rows to keep
                                minimum_k_size = 250  # smallest number of vocabulary rows to keep
                                num_layers = len(self.block)  # This is the number of layers in the model
                                k = self.func_inverse(i, maximum_k_size, minimum_k_size, num_layers)
                                # Top-k token ids of this layer, re-sorted into ascending id order.
                                self.top_k_indices = torch.topk(lm_logits[0][0], k, largest=True, sorted=True)[1].sort()[0]

                                # Gathered once here and reused by the "fixed" strategy in later layers.
                                selected_weights = lm_head.weight[self.top_k_indices, :]
                            else:  # For all the other layers either use fixed, decaying or adaptive pruning
                                reduct_type = self.config.type_vocab_reduct
                                if reduct_type == "fixed":
                                    kept_indices = self.top_k_indices
                                    kept_count = k
                                elif reduct_type == "decaying":  # Smoothed pruning
                                    kept_count = self.func_inverse(i, maximum_k_size, minimum_k_size, num_layers)
                                    kept_indices = self.top_k_indices[:kept_count]
                                    selected_weights = lm_head.weight[kept_indices, :]
                                elif reduct_type == "adaptive":
                                    # TODO experiment with not only the top-1 confidence but combining the top-k confidences
                                    # TODO experiment with taking the top-k not (only) in the starting layer
                                    # TODO experiment with different formulas to go from confidence value to retained indices
                                    # TODO experiment with non-confidence parameters (and potentialy combinations of params)
                                    curr_weights_size = lm_head.weight.size(dim=0)
                                    conf = prev_confidences[i-1]  # This is the confidence of the previous layer
                                    conf_scaling_factor = 0.9  # TODO experiment with different scaling factors
                                    kept_count = int(curr_weights_size * (1 - conf * conf_scaling_factor))
                                    kept_indices = self.top_k_indices[:kept_count]
                                    selected_weights = lm_head.weight[kept_indices, :]
                                else:
                                    raise ValueError("Please provide a valid type_vocab_reduct argument. Either use fixed, decaying, or adaptive.")

                                if self.config.count_flops:
                                    self.flop_counter += (self.config.d_model**2) * kept_count * 1  # Seq length is always one

                                # Note: There is no bias in self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
                                lm_logits = self._reduced_vocab_logits(_hidden_states, selected_weights, kept_indices)

                        # END OF SHRINKING VOCAB PART
                        want_conf = self.config.type_vocab_reduct == "adaptive"
                        pos_time = past_key_values[i][0].shape[2] + 1 if past_key_values[i] is not None else 1
                        conf_type = self.config.exit_conf_type
                        if conf_type in ("reweight_contrastive_decoding", "JSD_contrastive_confidence"):
                            # Only the JSD variant can additionally return the per-layer JSD values.
                            return_jsds = self.render if conf_type == "JSD_contrastive_confidence" else False
                            out = get_skip_mask_cd(
                                lm_logits,
                                _hidden_states,
                                cm_head,
                                config=self.config,
                                pos_time=pos_time,
                                layer_exp=i,
                                prev_probits=prev_probits,
                                layer_am=i//2,
                                alpha=0.1,
                                return_jsds=return_jsds,
                                return_conf=want_conf,
                            )
                            if want_conf:
                                if return_jsds:
                                    skip_mask, jsds, conf = out
                                else:
                                    skip_mask, conf = out
                                prev_confidences[i] = conf
                            elif return_jsds:
                                skip_mask, jsds = out
                            else:
                                skip_mask = out

                        else:
                            out = get_skip_mask(
                                lm_logits,
                                _hidden_states,
                                cm_head,
                                config=self.config,
                                pos_time=pos_time,
                                return_conf=want_conf,
                            )
                            if want_conf:
                                skip_mask, conf = out
                                prev_confidences[i] = conf
                            else:
                                skip_mask = out

                        if skip_mask:
                            self.lm_logits = lm_logits  # This is where the logits are sent to do the predictions.

                            if self.render:
                                print("JSDS: ", jsds)
                                probits = torch.softmax(lm_logits, dim=-1)
                                argmax_index = torch.argmax(probits).item()
                                # Tokenizer to get the words
                                word = self.tokenizer.decode(argmax_index)
                                print("Word: ", word, " Token_id: ", argmax_index)
                        else:
                            self.block_op[i] += 1

                        if self.config.use_synchronize: torch.cuda.synchronize()
                        self.deploy_time['time_confidence'] += (datetime.datetime.now() - start)

                # Normal framework
                elif (not self.use_shallow_deep and not self.use_early_exit):
                    self.block_op[i] += 1

            # After an early exit the remaining blocks are still called, with skip_mask=True.
            past_key_value = past_key_values[i]
            layer_outputs = layer_module(
                hidden_states,
                attention_mask=extended_attention_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                layer_head_mask=layer_head_mask,
                cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                use_cache=use_cache,
                output_attentions=output_attentions,
                skip_mask=skip_mask,
            )

            if self.is_decoder:
                # Pick the timing bucket matching the path this block was run on.
                if self.config.use_early_exit: prefix = 'time_exit_' if skip_mask else 'time_'
                elif self.config.use_shallow_deep: prefix = 'time_parallel_' if self.shallow2deep else 'time_'
                else: prefix = 'time_'
                for idx, t in enumerate(layer_module.key_value_gen_time): self.deploy_time[prefix + 'key_value_gen'][idx] += t
                for idx, t in enumerate(layer_module.attn_time): self.deploy_time[prefix + 'attn'][idx] += t
                self.deploy_time[prefix + 'ffn'] += layer_module.ffn_time

            if self.config.use_synchronize: torch.cuda.synchronize()
            start = datetime.datetime.now()
            # layer_outputs is a tuple with:
            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]

            hidden_states, present_key_value_state = layer_outputs[:2]

            # We share the position biases between the layers - the first layer store them
            # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
            # (cross-attention position bias), (cross-attention weights)
            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
            # append next layer key value states
            if use_cache:
                present_key_value_states = present_key_value_states + [present_key_value_state,]

            if self.config.use_synchronize: torch.cuda.synchronize()
            if self.is_decoder: self.deploy_time['time_others'] += (datetime.datetime.now() - start)

        if self.is_decoder and self.config.plotting_logits:
            # Get the top-1 index of last block.
            index_top_1 = torch.topk(previous_logits[-1], 1)[1][0][0][0].item()
            confidence, max_index = torch.max(torch.softmax(previous_logits[-1], dim=-1), dim=-1)
            confidence = confidence[0].item()

            # Initialize a list to store ranks at each layer
            ranks_at_layers = []
            confidences_at_layers = []

            # Loop over previous layers in reverse order, stopping at the first layer
            for i in range(len(previous_logits) - 1, -1, -1):
                # Get the sorted indices for this layer's logits
                sorted_indices = torch.argsort(previous_logits[i][-1], descending=True)

                # Find the rank of the top-1 index of the last block in the sorted indices of block i
                rank = np.where(sorted_indices.cpu() == index_top_1)[-1]

                conf, max_index = torch.max(torch.softmax(previous_logits[i], dim=-1), dim=-1)
                conf = conf[0].item()

                # Store the rank positions
                ranks_at_layers.append(rank[0])
                confidences_at_layers.append(conf)

            ranks_at_layers.reverse()  # Reverse the list to have the ranks in the correct order
            confidences_at_layers.reverse()  # Reverse the list to have the confidences in the correct order

            self.graph_top_k_list.append(ranks_at_layers)  # Per-layer ranks for this token
            self.graph_top_k_confidence.append(confidences_at_layers)  # Per-layer confidences for this token

        if self.config.use_synchronize: torch.cuda.synchronize()
        start = datetime.datetime.now()
        # When an exit already stored logits in self.lm_logits, hidden_states is returned
        # without the final layer norm / dropout.
        if not skip_mask and self.lm_logits is None:
            hidden_states = self.final_layer_norm(hidden_states)
            hidden_states = self.dropout(hidden_states)
        if self.config.use_synchronize: torch.cuda.synchronize()
        if self.is_decoder: self.deploy_time['time_others'] += (datetime.datetime.now() - start)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    present_key_value_states,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=present_key_value_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )

    def _scale_for_lm_head(self, hidden_states):
        """Rescale by `d_model ** -0.5` when word embeddings are tied; otherwise return the input unchanged."""
        if self.config.tie_word_embeddings:
            return hidden_states * (self.config.d_model ** -0.5)
        return hidden_states

    def _reduced_vocab_logits(self, hidden_states, selected_weights, kept_indices):
        """
        Project onto a subset of the vocabulary and expand back to full-vocabulary logits.

        Args:
            hidden_states: normalised hidden states of the current token.
            selected_weights: rows of the LM head weight matrix for `kept_indices`, in the same order.
            kept_indices: vocabulary ids that are kept, in ascending order.

        Returns:
            A `(1, 1, vocab_size)` tensor holding the computed logits at `kept_indices`
            and `-inf` everywhere else.
        """
        reduced = torch.nn.functional.linear(self._scale_for_lm_head(hidden_states), selected_weights)

        # Every token that was pruned away gets -inf.
        lm_logits = torch.full((1, 1, self.config.vocab_size), -float("inf"), device=reduced.device)

        keep_mask = torch.zeros(self.config.vocab_size, dtype=torch.bool, device=reduced.device)
        keep_mask[kept_indices] = True

        # Boolean-mask assignment fills positions in ascending id order, which lines up
        # with the columns of `reduced` only because `kept_indices` is ascending.
        lm_logits[0, 0, keep_mask] = reduced[0, 0, :len(kept_indices)]
        return lm_logits


class DeployT5ForConditionalGeneration(T5ForConditionalGeneration):
    def __init__(self, config):
        """
        Build the deployment T5 model: shared embedding, encoder/decoder stacks, heads and timers.

        Args:
            config: model configuration. Besides the fields read by the parent class, this
                constructor reads `d_model`, `vocab_size`, `num_decoder_layers`,
                `exit_conf_type` and `shallow2deep_conf_type`.
        """
        super().__init__(config)
        hidden_dim, vocab_size = config.d_model, config.vocab_size
        self.model_dim = hidden_dim

        # Token embedding handed to both stacks below.
        self.shared = nn.Embedding(vocab_size, hidden_dim)

        # Encoder: private config copy with caching and static early-exit switched off.
        enc_cfg = copy.deepcopy(config)
        for field, value in (
            ("is_decoder", False),
            ("use_cache", False),
            ("is_encoder_decoder", False),
            ("static_exit_layer", None),
        ):
            setattr(enc_cfg, field, value)
        self.encoder = DeployT5Stack(enc_cfg, self.shared)

        # Decoder: private config copy sized by `num_decoder_layers`.
        dec_cfg = copy.deepcopy(config)
        for field, value in (
            ("is_decoder", True),
            ("is_encoder_decoder", False),
            ("num_layers", config.num_decoder_layers),
        ):
            setattr(dec_cfg, field, value)
        self.decoder = DeployT5Stack(dec_cfg, self.shared)

        # Vocabulary projection; the decoder stack gets a reference to the same module.
        self.lm_head = nn.Linear(hidden_dim, vocab_size, bias=False)
        self.decoder.lm_head = self.lm_head

        # Two-class confidence head, only built when either confidence type is 'meta'.
        wants_meta_head = (
            self.config.exit_conf_type == 'meta' or self.config.shallow2deep_conf_type == "meta"
        )
        self.cm_head = (
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim, bias=True),
                nn.ReLU(),
                nn.Linear(hidden_dim, 2, bias=True),
            )
            if wants_meta_head
            else None
        )

        # RollBack policy: counter of rolled-back tokens and the per-position loss used to detect them.
        self.rollback_num = 0
        self.criterion = nn.CrossEntropyLoss(reduction='none')

        # BMM: number of refits done so far and the cap on refits.
        self.bmm_update_iter = 0
        self.bmm_update_max_iter = 300

        # Timing accumulators; entries flagged True hold a pair of timedeltas, the rest a single one.
        timer_layout = (
            ('time_encoder_forward', False),
            ('time_decoder_forward', False),
            ('time_key_value_gen', True),
            ('time_attn', True),
            ('time_ffn', False),
            ('time_confidence', False),
            ('time_exit_key_value_gen', True),
            ('time_exit_attn', True),
            ('time_exit_ffn', False),
            ('time_parallel_key_value_gen', True),
            ('time_parallel_attn', True),
            ('time_parallel_ffn', False),
            ('time_estimate_conf', False),
            ('time_others', False),
        )
        self.deploy_time = {
            key: [datetime.timedelta(), datetime.timedelta()] if paired else datetime.timedelta()
            for key, paired in timer_layout
        }

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        Run encoder and decoder (through `forward_impl`) and produce vocabulary logits.

        This class targets a deployment scenario in which the decoder serves a single user
        (batch size of 1). For faster inference, non-autoregressive hidden-state copying is
        implemented in the Shallow-Deep framework.

        When the decoder has already stored logits in `self.decoder.lm_logits`, they are reused
        as-is; otherwise the decoder's last hidden state is (optionally rescaled and) projected
        by `lm_head`. When `config.rollback_conf_threshold` is None only the logits of the last
        position are kept.

        Returns:
            A `Seq2SeqLMOutput` when `return_dict` is truthy, otherwise a tuple
            `(loss?, lm_logits, *decoder_outputs[1:], *encoder_outputs)` where `loss` is
            present only when `labels` were given.
        """

        def _sync():
            # Optional device barrier so the wall-clock timers below are meaningful.
            if self.config.use_synchronize:
                torch.cuda.synchronize()

        if use_cache is None:
            use_cache = self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs, decoder_outputs = self.forward_impl(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        _sync()
        tic = datetime.datetime.now()
        if self.decoder.lm_logits is not None:
            # The decoder already produced logits for this step; reuse them.
            lm_logits = self.decoder.lm_logits
        else:
            # token has not skipped: project the final hidden state ourselves
            sequence_output = decoder_outputs[0]
            if self.config.tie_word_embeddings:
                # Rescale output before projecting on vocab
                # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
                sequence_output = sequence_output * (self.model_dim**-0.5)
            lm_logits = self.lm_head(sequence_output)

        # The projection time is charged both to 'time_others' and to the decoder total.
        _sync()
        self.deploy_time['time_others'] += (datetime.datetime.now() - tic)
        _sync()
        self.deploy_time['time_decoder_forward'] += (datetime.datetime.now() - tic)

        if self.decoder.shallow2deep:
            # Clear the decoder's stacked confidences / predictions.
            self.decoder.stack_conf = ()
            self.decoder.stack_pred = ()
        if self.config.rollback_conf_threshold is None:
            # Without RollBack only the newest position is needed downstream.
            lm_logits = lm_logits[:, [-1], :]
        loss = self.compute_model_loss(lm_logits, labels)

        if return_dict:
            return Seq2SeqLMOutput(
                loss=loss,
                logits=lm_logits,
                past_key_values=decoder_outputs.past_key_values,
                decoder_hidden_states=decoder_outputs.hidden_states,
                decoder_attentions=decoder_outputs.attentions,
                cross_attentions=decoder_outputs.cross_attentions,
                encoder_last_hidden_state=encoder_outputs.last_hidden_state,
                encoder_hidden_states=encoder_outputs.hidden_states,
                encoder_attentions=encoder_outputs.attentions,
            )

        output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
        return output if loss is None else ((loss,) + output)
    
    def compute_model_loss(self, lm_logits=None, labels=None):
        """
        Return the token-level cross-entropy between `lm_logits` and `labels`.

        Args:
            lm_logits: logits whose last dimension is the vocabulary; required when
                `labels` is given.
            labels: target token ids; positions equal to -100 are ignored. When None,
                no loss is computed.

        Returns:
            The scalar loss tensor, or None when `labels` is None.
        """
        if labels is None:
            return None

        assert lm_logits is not None
        vocab_dim = lm_logits.size(-1)
        flat_logits = lm_logits.view(-1, vocab_dim)
        # Labels may live on another device than the logits.
        flat_targets = labels.to(lm_logits.device).view(-1)
        return CrossEntropyLoss(ignore_index=-100)(flat_logits, flat_targets)
    
    def forward_impl(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Run the encoder (when needed) and the decoder, and accumulate timing statistics.

        Steps, in order:
          1. If only `head_mask` is given and encoder/decoder have the same depth, emit a
             FutureWarning and reuse `head_mask` as `decoder_head_mask`.
          2. Encode unless `encoder_outputs` was passed in; a passed-in plain tuple is wrapped
             into `BaseModelOutput` when `return_dict` is truthy.
          3. Derive `decoder_input_ids` from `labels` (shifted right) when no decoder input
             was provided.
          4. When `past_key_values` is None, the decoder has collected confidence samples,
             and fewer than `self.bmm_update_max_iter` refits have been done: refit the
             decoder's mixture model and refresh `self.decoder.bmm_threshold`.
          5. Decode, then fold the decoder's own timers into `self.deploy_time` and reset them.

        Returns:
            Tuple `(encoder_outputs, decoder_outputs)`.
        """
        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                # The module-level message starts with two underscores, so a bare reference
                # inside a class body is name-mangled to
                # `_DeployT5ForConditionalGeneration__HEAD_MASK_WARNING_MSG` and raises
                # NameError. Looking it up by string key avoids the mangling.
                warnings.warn(globals()["__HEAD_MASK_WARNING_MSG"], FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if self.config.use_synchronize: torch.cuda.synchronize()
        start = datetime.datetime.now()
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )
        if self.config.use_synchronize: torch.cuda.synchronize()
        self.deploy_time['time_encoder_forward'] += (datetime.datetime.now() - start)

        hidden_states = encoder_outputs[0]

        # The decoder timing window opens here, so it also covers label shifting and the BMM refit.
        if self.config.use_synchronize: torch.cuda.synchronize()
        start = datetime.datetime.now()
        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # No cache yet (first decoding step of a sequence): refit the mixture model on the
        # samples the decoder has accumulated so far, at most `bmm_update_max_iter` times.
        if past_key_values is None and len(self.decoder.stack_conf_all) > 0 and self.bmm_update_iter < self.bmm_update_max_iter:
            X = np.hstack(self.decoder.stack_conf_all)
            Y = np.hstack(self.decoder.stack_ident_all)
            self.decoder.bmm_model.fit(X, Y)

            # NOTE(review): the meaning of the (0.3, 0.9) arguments is defined by the mixture
            # model's `predict_proba`, which is not visible here - confirm before tuning.
            self.decoder.bmm_threshold = self.decoder.bmm_model.predict_proba(0.3, 0.9)
            self.bmm_update_iter += 1

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            lm_head=self.lm_head,
            cm_head=self.cm_head,
        )
        if self.config.use_synchronize: torch.cuda.synchronize()
        self.deploy_time['time_decoder_forward'] += (datetime.datetime.now() - start)

        # Merge the decoder's per-call timers into the model-level totals, then clear them.
        for k, v in self.decoder.deploy_time.items():
            if isinstance(v, list):
                # Paired timers are summed element-wise.
                self.deploy_time[k] = [_d + _v for _d, _v in zip(self.deploy_time[k], v)]
            else:
                self.deploy_time[k] += v
        self.decoder._reset_time_measure()

        return encoder_outputs, decoder_outputs
    
    @staticmethod
    def apply_repetition_penalty(logits, input_ids, penalty=1.2):
        if penalty == 1.0:
            return logits

        for i in range(input_ids.shape[0]):
            for token_id in set(input_ids[i].tolist()):
                if logits[i, token_id] < 0:
                    logits[i, token_id] *= penalty
                else:
                    logits[i, token_id] /= penalty
        return logits

    def greedy_search(
        self,
        input_ids: torch.LongTensor,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[Union[int, List[int]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: bool = False,
        streamer: Optional["BaseStreamer"] = None,
        **model_kwargs,
    ) -> Union[GreedySearchOutput, torch.LongTensor]:
        r"""
        Generate token ids with **greedy decoding**, extended with the RollBack policy and a
        fixed repetition penalty of 1.2.

        In most cases you do not need to call this method directly; use `generate()` instead.

        RollBack (active only when `config.use_shallow_deep` is set,
        `config.copy_skipped_hidden_states` is falsy and `config.rollback_conf_threshold` is
        not None): predictions made while the decoder is not in its `shallow2deep` state are
        remembered in `self.rollback_candidates`. When a later step runs in `shallow2deep`
        state and returns logits for several positions, each remembered prediction is scored
        against those logits with a per-position cross-entropy. The first position whose loss
        exceeds the threshold triggers a rollback: generation resumes from that position and
        the already emitted ids and the self-attention cache are truncated accordingly.

        Args:
            input_ids: ids generated so far (the decoder prompt).
            logits_processor: processors applied to the next-token logits at every step.
            stopping_criteria: criteria that end generation.
            max_length: deprecated; converted into a stopping criterion with a warning.
            pad_token_id: id appended to sequences that have already finished.
            eos_token_id: id (or list of ids) marking the end of a sequence.
            output_attentions / output_hidden_states / output_scores: whether to collect the
                respective tensors (only returned when `return_dict_in_generate` is truthy).
            return_dict_in_generate: return a structured output instead of a bare tensor.
            synced_gpus: keep calling `forward` until every distributed peer has finished.
            streamer: optional object receiving each new token through `put` and a final `end`.
            **model_kwargs: extra inputs forwarded to `prepare_inputs_for_generation`.

        Returns:
            `GreedySearchEncoderDecoderOutput` or `GreedySearchDecoderOnlyOutput` when
            `return_dict_in_generate` is truthy, otherwise the tensor of generated ids.

        Raises:
            ValueError: if `eos_token_id` is defined but `pad_token_id` is not.
        """

        # init values
        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
        if max_length is not None:
            warnings.warn(
                "`max_length` is deprecated in this function, use"
                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
                UserWarning,
            )
            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
        output_attentions = (
            output_attentions if output_attentions is not None else self.generation_config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
        )
        return_dict_in_generate = (
            return_dict_in_generate
            if return_dict_in_generate is not None
            else self.generation_config.return_dict_in_generate
        )

        # init attention / hidden states / scores tuples
        scores = () if (return_dict_in_generate and output_scores) else None
        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None

        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
        if return_dict_in_generate and self.config.is_encoder_decoder:
            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
            encoder_hidden_states = (
                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
            )

        # keep track of which sequences are already finished
        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)

        this_peer_finished = False  # used by synced_gpus only

        # for RollBack policy: pending shallow predictions and the number of accepted positions
        self.rollback_candidates = ()
        self.pass_length_rollback = 0

        while True:
            if synced_gpus:
                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                # The following logic allows an early break if all peers finished generating their sequence
                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
                # send 0.0 if we finished, 1.0 otherwise
                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
                # did all peers finish? the reduced sum will be 0.0 then
                if this_peer_finished_flag.item() == 0.0:
                    break

            # prepare model inputs
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

            # forward pass to get next token
            outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )

            if synced_gpus and this_peer_finished:
                continue  # don't waste resources running the code we don't need

            # RollBack policy
            if self.config.use_shallow_deep and self.decoder.shallow2deep and not self.config.copy_skipped_hidden_states and self.config.rollback_conf_threshold is not None:
                if self.config.use_synchronize: torch.cuda.synchronize()
                start = datetime.datetime.now()

                seq_len = outputs.logits.size(1)
                if seq_len == 1:
                    # stack_hidden_states is empty, so do not need to RollBack
                    assert len(self.rollback_candidates) == 0
                    self.pass_length_rollback += 1

                else:
                    # we should check RollBack: every position but the last has a pending shallow prediction
                    assert seq_len - 1 == len(self.rollback_candidates)

                    deep_logits = outputs.logits[:, :-1, :]
                    shallow_preds = torch.cat(self.rollback_candidates, dim=0)
                    # squeeze(0) drops the batch dimension, i.e. this path relies on batch size 1
                    rollback_loss = self.criterion(deep_logits.squeeze(0), shallow_preds)

                    for j, _loss in enumerate(rollback_loss):
                        if _loss.item() > self.config.rollback_conf_threshold:
                            # RollBack: continue from the deep logits at the first rejected position
                            outputs.logits = deep_logits[:, [j], :]

                            # remove RollBacked tokens
                            input_ids = input_ids[:, :self.pass_length_rollback + 1]  # consider sos token
                            past_key_values = []
                            for past in outputs.past_key_values:
                                past_key_values += [[past[0][:, :, :self.pass_length_rollback + 1, :],  # self-attn key
                                                     past[1][:, :, :self.pass_length_rollback + 1, :],  # self-attn value
                                                     past[2],
                                                     past[3]],]
                            outputs.past_key_values = past_key_values

                            # take the discarded tokens out of the decoder's counter and count them as rolled back
                            self.decoder.block_op[0] -= (seq_len - 1) - j
                            self.rollback_num += (seq_len - 1) - j
                            break
                        else:
                            self.pass_length_rollback += 1

                    self.rollback_candidates = ()
                    self.pass_length_rollback += 1

                if self.config.use_synchronize: torch.cuda.synchronize()
                self.deploy_time['time_decoder_forward'] += (datetime.datetime.now() - start)

            next_token_logits = outputs.logits[:, -1, :]

            # The repetition penalty is best-effort: a failure must not abort generation, but it
            # should be visible, and KeyboardInterrupt / SystemExit must not be swallowed.
            try:
                next_token_logits = self.apply_repetition_penalty(next_token_logits, input_ids, penalty=1.2)
            except Exception as exc:
                warnings.warn(
                    f"Repetition penalty could not be applied and was skipped for this step: {exc!r}",
                    RuntimeWarning,
                )
            # pre-process distribution
            next_tokens_scores = logits_processor(input_ids, next_token_logits)

            # Store scores, attentions and hidden_states when required
            if return_dict_in_generate:
                if output_scores:
                    scores += (next_tokens_scores,)
                if output_attentions:
                    decoder_attentions += (
                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
                    )
                    if self.config.is_encoder_decoder:
                        cross_attentions += (outputs.cross_attentions,)

                if output_hidden_states:
                    decoder_hidden_states += (
                        (outputs.decoder_hidden_states,)
                        if self.config.is_encoder_decoder
                        else (outputs.hidden_states,)
                    )

            # argmax
            next_tokens = torch.argmax(next_tokens_scores, dim=-1)

            # for RollBack, store Shallow decoder's predictions
            if self.config.use_shallow_deep and not self.decoder.shallow2deep and not self.config.copy_skipped_hidden_states and self.config.rollback_conf_threshold is not None:
                self.rollback_candidates += (next_tokens,)

            # finished sentences should have their next token be a padding token
            if eos_token_id is not None:
                if pad_token_id is None:
                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

            # update generated ids, model inputs, and length for next step
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)

            if streamer is not None:
                streamer.put(next_tokens.cpu())
            model_kwargs = self._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
            )

            # if eos_token was found in one sentence, set sentence to finished
            if eos_token_id_tensor is not None:
                unfinished_sequences = unfinished_sequences.mul(
                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
                )

                # stop when each sentence is finished
                if unfinished_sequences.max() == 0:
                    this_peer_finished = True

            # stop if we exceed the maximum length
            if stopping_criteria(input_ids, scores):
                this_peer_finished = True

            if this_peer_finished and not synced_gpus:
                break

        if streamer is not None:
            streamer.end()

        if return_dict_in_generate:
            if self.config.is_encoder_decoder:
                return GreedySearchEncoderDecoderOutput(
                    sequences=input_ids,
                    scores=scores,
                    encoder_attentions=encoder_attentions,
                    encoder_hidden_states=encoder_hidden_states,
                    decoder_attentions=decoder_attentions,
                    cross_attentions=cross_attentions,
                    decoder_hidden_states=decoder_hidden_states,
                )
            else:
                return GreedySearchDecoderOnlyOutput(
                    sequences=input_ids,
                    scores=scores,
                    attentions=decoder_attentions,
                    hidden_states=decoder_hidden_states,
                )
        else:
            return input_ids