add src_q/k/v_bias for cross_att (#2685)
* add src_q/k/v_bias for cross_att

* fix lint

* add src_key_bias for whisper cross attn
Mddct authored Feb 7, 2025
1 parent ead4d14 commit 59dc505
Showing 7 changed files with 11 additions and 2 deletions.
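
In short: the decoder's cross-attention layer previously reused the self-attention bias flags (query_bias, key_bias, value_bias). This commit gives cross-attention its own src_query_bias, src_key_bias, and src_value_bias flags, all defaulting to True so existing configs keep their behavior; the Whisper configs and converter then set src_key_bias to false, since Whisper's key projections carry no bias. A minimal sketch of how the new flags might be passed when building a decoder (assuming the class is wenet's TransformerDecoder from wenet/transformer/decoder.py; the sizes are illustrative assumptions, not values from this diff):

```python
from wenet.transformer.decoder import TransformerDecoder

# Illustrative sizes only; the bias flags are the point of this commit.
decoder = TransformerDecoder(
    vocab_size=5000,
    encoder_output_size=512,
    key_bias=False,        # self-attention: no bias on the key projection
    src_query_bias=True,   # new: cross-attention query projection bias
    src_key_bias=False,    # new: cross-attention key projection bias
    src_value_bias=True,   # new: cross-attention value projection bias
)
```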
Five decoder config files (names not preserved in this capture) receive the same one-line addition to decoder_conf:

```diff
@@ -25,6 +25,7 @@ decoder_conf:
     gradient_checkpointing: true
     input_layer: embed_learnable_pe
     key_bias: false
+    src_key_bias: false
     linear_units: 5120
     normalize_before: true
     num_blocks: 32
```
7 changes: 5 additions & 2 deletions wenet/transformer/decoder.py
```diff
@@ -90,6 +90,9 @@ def __init__(
         mlp_bias: bool = True,
         n_expert: int = 8,
         n_expert_activated: int = 2,
+        src_query_bias: bool = True,
+        src_key_bias: bool = True,
+        src_value_bias: bool = True,
     ):
         super().__init__()
         attention_dim = encoder_output_size
```
```diff
@@ -123,8 +126,8 @@ def __init__(
                     value_bias, use_sdpa, n_kv_head, head_dim),
                 WENET_ATTENTION_CLASSES["crossattn"](
                     attention_heads, attention_dim, src_attention_dropout_rate,
-                    query_bias, key_bias, value_bias, use_sdpa, n_kv_head,
-                    head_dim) if src_attention else None,
+                    src_query_bias, src_key_bias, src_value_bias, use_sdpa,
+                    n_kv_head, head_dim) if src_attention else None,
                 mlp_class(attention_dim,
                           linear_units,
                           dropout_rate,
```
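
The "crossattn" entry in WENET_ATTENTION_CLASSES now receives the src_* flags instead of the self-attention ones. A minimal sketch of what per-projection bias control looks like inside a multi-head attention module (a simplified stand-in, not the actual wenet class; only the projection layers are shown):

```python
import torch.nn as nn


class CrossAttentionProjections(nn.Module):
    """Sketch: each projection can independently drop its bias term,
    matching the query_bias/key_bias/value_bias positional arguments
    the decoder now fills with src_* values for cross-attention."""

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 query_bias: bool = True,
                 key_bias: bool = True,
                 value_bias: bool = True):
        super().__init__()
        assert n_feat % n_head == 0, "n_feat must be divisible by n_head"
        self.linear_q = nn.Linear(n_feat, n_feat, bias=query_bias)
        self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
        self.linear_v = nn.Linear(n_feat, n_feat, bias=value_bias)
        self.linear_out = nn.Linear(n_feat, n_feat)
        self.dropout = nn.Dropout(p=dropout_rate)
```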
1 change: 1 addition & 0 deletions wenet/whisper/convert_whisper_to_wenet_config_and_ckpt.py
```diff
@@ -84,6 +84,7 @@ def convert_to_wenet_yaml(tokenizer, dims, wenet_yaml_path: str):
     configs['decoder_conf']['normalize_before'] = True
     configs['decoder_conf']['src_attention'] = True
     configs['decoder_conf']['key_bias'] = False
+    configs['decoder_conf']['src_key_bias'] = False
     configs['decoder_conf']['activation_type'] = "gelu"

     configs['tokenizer'] = 'whisper'
```
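
For context: OpenAI Whisper defines its attention key projections without a bias term (nn.Linear(..., bias=False)) while query and value projections keep theirs, which is why the converter disables both key_bias and src_key_bias. A quick sanity check against a downloaded checkpoint (the file path and state-dict key names are assumptions about the OpenAI checkpoint layout):

```python
import torch

# Path is an assumption; any OpenAI Whisper checkpoint should do.
ckpt = torch.load("large-v3.pt", map_location="cpu")
state = ckpt["model_state_dict"]

# Key projections carry no bias, in self- and cross-attention alike;
# query and value projections do.
assert "decoder.blocks.0.cross_attn.key.bias" not in state
assert "decoder.blocks.0.cross_attn.query.bias" in state
assert "decoder.blocks.0.cross_attn.value.bias" in state
```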
