Merge branch 'develop' into better_bleu_doc · speechbrain/speechbrain@29fbe54 · GitHub
[go: up one dir, main page]

Skip to content

Commit 29fbe54

Browse files
authored
Merge branch 'develop' into better_bleu_doc
2 parents 190217c + 7724216 commit 29fbe54

File tree

9 files changed

+721
-25
lines changed

9 files changed

+721
-25
lines changed

recipes/LibriSpeech/ASR/transducer/hparams/conformer_transducer.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ ctc_weight: 0.3 # Multitask with CTC for the encoder (0.0 = disabled)
5757
ce_weight: 0.0 # Multitask with CE for the decoder (0.0 = disabled)
5858
max_grad_norm: 5.0
5959
loss_reduction: 'batchmean'
60-
precision: fp32 # bf16, fp16 or fp32
60+
precision: fp16 # bf16, fp16 or fp32
6161

6262
# The batch size is used if and only if dynamic batching is set to False
6363
# Validation and testing are done with fixed batches and not dynamic batching.
@@ -136,6 +136,7 @@ output_neurons: 1000
136136
dec_dim: 512
137137
dec_emb_dropout: 0.2
138138
dec_dropout: 0.1
139+
attention_type: RoPEMHA
139140

140141
# Decoding parameters
141142
blank_index: 0
@@ -236,7 +237,7 @@ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.Transforme
236237
dropout: !ref <transformer_dropout>
237238
activation: !ref <activation>
238239
encoder_module: conformer
239-
attention_type: RelPosMHAXL
240+
attention_type: !ref <attention_type>
240241
normalize_before: True
241242
causal: False
242243

recipes/LibriSpeech/ASR/transformer/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ Following table contains whisper-finetuning results for 1 epoch using Whisper mo
3737

3838
| Release | hyperparams file | Dev Clean WER (No LM, small beam) | Test Clean WER (Transformer LM) | Test Other WER (Transformer LM) | HuggingFace link | Model link | GPUs |
3939
|:-------------:|:-------------:|:-------------:|:---------------------------:| :-----:| :-----:| :-----:| :--------:|
40+
| 30-09-24 | conformer_large.yaml (new RoPE version) |1.85 with LM | 1.96 | 4.50 | Not Avail. | Not Avail. | 4xA40 46GB |
4041
| 23-05-23 | branchformer_large.yaml | 2.72 (1.9 with LM) | 2.04 | 4.13 | Not Avail. | [DropBox](https://www.dropbox.com/scl/fo/qhtds5rrdvhhhjywa7ovw/AMiIL5YvQENw5JKVpzXlP5o?rlkey=hz8vlpy3qf9kcyfx0cox089e6&st=ufckv6tb&dl=0) | 4xA100 80GB |
42+
| 10-02-25 | conformer_large.yaml | 1.85 with LM | 1.97 | 4.50 | N/A | N/A | 4xA100 80GB |
4143
| 23-05-23 | conformer_large.yaml | 2.62 (1.9 with LM) | 2.01 | 4.52 | [HuggingFace](https://huggingface.co/speechbrain/asr-conformer-transformerlm-librispeech) | [DropBox](https://www.dropbox.com/scl/fo/9we244tgdf47ay20hrdoz/AKnoqQ13nLwSv1ITeJEQ3wY?rlkey=05o5jiszr8rhj6dlprw87t2x4&st=u2odesyk&dl=0) | 4xA100 80GB |
4244
| 24-03-22 | transformer.yaml | 3.32 | 2.27 | 5.53 | [HuggingFace](https://huggingface.co/speechbrain/asr-transformer-transformerlm-librispeech) | [DropBox](https://www.dropbox.com/sh/653kq8h2k87md4p/AAByAaAryXtQKpRzYtzV9ih5a?dl=0) | 4xV100 32GB |
4345
| 24-03-22 | conformer_small.yaml | 4.05 | 2.49 | 6.1 (**only 13.3M parameters**) | [HuggingFace](https://huggingface.co/speechbrain/asr-conformersmall-transformerlm-librispeech) | [DropBox](https://www.dropbox.com/sh/s0x6ni124858b8i/AAALaCH6sGTMRUVTjh8Tm8Jwa?dl=0) | 1xV100 32GB |

recipes/LibriSpeech/ASR/transformer/hparams/conformer_large.yaml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,22 +55,24 @@ max_grad_norm: 5.0
5555
loss_reduction: 'batchmean'
5656
sorting: random
5757
num_workers: 4
58-
precision: fp32 # bf16, fp16 or fp32
58+
precision: fp16 # bf16, fp16 or fp32
5959
avg_checkpoints: 10 # Number of checkpoints to average for evaluation
6060

6161
# stages related parameters
6262
lr_adam: 0.0008
63+
warmup: 50000
64+
augment_warmup: 8000
6365

6466
# Feature parameters
6567
sample_rate: 16000
6668
n_fft: 512
6769
n_mels: 80
6870
win_length: 32
6971

70-
# This setup works well for A100 80GB GPU, adapts it to your needs.
72+
# This setup works well for V100 32GB GPU, adapts it to your needs.
7173
# Or turn it off (but training speed will decrease)
7274
dynamic_batching: True
73-
max_batch_length_train: 500
75+
max_batch_length_train: 150
7476
max_batch_length_val: 100 # we reduce it as the beam is much wider (VRAM)
7577
num_bucket: 200
7678
shuffle: True # if true re-creates batches at each epoch shuffling examples.
@@ -153,7 +155,7 @@ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.Transforme
153155
dropout: !ref <transformer_dropout>
154156
activation: !ref <activation>
155157
encoder_module: conformer
156-
attention_type: RelPosMHAXL
158+
attention_type: RoPEMHA
157159
normalize_before: True
158160
causal: False
159161

@@ -261,7 +263,7 @@ seq_cost: !name:speechbrain.nnet.losses.kldiv_loss
261263

262264
noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler
263265
lr_initial: !ref <lr_adam>
264-
n_warmup_steps: 30000
266+
n_warmup_steps: !ref <warmup>
265267

266268
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
267269
checkpoints_dir: !ref <save_folder>

recipes/LibriSpeech/ASR/transformer/train.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,15 @@ def compute_forward(self, batch, stage):
6262
feats = self.modules.normalize(feats, wav_lens, epoch=current_epoch)
6363

6464
# Add feature augmentation if specified.
65+
augment_warmup = 0
66+
if hasattr(self.hparams, "augment_warmup"):
67+
augment_warmup = self.hparams.augment_warmup
6568
if stage == sb.Stage.TRAIN and hasattr(self.hparams, "fea_augment"):
66-
feats, fea_lens = self.hparams.fea_augment(feats, wav_lens)
67-
tokens_bos = self.hparams.fea_augment.replicate_labels(tokens_bos)
69+
if self.optimizer_step > augment_warmup:
70+
feats, fea_lens = self.hparams.fea_augment(feats, wav_lens)
71+
tokens_bos = self.hparams.fea_augment.replicate_labels(
72+
tokens_bos
73+
)
6874

6975
# forward modules
7076
src = self.modules.CNN(feats)
@@ -118,7 +124,13 @@ def compute_objectives(self, predictions, batch, stage):
118124
if stage == sb.Stage.TRAIN:
119125
# Labels must be extended if parallel augmentation or concatenated
120126
# augmentation was performed on the input (increasing the time dimension)
121-
if hasattr(self.hparams, "fea_augment"):
127+
augment_warmup = 0
128+
if hasattr(self.hparams, "augment_warmup"):
129+
augment_warmup = self.hparams.augment_warmup
130+
if (
131+
hasattr(self.hparams, "fea_augment")
132+
and self.optimizer_step > augment_warmup
133+
):
122134
(
123135
tokens,
124136
tokens_lens,

speechbrain/lobes/models/transformer/Conformer.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* Jianyuan Zhong 2020
66
* Samuele Cornell 2021
77
* Sylvain de Langen 2023
8+
* Shucong Zhang 2024
89
"""
910

1011
import warnings
@@ -21,6 +22,7 @@
2122
MultiheadAttention,
2223
PositionalwiseFeedForward,
2324
RelPosMHAXL,
25+
RoPEMHA,
2426
)
2527
from speechbrain.nnet.hypermixing import HyperMixing
2628
from speechbrain.nnet.normalization import LayerNorm
@@ -407,6 +409,12 @@ def __init__(
407409
num_heads=nhead,
408410
fix_tm_hidden_size=False,
409411
)
412+
elif attention_type == "RoPEMHA":
413+
self.mha_layer = RoPEMHA(
414+
num_heads=nhead,
415+
embed_dim=d_model,
416+
dropout=dropout,
417+
)
410418

411419
self.convolution_module = ConvolutionModule(
412420
d_model, kernel_size, bias, activation, dropout, causal=causal
@@ -728,7 +736,7 @@ def forward(
728736
if self.attention_type == "RelPosMHAXL":
729737
if pos_embs is None:
730738
raise ValueError(
731-
"The chosen attention type for the Conformer is RelPosMHAXL. For this attention type, the positional embeddings are mandatory"
739+
f"The chosen attention type for the Conformer is {self.attention_type}. For this attention type, the positional embeddings are mandatory"
732740
)
733741

734742
output = src
@@ -794,10 +802,13 @@ def forward_streaming(
794802
The attention values.
795803
"""
796804

797-
if self.attention_type == "RelPosMHAXL":
805+
if (
806+
self.attention_type == "RelPosMHAXL"
807+
or self.attention_type == "RoPEMHA"
808+
):
798809
if pos_embs is None:
799810
raise ValueError(
800-
"The chosen attention type for the Conformer is RelPosMHAXL. For this attention type, the positional embeddings are mandatory"
811+
f"The chosen attention type for the Conformer is {self.attention_type}. For this attention type, the positional embeddings are mandatory"
801812
)
802813

803814
output = src

speechbrain/lobes/models/transformer/Transformer.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Authors
33
* Jianyuan Zhong 2020
44
* Samuele Cornell 2021
5+
* Shucong Zhang 2024
56
"""
67

78
import math
@@ -137,7 +138,12 @@ def __init__(
137138
self.output_hidden_states = output_hidden_states
138139
self.layerdrop_prob = layerdrop_prob
139140

140-
assert attention_type in ["regularMHA", "RelPosMHAXL", "hypermixing"]
141+
assert attention_type in [
142+
"regularMHA",
143+
"RelPosMHAXL",
144+
"hypermixing",
145+
"RoPEMHA",
146+
]
141147
assert positional_encoding in ["fixed_abs_sine", None]
142148

143149
assert (
@@ -157,6 +163,11 @@ def __init__(
157163
d_model, max_length
158164
)
159165

166+
if attention_type == "RoPEMHA":
167+
self.positional_encoding_decoder = PositionalEncoding(
168+
d_model, max_length
169+
)
170+
160171
# initialize the encoder
161172
if num_encoder_layers > 0:
162173
if custom_src_module is not None:
@@ -374,6 +385,12 @@ def __init__(
374385
num_heads=nhead,
375386
fix_tm_hidden_size=False,
376387
)
388+
elif attention_type == "RoPEMHA":
389+
self.self_att = sb.nnet.attention.RoPEMHA(
390+
d_model,
391+
nhead,
392+
dropout,
393+
)
377394

378395
if ffn_type == "regularFFN":
379396
self.pos_ffn = sb.nnet.attention.PositionalwiseFeedForward(
@@ -704,7 +721,6 @@ def __init__(
704721
vdim=vdim,
705722
dropout=dropout,
706723
)
707-
708724
elif attention_type == "RelPosMHAXL":
709725
self.self_attn = sb.nnet.attention.RelPosMHAXL(
710726
d_model, nhead, dropout, mask_pos_future=causal
@@ -787,7 +803,6 @@ def forward(
787803
tgt1 = tgt
788804

789805
# multi-head attention over the target sequence and encoder states
790-
791806
tgt2, multihead_attention = self.multihead_attn(
792807
query=tgt1,
793808
key=memory,

speechbrain/lobes/models/transformer/TransformerASR.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* Jianyuan Zhong 2020
55
* Titouan Parcollet 2024
66
* Luca Della Libera 2024
7+
* Shucong Zhang 2024
78
"""
89

910
from dataclasses import dataclass
@@ -362,12 +363,15 @@ def forward(self, src, tgt, wav_len=None, pad_idx=0):
362363

363364
src = self.custom_src_module(src)
364365
# add pos encoding to queries if are sinusoidal ones else
365-
if self.attention_type == "hypermixing":
366+
if (
367+
self.attention_type == "hypermixing"
368+
or self.attention_type == "RoPEMHA"
369+
):
366370
pos_embs_encoder = None
367371
elif self.attention_type == "RelPosMHAXL":
368372
pos_embs_encoder = self.positional_encoding(src)
369373
elif self.positional_encoding_type == "fixed_abs_sine":
370-
src = src + self.positional_encoding(src) # add the encodings here
374+
src = src + self.positional_encoding(src)
371375
pos_embs_encoder = None
372376

373377
outputs = self.encoder(
@@ -388,9 +392,12 @@ def forward(self, src, tgt, wav_len=None, pad_idx=0):
388392

389393
tgt = self.custom_tgt_module(tgt)
390394

391-
if self.attention_type == "RelPosMHAXL":
395+
if (
396+
self.attention_type == "RelPosMHAXL"
397+
or self.attention_type == "RoPEMHA"
398+
):
392399
tgt = tgt + self.positional_encoding_decoder(tgt)
393-
pos_embs_encoder = None # self.positional_encoding(src)
400+
pos_embs_encoder = None
394401
pos_embs_target = None
395402
elif (
396403
self.positional_encoding_type == "fixed_abs_sine"
@@ -439,15 +446,19 @@ def decode(self, tgt, encoder_out, enc_len=None):
439446
src_key_padding_mask = (1 - length_to_mask(enc_len)).bool()
440447

441448
tgt = self.custom_tgt_module(tgt)
442-
if self.attention_type == "RelPosMHAXL":
449+
450+
if (
451+
self.attention_type == "RelPosMHAXL"
452+
or self.attention_type == "RoPEMHA"
453+
):
443454
tgt = tgt + self.positional_encoding_decoder(tgt)
444-
pos_embs_encoder = None # self.positional_encoding(src)
455+
pos_embs_encoder = None
445456
pos_embs_target = None
446457
elif (
447458
self.positional_encoding_type == "fixed_abs_sine"
448459
or self.attention_type == "hypermixing"
449460
):
450-
tgt = tgt + self.positional_encoding(tgt) # add the encodings here
461+
tgt = tgt + self.positional_encoding(tgt)
451462
pos_embs_target = None
452463
pos_embs_encoder = None
453464

@@ -506,7 +517,10 @@ def encode(
506517
)
507518

508519
src = self.custom_src_module(src)
509-
if self.attention_type == "hypermixing":
520+
if (
521+
self.attention_type == "hypermixing"
522+
or self.attention_type == "RoPEMHA"
523+
):
510524
pos_embs_source = None
511525
elif self.attention_type == "RelPosMHAXL":
512526
pos_embs_source = self.positional_encoding(src)
@@ -612,6 +626,8 @@ def encode_streaming(self, src, context: TransformerASRStreamingContext):
612626
src = self.custom_src_module(src)
613627
if self.attention_type == "RelPosMHAXL":
614628
pos_embs_source = self.positional_encoding(pos_encoding_dummy)
629+
elif self.attention_type == "RoPEMHA":
630+
pos_embs_source = None
615631

616632
elif self.positional_encoding_type == "fixed_abs_sine":
617633
src = src + self.positional_encoding(pos_encoding_dummy)

0 commit comments

Comments
 (0)