E5E6 Tokotron: Tokenized TTS (lite version - minimal dependencies) by flexthink · Pull Request #2849 · speechbrain/speechbrain · GitHub
Draft
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -4,6 +4,7 @@ repos:
hooks:
- id: trailing-whitespace
types: [file, text]
exclude: "(char_[a-z]{2}.txt|arpabet.txt)"
- id: end-of-file-fixer
types: [python]
- id: requirements-txt-fixer
7 changes: 7 additions & 0 deletions recipes/LJSpeech/TTS/README.md
@@ -28,6 +28,13 @@ You can find the pre-trained model with an easy-inference function on [HuggingFa
# FastSpeech2
The subfolder "fastspeech2" contains the recipes for training the non-autoregressive transformer based TTS model [FastSpeech2](https://arxiv.org/abs/2006.04558).

# Tokotron
The subfolder "tokotron" contains the recipes for training a transformer-based model that uses discrete audio representations.
You can find the pre-trained model with an easy-inference function on [HuggingFace](https://huggingface.co/speechbrain/tts-tokotron-wavlm-ljspeech).



### FastSpeech2 with pre-extracted durations from a forced aligner
Training FastSpeech2 requires pre-extracted phoneme alignments (durations). The LJSpeech phoneme alignments from Montreal Forced Aligner are automatically downloaded, decompressed and stored at this location: ```/your_folder/LJSpeech-1.1/TextGrid```.

321 changes: 321 additions & 0 deletions recipes/LJSpeech/TTS/tokotron/hparams/train.yaml
@@ -0,0 +1,321 @@
# ############################################################################
# Model: Tokenized TTS (WhisperSpeech-inspired)
# Authors: Artem Ploujnikov
# ############################################################################
# Seed needs to be set at top of yaml, before objects with parameters are made

seed: 74443
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/transformer/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data files
data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech-1.1
prepare_save_folder: !ref <data_folder>/prepared
pretrained_model_save_folder: !ref <prepare_save_folder>
ssl_model_type: wavlm
representation_mode: discrete
prepare_skip_ignore_folders: False
train_json: !ref <prepare_save_folder>/train.json
valid_json: !ref <prepare_save_folder>/valid.json
test_json: !ref <prepare_save_folder>/test.json
progress_folder: !ref <output_folder>/progress
valid_inter_data_count: 50
samples_interval: 1

freeze_token_model: True
token_model_src: !apply:speechbrain.utils.hparams.choice
value: !ref <ssl_model_type>
choices:
wavlm: microsoft/wavlm-large
hubert: facebook/hubert-large-ll60k
wav2vec2: facebook/wav2vec2-large-960h-lv60-self

g2p_src: speechbrain/soundchoice-g2p

vocoder_repo_id_discrete: speechbrain/hifigan-<ssl_model_type>-k1000-LibriTTS
vocoder_repo_id_continuous: !apply:speechbrain.utils.hparams.choice
value: !ref <ssl_model_type>
choices:
wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS
hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS
wav2vec2: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS
vocoder_repo_id: !apply:speechbrain.utils.hparams.choice
value: !ref <representation_mode>
choices:
discrete: !ref <vocoder_repo_id_discrete>
continuous: !ref <vocoder_repo_id_continuous>

token_model_kmeans_dataset: LibriSpeech
speech_model_layers: [1, 3, 7, 12, 18, 23]
spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec
use_spk_emb: False
spk_emb_injection: null
splits: ["train", "valid", "test"]
split_ratio: [90, 5, 5]


ckpt_interval_minutes: 30 # save checkpoint every N min

# Training parameters
input: text
number_of_epochs: 1000
batch_size: 16
grad_accumulation_factor: 1
max_grad_norm: 0.01
sorting: random
num_workers: 4
skip_prep: False
overfit_test: False
overfit_test_sample_count: !ref <batch_size>
overfit_test_epoch_data_count: 1000


# index
pad_index: 0
bos_index: 0
bos_width: 1
eos_index: 0
eos_width: 1
audio_token_shift: 0

# stages related parameters
lr: 0.0005
lr_warmup_steps: 10000
lr_annealing_mode: step
guided_attention_weight: 50.0
guided_attention_sigma: 0.5
gate_loss_weight: 1.0
gate_threshold: 0.5
gate_loss_beta: 0.2
gate_loss_gamma: 0.01
gate_loss_max_weight: 1.

# Inference parameters
eos_mode: gate
scale_factor: 4

# Feature parameters
sample_rate: 22050
model_sample_rate: 16000
max_audio_length: 1000
infer_max_audio_length: !ref <max_audio_length>

# Label encoder
label_encoder: !new:speechbrain.dataio.encoder.TextEncoder
token_list_file_text: ./tokens/char_en.txt
token_list_file_phn: ./tokens/arpabet.txt
token_list_file: !apply:speechbrain.utils.hparams.choice
value: !ref <input>
choices:
text: !ref <token_list_file_text>
phonemes: !ref <token_list_file_phn>

# Gate offset
gate_offset: !apply:speechbrain.lobes.models.discrete.Tokotron.distance_diff_loss_ramp
beta: !ref <gate_loss_beta>
gamma: !ref <gate_loss_gamma>
max_weight: !ref <gate_loss_max_weight>

silence_padding: !ref <gate_offset>
use_silence_padding: True


# Token model (pretrained)
ssl_model: !apply:speechbrain.utils.hparams.choice
value: !ref <ssl_model_type>
choices:
wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
source: !ref <token_model_src>
save_path: !ref <pretrained_model_save_folder>
freeze: !ref <freeze_token_model>
output_all_hiddens: True
hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
source: !ref <token_model_src>
save_path: !ref <pretrained_model_save_folder>
freeze: !ref <freeze_token_model>
output_all_hiddens: True
wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
source: !ref <token_model_src>
save_path: !ref <pretrained_model_save_folder>
freeze: !ref <freeze_token_model>
output_all_hiddens: True


token_model: !apply:speechbrain.utils.hparams.conditional
condition: !ref <representation_mode>
condition_value: discrete
apply: True
value: !name:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL
ssl_model: !ref <ssl_model>
vocoder_repo_id: !ref <vocoder_repo_id>
kmeans_dataset: !ref <token_model_kmeans_dataset>
num_clusters: !ref <audio_num_tokens>
save_path: !ref <pretrained_model_save_folder>
layers_num: !ref <speech_model_layers>

audio_model: !apply:speechbrain.utils.hparams.choice
value: !ref <representation_mode>
choices:
discrete: !ref <token_model>
continuous: !ref <ssl_model>

spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams
source: !ref <spk_emb_src>
savedir: !ref <pretrained_model_save_folder>/ecapa

# Dataloader options
train_dataloader_opts:
batch_size: !ref <batch_size>
shuffle: True
num_workers: !ref <num_workers>
collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
padding_kwargs:
value: !ref <pad_index>

valid_dataloader_opts:
batch_size: !ref <batch_size>
num_workers: !ref <num_workers>
collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
padding_kwargs:
value: !ref <pad_index>

test_dataloader_opts:
batch_size: 1
num_workers: !ref <num_workers>
collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
padding_kwargs:
value: !ref <pad_index>

token_model_kwargs:
SSL_layers: !ref <speech_model_layers>
deduplicates: null
bpe_tokenizers: null

####################### Model parameters ###########################
# Transformer
d_model: 512
nhead: 4
enc_num_layers: 6
dec_num_layers: 12
layerwise_renorm: True
d_ffn: 2048
transformer_dropout: 0.2
target_dropout: 0.2
emb_dropout: 0.0
activation: !name:torch.nn.GELU
audio_num_tokens: 1000
audio_dim: 1024
audio_emb_size: 1024
audio_emb_freeze: False
text_num_tokens: 39
phn_num_tokens: 52
input_num_tokens: !apply:speechbrain.utils.hparams.choice
value: !ref <input>
choices:
text: !ref <text_num_tokens>
phonemes: !ref <phn_num_tokens>
audio_tokens_per_step: 6
attention_type: regularMHA

############################## models ################################

emb:
spk:
kind: "pretrained"
dim: 192
vocoder: True
injection: !ref <spk_emb_injection>

model: !new:speechbrain.lobes.models.discrete.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length
input_num_tokens: !ref <input_num_tokens>
audio_num_tokens: !ref <audio_num_tokens>
audio_tokens_per_step: !ref <audio_tokens_per_step>
d_model: !ref <d_model>
d_ffn: !ref <d_ffn>
nhead: !ref <nhead>
enc_num_layers: !ref <enc_num_layers>
dec_num_layers: !ref <dec_num_layers>
dropout: !ref <transformer_dropout>
target_dropout: !ref <target_dropout>
emb_dropout: !ref <emb_dropout>
activation: !ref <activation>
attention_type: !ref <attention_type>
gate_threshold: !ref <gate_threshold>
gate_offset: !ref <gate_offset>
audio_emb_size: !ref <audio_emb_size>
audio_emb_freeze: !ref <audio_emb_freeze>
max_audio_length: !ref <max_audio_length>
eos_mode: !ref <eos_mode>
infer_max_audio_length: !ref <infer_max_audio_length>
audio_token_shift: !ref <audio_token_shift>
scale_factor: !ref <scale_factor>
representation_mode: !ref <representation_mode>
emb: !apply:speechbrain.utils.hparams.choice
value: !ref <use_spk_emb>
choices:
True: !ref <emb>
False: null
layerwise_renorm: !ref <layerwise_renorm>
vocoder: !ref <vocoder>

vocoder_continuous: !name:speechbrain.inference.vocoders.HIFIGAN.from_hparams
source: !ref <vocoder_repo_id>
savedir: !ref <pretrained_model_save_folder>/hifigan-continuous-<ssl_model_type>


vocoder_discrete: !name:speechbrain.lobes.models.discrete.Tokotron.DiscreteSSLVocoder
discrete_ssl: !ref <token_model>
layers: !ref <speech_model_layers>

vocoder: !apply:speechbrain.utils.hparams.choice
value: !ref <representation_mode>
apply: True
choices:
continuous: !ref <vocoder_continuous>
discrete: !ref <vocoder_discrete>

modules:
model: !ref <model>
audio_model: !ref <audio_model>
compute_cost: !ref <compute_cost>
vocoder: !ref <vocoder>

# define two optimizers here for two-stage training
opt_class: !name:torch.optim.Adam
lr: !ref <lr>

compute_cost: !new:speechbrain.lobes.models.discrete.Tokotron.TokotronLoss
guided_attention_weight: !ref <guided_attention_weight>
guided_attention_sigma: !ref <guided_attention_sigma>
gate_weight: !ref <gate_loss_weight>
gate_beta: !ref <gate_loss_beta>
gate_gamma: !ref <gate_loss_gamma>
gate_max_weight: !ref <gate_loss_max_weight>
silence_padding: !ref <silence_padding>
eos_mode: !ref <eos_mode>
bos_width: !ref <bos_width>
eos_index: !ref <eos_index>
eos_width: !ref <eos_width>
audio_tokens_per_step: !ref <audio_tokens_per_step>
audio_token_shift: !ref <audio_token_shift>
representation_mode: !ref <representation_mode>


lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler
lr_initial: !ref <lr>
n_warmup_steps: !ref <lr_warmup_steps>

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables:
model: !ref <model>
lr_scheduler: !ref <lr_annealing>
counter: !ref <epoch_counter>

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
limit: !ref <number_of_epochs>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
save_file: !ref <train_log>
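The hparams file above repeatedly uses `!apply:speechbrain.utils.hparams.choice` to pick a value based on another hyperparameter (e.g., selecting the SSL model source from `ssl_model_type`). As a rough illustration only — a minimal sketch of the selection pattern, not SpeechBrain's actual implementation — the mechanism reduces to a keyed lookup:

```python
# Hypothetical reimplementation of the "choice" selection pattern used by
# !apply:speechbrain.utils.hparams.choice in the YAML above. Illustrative
# sketch only; the real SpeechBrain helper may differ.
def choice(value, choices, default=None):
    """Return choices[value], falling back to a default if one is given."""
    if value in choices:
        return choices[value]
    if default is not None:
        return default
    raise KeyError(f"No choice defined for {value!r}")

# Mirrors the token_model_src selection in the hparams file.
token_model_src = choice(
    "wavlm",
    {
        "wavlm": "microsoft/wavlm-large",
        "hubert": "facebook/hubert-large-ll60k",
        "wav2vec2": "facebook/wav2vec2-large-960h-lv60-self",
    },
)
print(token_model_src)  # microsoft/wavlm-large
```

Resolving all such selectors at YAML-load time keeps the rest of the recipe free of model-type branching.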
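The `lr_annealing` block configures a Noam schedule with `lr_initial: 0.0005` and `n_warmup_steps: 10000`. A sketch of the usual Noam curve — linear warmup followed by inverse-square-root decay, normalized here so the rate peaks at `lr_initial` exactly at the warmup boundary; SpeechBrain's `NoamScheduler` may differ in details:

```python
def noam_lr(step, lr_initial=0.0005, n_warmup_steps=10000):
    """Noam-style schedule: linear warmup, then inverse-sqrt decay.

    Normalized so the learning rate equals lr_initial at
    step == n_warmup_steps. Sketch of the common formulation only;
    the SpeechBrain implementation may differ.
    """
    scale = n_warmup_steps ** 0.5
    return lr_initial * scale * min(step ** -0.5, step * n_warmup_steps ** -1.5)

# Rate grows linearly during warmup, peaks at the boundary, then decays.
rates = [noam_lr(s) for s in (1, 5000, 10000, 40000)]
```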
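`guided_attention_weight` and `guided_attention_sigma` configure a guided attention loss, which penalizes encoder-decoder attention that strays far from a diagonal text-to-audio alignment. A sketch of the standard guided-attention penalty matrix (the form popularized by DC-TTS-style models; the exact formulation inside `TokotronLoss` may differ):

```python
import math

def guided_attention_weights(n_text, n_audio, sigma=0.5):
    """Penalty matrix: 0 on the diagonal, approaching 1 far from it.

    W[t][n] = 1 - exp(-((n/N - t/T)^2) / (2 * sigma^2)).
    Standard guided-attention sketch; TokotronLoss may differ.
    """
    w = [[0.0] * n_text for _ in range(n_audio)]
    for t in range(n_audio):
        for n in range(n_text):
            d = n / n_text - t / n_audio
            w[t][n] = 1.0 - math.exp(-(d * d) / (2 * sigma ** 2))
    return w

W = guided_attention_weights(n_text=6, n_audio=6)
# Diagonal entries are 0 (no penalty); entries grow toward 1 off-diagonal.
```

Multiplying this matrix element-wise with the attention weights and summing gives a loss that is cheap to compute and strongly encourages monotonic alignment early in training.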
1 change: 1 addition & 0 deletions recipes/LJSpeech/TTS/tokotron/ljspeech_prepare.py
51 changes: 51 additions & 0 deletions recipes/LJSpeech/TTS/tokotron/tokens/arpabet.txt
@@ -0,0 +1,51 @@
AA
AE
AH
AO
AW
AY
B
CH
D
DH
EH
ER
EY
F
G
HH
IH
IY
JH
K
L
M
N
NG
OW
OY
P
R
S
SH
T
TH
UH
UW
V
W
Y
Z
ZH
'
"
!
(
)
,
-
.
:
;
?
