Commit 96b00ca

SpeechLLM (with LLaMA) and Conformer recipe for speech translation on CoVoST (Code from Samsung AI Center Cambridge) (#2865)
Co-authored-by: Adel Moumen <adelmoumen.pro@gmail.com>
Co-authored-by: Adel Moumen <88119391+Adel-Moumen@users.noreply.github.com>
1 parent 33474cb commit 96b00ca

14 files changed

Lines changed: 2156 additions & 3 deletions

File tree

recipes/CoVoST/AST/README.md

Lines changed: 44 additions & 0 deletions
# CoVoST speech-to-text translation

This folder contains the scripts necessary to run automatic speech translation on the [CoVoST dataset](https://github.com/facebookresearch/covost), which is based on [CommonVoice](https://commonvoice.mozilla.org/en/datasets).

Two approaches are available:
1. Training from scratch with a Conformer encoder-decoder model and multitask training (speech recognition plus speech translation).
2. SpeechLLM fine-tuning based on SSL speech encoders and LLaMA large language models (with and without adapters).

# How to run
```shell
python train{_xlsr_llama}.py hparams/{hparam_file}.yaml
```

# Data preparation
Note that CommonVoice distributes its audio as mp3 files. These can be converted to .wav during data preparation: this speeds up training, but makes the first data preparation run quite slow. Audio files are downsampled on the fly within the dataio function of the training script.

# Languages
While CoVoST offers multiple language pairs, this recipe was only tested on English-to-German translation. However, nothing special is required to select another language pair, aside from adding proper text normalisation in the `covost_prepare.py` file.

# Results
| Language | hyperparams file | Encoder | LLM | Test BLEU | Hugging Face link | Model link | GPUs |
| ------------- |:-------------:|:-------------:|:-------------:|:-------------:|:-------------:|:-------------:|:-------------:|
| English - German | conformer.yaml | conformer | None | 13.9 | None | None | 2x A40 |
| English - German | w2v2_llama3.yaml | wavlm-large | LLaMA 3.1 7B | 27.2 | None | None | 2x A100 |

# **About SpeechBrain**
- Website: https://speechbrain.github.io/
- Code: https://github.com/speechbrain/speechbrain/
- HuggingFace: https://huggingface.co/speechbrain/

# **Citing SpeechBrain**
Please cite SpeechBrain if you use it for your research or business.

```bibtex
@misc{speechbrainV1,
  title={Open-Source Conversational AI with SpeechBrain 1.0},
  author={Mirco Ravanelli and Titouan Parcollet and Adel Moumen and Sylvain de Langen and Cem Subakan and Peter Plantinga and Yingzhi Wang and Pooneh Mousavi and Luca Della Libera and Artem Ploujnikov and Francesco Paissan and Davide Borra and Salah Zaiem and Zeyu Zhao and Shucong Zhang and Georgios Karakasidis and Sung-Lin Yeh and Pierre Champion and Aku Rouhe and Rudolf Braun and Florian Mai and Juan Zuluaga-Gomez and Seyed Mahed Mousavi and Andreas Nautsch and Xuechen Liu and Sangeet Sagar and Jarod Duret and Salima Mdhaffar and Gaelle Laperriere and Mickael Rouvier and Renato De Mori and Yannick Esteve},
  year={2024},
  eprint={2407.00463},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2407.00463},
}
```
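The Test BLEU figures in this recipe are computed with `sacrebleu`. As a rough illustration of what the metric measures, here is a minimal, self-contained sketch of corpus-level BLEU (clipped n-gram precision plus a brevity penalty); the function names are ours, and a real evaluation should use `sacrebleu` itself rather than this simplification.

```python
import math
from collections import Counter

def ngrams(tokens, n):
    """All contiguous n-grams of a token list."""
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def corpus_bleu(hypotheses, references, max_n=4):
    """Minimal corpus BLEU: clipped n-gram precisions + brevity penalty."""
    matches = [0] * max_n
    totals = [0] * max_n
    hyp_len = ref_len = 0
    for hyp, ref in zip(hypotheses, references):
        hyp, ref = hyp.split(), ref.split()
        hyp_len += len(hyp)
        ref_len += len(ref)
        for n in range(1, max_n + 1):
            hyp_counts = Counter(ngrams(hyp, n))
            ref_counts = Counter(ngrams(ref, n))
            # Clip each hypothesis n-gram count by its count in the reference.
            matches[n - 1] += sum(min(c, ref_counts[g]) for g, c in hyp_counts.items())
            totals[n - 1] += max(len(hyp) - n + 1, 0)
    if min(matches) == 0:
        return 0.0
    log_precision = sum(math.log(m / t) for m, t in zip(matches, totals)) / max_n
    # Brevity penalty: punish hypotheses shorter than the references.
    bp = 1.0 if hyp_len > ref_len else math.exp(1 - ref_len / hyp_len)
    return bp * math.exp(log_precision)

print(corpus_bleu(["the cat sat on the mat"], ["the cat sat on the mat"]))  # → 1.0
```

Note that `sacrebleu` also applies its own tokenization and reports scores scaled to 0-100, so its numbers are not directly comparable with this toy function.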
Lines changed: 1 addition & 0 deletions
../covost_prepare.py

Lines changed: 1 addition & 0 deletions
sacrebleu

Lines changed: 268 additions & 0 deletions
# ############################################################################
# Model: E2E AST with Transformer
# Encoder: Conformer Encoder
# Decoder: Transformer Decoder
# Tokens: unigram
# Losses: CTC + KLdiv (label smoothing loss)
# Authors: Titouan Parcollet
# ############################################################################

# Seed needs to be set at the top of the yaml, before objects with parameters are made
seed: 3407
__set_seed: !apply:speechbrain.utils.seed_everything [!ref <seed>]
output_folder: !ref results/conformer_en/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data files
data_folder: !PLACEHOLDER # e.g., /localscratch/cv-corpus-4.0-en/fr
train_tsv_file: !PLACEHOLDER # Standard CoVoST .tsv files
dev_tsv_file: !PLACEHOLDER # Standard CoVoST .tsv files
test_tsv_file: !PLACEHOLDER # Standard CoVoST .tsv files
src_language: en
tgt_language: de
train_csv: !ref <output_folder>/train.csv
valid_csv: !ref <output_folder>/dev.csv
test_csv: !ref <output_folder>/test.csv
skip_prep: False # Skip data preparation
convert_to_wav: True # Set to True to convert all mp3 files to wav.

# We remove utterances longer than 10 s from the train/dev/test sets, as
# longer sentences most likely correspond to "open microphones".
avoid_if_longer_than: 10.0
avoid_if_shorter_than: 1.0

# THIS IS TERRIBLE, BUT WE HAVE NO CHOICE.
# Some versions of the CV dataset may contain one or two files of more than
# 2 min in the validation and/or test sets. This is an error by design of the
# dataset, as these files contain 90% silence. We exclude them.
avoid_if_longer_than_val_test: 90.0

ckpt_interval_minutes: 15 # save a checkpoint every N minutes
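The duration thresholds above amount to a simple filter applied during data preparation. Here is a minimal sketch of one plausible reading of those thresholds (the function and argument names are ours, not the recipe's; the actual filtering lives in the data preparation and dataio code):

```python
def keep_utterance(duration_s, is_train,
                   min_s=1.0, max_s=10.0, max_val_test_s=90.0):
    """Mirror the avoid_if_* thresholds: keep 1-10 s clips for training,
    and drop only the rare >90 s outliers from dev/test."""
    if is_train:
        return min_s <= duration_s <= max_s
    return duration_s <= max_val_test_s

# Example: an 11 s clip is dropped from train but kept in dev/test.
print(keep_utterance(11.0, is_train=True), keep_utterance(11.0, is_train=False))  # → False True
```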
####################### Training Parameters ####################################
number_of_epochs: 200
optimizer_step_limit: 150000
batch_size: 32 # Only used if dynamic batching is disabled.
ctc_weight: 0.3
grad_accumulation_factor: 1
loss_reduction: 'batchmean'
sorting: random
num_workers: 4
precision: fp16 # bf16, fp16 or fp32

# Stage-related parameters
lr_adam: 0.0008
weight_decay: 0.01
asr_warmup_steps: !ref <optimizer_step_limit>
warmup_steps: 20000
augment_warmup: 25000

# BPE parameters
token_type: unigram # ["unigram", "bpe", "char"]
character_coverage: 1.0

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80
# This setup works well on an A40 46GB GPU; adapt it to your needs,
# or turn dynamic batching off (but training speed will decrease).
dynamic_batching: True
max_batch_length_train: 300
max_batch_length_val: 300
num_bucket: 200
shuffle: True # if True, re-creates batches at each epoch, shuffling examples.
batch_ordering: random
max_batch_ex: 256

dynamic_batch_sampler_train:
  max_batch_length: !ref <max_batch_length_train>
  num_buckets: !ref <num_bucket>
  shuffle: !ref <shuffle>
  batch_ordering: !ref <batch_ordering>
  max_batch_ex: !ref <max_batch_ex>

dynamic_batch_sampler_valid:
  max_batch_length: !ref <max_batch_length_val>
  num_buckets: !ref <num_bucket>
  shuffle: !ref <shuffle>
  batch_ordering: !ref <batch_ordering>
  max_batch_ex: !ref <max_batch_ex>

# Dataloader options
train_dataloader_opts:
  batch_size: !ref <batch_size>
  shuffle: True
  num_workers: !ref <num_workers>

valid_dataloader_opts:
  batch_size: 8

test_dataloader_opts:
  batch_size: 8
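Dynamic batching groups utterances of similar duration into buckets and fills each batch up to a total-duration budget, which keeps padding (and wasted GPU time) low. SpeechBrain's actual `DynamicBatchSampler` is more elaborate; the following is a minimal sketch of the idea behind the options above, with names of our own choosing:

```python
import random

def make_dynamic_batches(durations, max_batch_length=300.0, num_buckets=4,
                         max_batch_ex=256, seed=3407):
    """Group indices by duration bucket, then pack each bucket into
    batches whose total duration stays under max_batch_length."""
    rng = random.Random(seed)
    lo, hi = min(durations), max(durations)
    width = (hi - lo) / num_buckets or 1.0
    buckets = [[] for _ in range(num_buckets)]
    for idx, dur in enumerate(durations):
        b = min(int((dur - lo) / width), num_buckets - 1)
        buckets[b].append(idx)
    batches = []
    for bucket in buckets:
        rng.shuffle(bucket)
        batch, total = [], 0.0
        for idx in bucket:
            if batch and (total + durations[idx] > max_batch_length
                          or len(batch) >= max_batch_ex):
                batches.append(batch)
                batch, total = [], 0.0
            batch.append(idx)
            total += durations[idx]
        if batch:
            batches.append(batch)
    rng.shuffle(batches)  # batch_ordering: random
    return batches

durs = [1.2, 9.8, 2.0, 5.5, 4.1, 8.7, 1.5, 3.3]  # utterance durations in seconds
batches = make_dynamic_batches(durs, max_batch_length=10.0)
```

Because similar-length utterances end up in the same bucket, each batch wastes little compute on padding frames, which is why turning this off slows training.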
####################### Model Parameters ###########################
# Transformer
d_model: 256
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 2048
asr_output_neurons: 1024

# Outputs
blank_index: 0
label_smoothing: 0.0
pad_index: 1
bos_index: 2
eos_index: 3

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_search_interval: 10
valid_beam_size: 5 # Kept small so validation decoding stays fast ...
test_beam_size: 80
############################## models ################################
133+
134+
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
135+
input_shape: (8, 10, 80)
136+
num_blocks: 2
137+
num_layers_per_block: 1
138+
out_channels: (64, 32)
139+
kernel_sizes: (3, 3)
140+
strides: (2, 2)
141+
residuals: (False, False)
142+
143+
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
144+
input_size: 640
145+
tgt_vocab: !ref <output_neurons>
146+
d_model: !ref <d_model>
147+
nhead: !ref <nhead>
148+
num_encoder_layers: !ref <num_encoder_layers>
149+
num_decoder_layers: !ref <num_decoder_layers>
150+
d_ffn: !ref <d_ffn>
151+
dropout: !ref <transformer_dropout>
152+
conformer_activation: !ref <activation>
153+
activation: !ref <activation>
154+
encoder_module: conformer
155+
attention_type: RelPosMHAXL
156+
normalize_before: True
157+
causal: False
158+
159+
ctc_lin: !new:speechbrain.nnet.linear.Linear
160+
input_size: !ref <d_model>
161+
n_neurons: !ref <asr_output_neurons>
162+
163+
seq_lin: !new:speechbrain.nnet.linear.Linear
164+
input_size: !ref <d_model>
165+
n_neurons: !ref <output_neurons>
166+
167+
modules:
168+
CNN: !ref <CNN>
169+
Transformer: !ref <Transformer>
170+
seq_lin: !ref <seq_lin>
171+
ctc_lin: !ref <ctc_lin>
172+
173+
model: !new:torch.nn.ModuleList
174+
- [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
175+
# We define two optimizers as we have two stages (training + finetuning)
Adam: !name:torch.optim.AdamW
  lr: !ref <lr_adam>
  weight_decay: !ref <weight_decay>

valid_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
  modules: [!ref <Transformer>, !ref <seq_lin>]
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: !ref <min_decode_ratio>
  max_decode_ratio: !ref <max_decode_ratio>
  beam_size: !ref <valid_beam_size>
  using_eos_threshold: False
  length_normalization: True

test_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
  modules: [!ref <Transformer>, !ref <seq_lin>]
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: !ref <min_decode_ratio>
  max_decode_ratio: !ref <max_decode_ratio>
  beam_size: !ref <test_beam_size>
  temperature: 1.15
  using_eos_threshold: True

log_softmax: !new:torch.nn.LogSoftmax
  dim: -1

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: !ref <blank_index>
  reduction: !ref <loss_reduction>

seq_cost: !name:speechbrain.nnet.losses.kldiv_loss
  label_smoothing: !ref <label_smoothing>
  reduction: !ref <loss_reduction>

noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler
  lr_initial: !ref <lr_adam>
  n_warmup_steps: !ref <warmup_steps>
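The Noam schedule used above warms the learning rate up linearly over `warmup_steps` optimizer steps and then decays it roughly as the inverse square root of the step count. A small sketch of the usual formula (SpeechBrain's `NoamScheduler` may differ in implementation details, e.g. how `lr_initial` is scaled):

```python
def noam_lr(step, lr_initial=0.0008, n_warmup_steps=20000):
    """lr(step) = lr_initial * sqrt(n_warmup) * min(step^-0.5, step * n_warmup^-1.5).
    Linear warmup reaching lr_initial at step == n_warmup_steps,
    then ~1/sqrt(step) decay."""
    scale = n_warmup_steps ** 0.5
    return lr_initial * scale * min(step ** -0.5, step * n_warmup_steps ** -1.5)

# The peak learning rate is reached exactly at the end of warmup.
peak = noam_lr(20000)
```

The warmup protects the randomly initialized Conformer from large early updates, while the slow decay keeps training stable over the 150k-step budget.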
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>
  recoverables:
    model: !ref <model>
    noam_scheduler: !ref <noam_annealing>
    normalizer: !ref <normalize>
    counter: !ref <epoch_counter>

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <number_of_epochs>

normalize: !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence
############################## Augmentations ###################################

# Time drop
time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
  drop_length_low: 15
  drop_length_high: 25
  drop_count_low: 3
  drop_count_high: 3
  replace: "zeros"
  dim: 1

# Frequency drop
freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
  drop_length_low: 25
  drop_length_high: 35
  drop_count_low: 2
  drop_count_high: 2
  replace: "zeros"
  dim: 2

fea_augment: !new:speechbrain.augment.augmenter.Augmenter
  min_augmentations: 3
  max_augmentations: 3
  augment_prob: 1.0
  augmentations: [
    !ref <time_drop>,
    !ref <freq_drop>]

compute_features: !new:speechbrain.lobes.features.Fbank
  sample_rate: !ref <sample_rate>
  n_fft: !ref <n_fft>
  n_mels: !ref <n_mels>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>

acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats
bleu_computer: !name:speechbrain.utils.bleu.BLEUStats
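The two drop augmentations above implement SpecAugment-style masking: `time_drop` zeroes random stripes along the time axis (dim 1) and `freq_drop` along the mel axis (dim 2). A minimal dependency-free sketch of the idea on a (time, mels) matrix of nested lists (parameter names mirror the yaml; the implementation is ours, not SpeechBrain's, which operates on batched tensors):

```python
import random

def spectrogram_drop(spec, drop_length_low, drop_length_high,
                     drop_count, dim, rng=random):
    """Zero `drop_count` random stripes of width in [low, high] along `dim`
    (1 = time frames, 2 = frequency bins) of a (time, mels) matrix in place."""
    n_time, n_mels = len(spec), len(spec[0])
    size = n_time if dim == 1 else n_mels
    for _ in range(drop_count):
        length = rng.randint(drop_length_low, drop_length_high)
        start = rng.randint(0, max(size - length, 0))
        for t in range(n_time):
            for f in range(n_mels):
                pos = t if dim == 1 else f
                if start <= pos < start + length:
                    spec[t][f] = 0.0
    return spec

rng = random.Random(0)
spec = [[1.0] * 80 for _ in range(100)]  # 100 frames x 80 mel bins
spec = spectrogram_drop(spec, 15, 25, drop_count=3, dim=1, rng=rng)
spec = spectrogram_drop(spec, 25, 35, drop_count=2, dim=2, rng=rng)
```

Masking whole time and frequency stripes forces the model to rely on surrounding context instead of any single region of the spectrogram, which is why it regularizes so effectively for speech tasks.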
