speechbrain
diff --git a/‎.github/workflows/pythonapp.yml‎
Lines changed: 4 additions & 0 deletions b/‎.github/workflows/pythonapp.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎.github/workflows/verify-docs-gen.yml‎
Lines changed: 3 additions & 0 deletions b/‎.github/workflows/verify-docs-gen.yml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎recipes/LibriSpeech/ASR/CTC/README.md‎
Lines changed: 25 additions & 0 deletions b/‎recipes/LibriSpeech/ASR/CTC/README.md‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎recipes/LibriSpeech/ASR/CTC/extra_requirements.txt‎
Lines changed: 2 additions & 0 deletions b/‎recipes/LibriSpeech/ASR/CTC/extra_requirements.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_k2.yaml‎
Lines changed: 255 additions & 0 deletions b/‎recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_k2.yaml‎
Lines changed: 255 additions & 0 deletions
@@ -36,10 +36,14 @@ jobs:
             - name: Full dependencies
               run: |
                   sudo apt-get update
+                  # Pin torch/torchaudio to a version supported by the k2 wheel installed below
+                  pip install torch==2.1.2 torchaudio==2.1.2
                   pip install -r requirements.txt
                   pip install --editable .
                   pip install ctc-segmentation
+                  pip install k2==1.24.4.dev20231220+cpu.torch2.1.2 -f https://k2-fsa.github.io/k2/cpu.html
                   pip install protobuf
+                  pip install kaldilm==1.15
             - name: Consistency tests with pytest
               run: |
                   pytest tests/consistency
 
@@ -17,9 +17,12 @@ jobs:
                   python-version: '3.8'
             - name: Full dependencies
               run: |
+                  # Pin torch/torchaudio to a version supported by the k2 wheel installed below
+                  pip install torch==2.1.2 torchaudio==2.1.2
                   pip install -r requirements.txt
                   pip install --editable .
                   pip install -r docs/docs-requirements.txt
+                  pip install k2==1.24.4.dev20231220+cpu.torch2.1.2 -f https://k2-fsa.github.io/k2/cpu.html
             - name: Generate docs
               run: |
                   cd docs
 
@@ -1,6 +1,9 @@
 # LibriSpeech ASR with CTC and pre-trained wav2vec2 or whisper models.
 This folder contains the scripts to finetune a wav2vec2 or a whisper based system using LibriSpeech.
 You can download LibriSpeech at http://www.openslr.org/12.
+The loss function is the CTC loss and it is implemented in two different ways:
+- Using the [CTCLoss](https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html) from PyTorch.
+- Using the [CTC implementation](https://github.com/k2-fsa/k2/blob/master/k2/python/k2/ctc_loss.py) from K2 (WFST-based). For an example of such a recipe, check the `train_with_wav2vec_k2.py` file.
 
 **Supported pre-trained wav2vec2:** [SpeechBrain](https://github.com/speechbrain/speechbrain/tree/develop/recipes/LibriSpeech/self-supervised-learning/wav2vec2) and [HuggingFace](https://github.com/speechbrain/speechbrain/tree/develop/recipes/CommonVoice/self-supervised-learning/wav2vec2)
 
@@ -25,6 +28,24 @@ To run a fine-tuning of "WavLM" with signal downsampled inputs (for faster train
 python train_with_wav2vec.py hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml --downsampling_factor 2
 ```
 
+# WFST-based CTC loss
+To fine-tune a wav2vec 2.0 model with the WFST-based CTC loss, you can use the `train_with_wav2vec_k2.py` script. This will create a `lang` directory inside your output folder, which will contain the files required to build a lexicon FST. The tokenization method used here is a very basic character-based tokenization (e.g. `hello -> h e l l o`).
+
+To use this script, you will first need to install `k2`. The integration has been tested with `k2==1.24.4` and `torch==2.0.1`, although it should also work with any `torch` version as long as `k2` supports it (compatibility list [here](https://k2-fsa.github.io/k2/installation/pre-compiled-cuda-wheels-linux/index.html)). You can install `k2` by following the instructions [here](https://k2-fsa.github.io/k2/installation/from_wheels.html#linux-cuda-example).
+
+Using a lexicon FST (L) while training can help guide the model to better predictions. When decoding, you can either use a simple HL decoding graph (where H is the CTC topology), or use an HLG graph (where G is usually a 3-gram language model) to further improve the results. In addition, whole lattice rescoring is also supported. This typically happens with a 4-gram language model. See `hparams/train_hf_wav2vec_k2.yaml` for more details.
+
+If you choose to use a 3-gram or a 4-gram language model, you can either supply pre-existing ARPA LMs for both cases, including the option to train your own, or you can specify the name in the YAML docstring for automatic downloading. Comprehensive instructions are provided in `train_hf_wav2vec_k2.yaml`.
+
+For those interested in training their own language model, please consult our recipe at LibriSpeech/LM/train_ngram.py.
+
+Example usage:
+```
+python train_with_wav2vec_k2.py hparams/train_hf_wav2vec_k2.yaml --data_folder=/path/to/LibriSpeech
+```
+
+To use the HLG graph (instead of the default HL), pass `--compose_HL_with_G=True`. To use the 4-gram LM for rescoring, pass the `--decoding_method=whole-lattice-rescoring` argument. Note that this will require more memory, as the whole lattice will be kept in memory during the decoding. In this recipe, the `lm_scale` used by default is 0.4. This is the value that gave the best results in our HL-graph experiments after trying scales of `[0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4]`. When rescoring is used alongside the HLG graph, the 4-gram seems to not bring any improvement. The best lm scale in that case was 0.2 (the lowest value we tried).
+
 # KenLM n-gram CTC rescoring
 To enable n-gram rescoring during the decoding, you can download the LibriSpeech official LM from [here](https://www.openslr.org/11/). Please make sure to install the extra dependencies first. Any KenLM language model may be used with this rescoring technique. The n-gram can either be a binary or an arpa file, but note that the binary format is faster to load. The following command shows how to use the official LibriSpeech 4-gram LM with SpeechBrain:
 ```bash
@@ -57,6 +78,10 @@ Note: by default, `topk` is set to 20 as it gives a good trade-off between WER a
 | 08-12-23 | train_hf_whisper.yaml (small) | CTCPrefixBeamSearch  + test batch size = 1 | 960h | 4.73 | 3.19 | 12.65 |3.39 | Not Avail. | [Link](https://www.dropbox.com/sh/zmtp13huxn02fot/AADyKL5q0MwRhEG1-WbSXDWda?dl=0) |  1xRTX3090 24GB | 2xTesla V100 32GB |
 | 08-12-23 | train_hf_whisper.yaml (small) | CTCBeamSearch + 4-gram  + test batch size = 1 | 960h | 4.37 | 3.16 | 11.76 | 3.43 | Not Avail. | [Link](https://www.dropbox.com/sh/zmtp13huxn02fot/AADyKL5q0MwRhEG1-WbSXDWda?dl=0) |  1xRTX3090 24GB | 2xTesla V100 32GB |
 | 08-12-23 | train_hf_whisper.yaml (small) | CTCPrefixBeamSearch + 4-gram  + test batch size = 1 | 960h | 4.44 | 3.30 | 11.89 | 3.47 | Not Avail. | [Link](https://www.dropbox.com/sh/zmtp13huxn02fot/AADyKL5q0MwRhEG1-WbSXDWda?dl=0) |  1xRTX3090 24GB | 2xTesla V100 32GB |
+| 23-01-24 | train_hf_wav2vec_k2.yaml | k2CTC + HL graph + 1best decoding + test batch size = 1 | 960h | 1.83 | Not Avail. | 3.82 | Not Avail. | Not Avail. | [Link](https://www.dropbox.com/scl/fo/678rj1a44jt4zrxjwaetu/h?rlkey=x0xwz31nkl01qwr3k5ivtywvz&dl=0) |  1xRTX2080Ti 12GB | 1xRTX2080Ti 12GB |
+| 23-01-24 | train_hf_wav2vec_k2.yaml | k2CTC + HLG graph + 1best decoding + test batch size = 1 | 960h | 1.69 | Not Avail. | 3.44 | Not Avail. | Not Avail. | [Link](https://www.dropbox.com/scl/fo/c91vqlr8ase90x0m7u3v3/h?rlkey=duh55n0qzlfnfhy4auu0a4f8g&dl=0) |  1xRTX2080Ti 12GB | 1xRTX2080Ti 12GB |
+| 23-01-24 | train_hf_wav2vec_k2.yaml | k2CTC + HL graph + whole lattice rescoring + test batch size = 1 | 960h | 1.72 | Not Avail. | 3.51 | Not Avail. | Not Avail. | [Link](https://www.dropbox.com/scl/fo/mx6hd4zc0iyzqvixxre6q/h?rlkey=xxbpb949btmeiecw30be5qwhj&dl=0) |  1xRTX2080Ti 12GB | 1xRTX2080Ti 12GB |
+| 23-01-24 | train_hf_wav2vec_k2.yaml | k2CTC + HLG graph + whole lattice rescoring + test batch size = 1 | 960h | 1.81 | Not Avail. | 3.57 | Not Avail. | Not Avail. | [Link](https://www.dropbox.com/scl/fo/kj2ujqj3votq7ue6ydh0l/h?rlkey=mibyoria19zasvuxs0iwx6plt&dl=0) |  1xRTX2080Ti 12GB | 1xRTX2080Ti 12GB |
 | 08-12-23 | train_hf_wav2vec.yaml | CTCBeamSearch + RNNLM Rescorer  + test batch size = 1 + topk = 100  | 960h | 1.69 | 26mins15 | 3.55 | 32min44s | Not Avail. | [Link](https://www.dropbox.com/sh/k4ixa211yp5b1tm/AAD85sgYw2CH7NKk_qKMO9Tja?dl=0) |  1x A100 40GB | 2xTesla V100 40GB |
 | 08-12-23 | train_hf_wav2vec.yaml | CTCBeamSearch + TransformerLM Rescorer + test batch size = 1 + topk = 100 | 960h | 1.57 | 26mins56s | 3.37 | 32min46 | Not Avail. | [Link](https://www.dropbox.com/sh/ijqalvre7mm08ng/AAD_hsN-8dBneUMMkELsOOxga?dl=0) |  1x A100 40GB | 2xTesla V100 32GB |
 
 
@@ -1 +1,3 @@
+# k2 # It is better to install k2 with the procedure listed here: https://k2-fsa.github.io/k2/installation/from_wheels.html
+kaldilm==1.15
 kenlm
@@ -0,0 +1,255 @@
+# ################################
+# Model: wav2vec2 + DNN + CTC + LM (k2)
+# Augmentation: SpecAugment
+#
+# This recipe trains a wav2vec2 model with a DNN and a WFST-based CTC loss.
+# To use this recipe you need to have the following:
+#  - A folder with the LibriSpeech dataset (see `data_folder`)
+#  - A folder with a small, and (optionally) a big LM (see `lm_dir`)
+#    These can be downloaded in ARPA format from: http://www.openslr.org/resources/11/.
+#  - A working installation of k2 (and kaldilm if you want to use ARPA LMs).
+#
+# Authors: Zeyu Zhao 2023
+#          Georgios Karakasidis 2023
+#          Pierre Champion 2023
+# ################################
+
+# Seed needs to be set at top of yaml, before objects with parameters are made
+seed: 1111
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+output_folder: !ref results/train_wav2vec2_char_k2/<seed>
+output_wer_folder: !ref <output_folder>/
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+
+# HuggingFace Hub name of the biggest Fairseq English wav2vec2 model.
+wav2vec2_hub: facebook/wav2vec2-large-960h-lv60-self
+wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint
+
+# Data files
+data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech
+# noise/RIRs dataset will automatically be downloaded
+# data_folder_rirs: !ref <data_folder>
+train_splits: ["train-clean-100", "train-clean-360", "train-other-500"]
+dev_splits: ["dev-clean", "dev-other"]
+test_splits: ["test-clean", "test-other"]
+skip_prep: False
+ckpt_interval_minutes: 25 # save checkpoint every N min
+train_csv: !ref <output_folder>/train.csv
+valid_csv: !ref <output_folder>/dev-clean.csv
+test_csv:
+   - !ref <output_folder>/test-clean.csv
+   - !ref <output_folder>/test-other.csv
+   - !ref <output_folder>/dev-clean.csv
+   - !ref <output_folder>/dev-other.csv
+
+# For k2 CTC training
+lang_dir: !ref <output_folder>/lang
+vocab_file: !ref <data_folder>/librispeech-vocab.txt
+sil_prob: 0.
+add_word_boundary: True
+# For k2 decoding
+test_search_beam: 32
+# Beam size (for decoding)
+test_output_beam: 8
+test_min_active_state: 300
+test_max_active_state: 3000
+# Acoustic scale (multiplied by the log probs)
+ac_scale: 1.5
+compose_HL_with_G: False
+# 1best or whole-lattice-rescoring
+# decoding_method: whole-lattice-rescoring
+decoding_method: 1best
+# LM scale to be used for rescoring. Only used if rescoring
+rescoring_lm_scale: 0.4
+# This is where the 3gram and (optionally) 4gram LM are stored
+# They can be in either ARPA or FST format. If the former, then
+# the FST equivalent will be created in the same directory by
+# using kaldilm.
+lm_dir: !ref <output_folder>/lm
+# The ARPA LM files are located under the lm_dir.
+# - Use (recommended):
+#     - 3-gram_sb.arpa
+#     - 4-gram_sb.arpa
+#     to download the SpeechBrain pretrained models (trained on train-960+librispeech-lm-norm.txt, 214k words)
+# - Use:
+#    - 3-gram.arpa
+#    - 3-gram.pruned.1e-7.arpa
+#    - 3-gram.pruned.3e-7.arpa
+#    - 4-gram.arpa
+#    to download the http://www.openslr.org/resources/11/ pretrained models (trained on librispeech-lm-norm.txt, 200k words)
+# - Use another name for a model you trained yourself.
+#    If the ARPA file does not exist in the lm_dir, you'll need to train it yourself.
+#    Please see LibriSpeech/LM/README.md for instructions.
+# Using one of the above names will automatically download the corresponding model.
+# You can specify a different name, but you'll need to make sure the file exists in the lm_dir.
+# Make sure to use enough RAM and CPUs as the conversion to FST can be quite demanding.
+G_arpa: 3-gram_sb.arpa
+G_rescoring_arpa: 4-gram_sb.arpa
+# caching: False
+
+# Training parameters
+number_of_epochs: 1
+lr: 0.9
+lr_wav2vec: 0.0001
+sorting: ascending  # only ascending and descending are supported currently
+precision: fp32
+sample_rate: 16000
+
+# With data_parallel batch_size is split into N jobs
+# With DDP batch_size is multiplied by N jobs
+# Must be 3 per GPU to fit 32GB of VRAM
+batch_size: 6
+test_batch_size: 1
+num_workers: 10
+
+# Dataloader options
+train_dataloader_opts:
+   batch_size: !ref <batch_size>
+   num_workers: !ref <num_workers>
+
+valid_dataloader_opts:
+   batch_size: !ref <batch_size>
+   num_workers: !ref <num_workers>
+
+test_dataloader_opts:
+   batch_size: !ref <test_batch_size>
+   num_workers: !ref <num_workers>
+
+# Model parameters
+activation: !name:torch.nn.LeakyReLU
+dnn_layers: 2
+dnn_neurons: 1024
+freeze_wav2vec: True
+
+# Outputs
+output_neurons: 30  # Number of output tokens (character-based), index(blank/eos/bos) = 0
+
+#
+# Functions and classes
+#
+epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
+   limit: !ref <number_of_epochs>
+
+speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
+   orig_freq: !ref <sample_rate>
+   speeds: [95, 100, 105]
+
+# Frequency drop: randomly drops a number of frequency bands to zero.
+drop_freq_low: 0  # Min frequency band dropout probability
+drop_freq_high: 1  # Max frequency band dropout probability
+drop_freq_count_low: 1  # Min number of frequency bands to drop
+drop_freq_count_high: 3  # Max number of frequency bands to drop
+drop_freq_width: 0.05  # Width of frequency bands to drop
+
+drop_freq: !new:speechbrain.augment.time_domain.DropFreq
+   drop_freq_low: !ref <drop_freq_low>
+   drop_freq_high: !ref <drop_freq_high>
+   drop_freq_count_low: !ref <drop_freq_count_low>
+   drop_freq_count_high: !ref <drop_freq_count_high>
+   drop_freq_width: !ref <drop_freq_width>
+
+# Time drop: randomly drops a number of temporal chunks.
+drop_chunk_count_low: 1  # Min number of audio chunks to drop
+drop_chunk_count_high: 5  # Max number of audio chunks to drop
+drop_chunk_length_low: 1000  # Min length of audio chunks to drop
+drop_chunk_length_high: 2000  # Max length of audio chunks to drop
+
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
+   drop_length_low: !ref <drop_chunk_length_low>
+   drop_length_high: !ref <drop_chunk_length_high>
+   drop_count_low: !ref <drop_chunk_count_low>
+   drop_count_high: !ref <drop_chunk_count_high>
+
+# Augmenter: combines the augmentations defined above (speed perturbation,
+# frequency drop, chunk drop) to perform data augmentation on the waveforms.
+# Shuffling is disabled, and augment_prob is 1.0.
+# Exactly three augmentations are listed, so min/max_augmentations are set
+# to 3: asking for more augmentations than are listed cannot be satisfied.
+wav_augment: !new:speechbrain.augment.augmenter.Augmenter
+   parallel_augment: False
+   repeat_augment: 1
+   shuffle_augmentations: False
+   min_augmentations: 3
+   max_augmentations: 3
+   augment_prob: 1.0
+   augmentations: [
+      !ref <speed_perturb>,
+      !ref <drop_freq>,
+      !ref <drop_chunk>]
+
+enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
+   input_shape: [null, null, 1024]
+   activation: !ref <activation>
+   dnn_blocks: !ref <dnn_layers>
+   dnn_neurons: !ref <dnn_neurons>
+
+wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.Wav2Vec2
+   source: !ref <wav2vec2_hub>
+   output_norm: True
+   freeze: !ref <freeze_wav2vec>
+   save_path: !ref <wav2vec2_folder>
+
+#####
+# Uncomment this block if you prefer to use a Fairseq pretrained model instead
+# of a HuggingFace one. Here, we provide a URL that is obtained from the
+# Fairseq GitHub for the multilingual XLSR.
+#
+#wav2vec2_url: https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt
+#wav2vec2: !new:speechbrain.lobes.models.fairseq_wav2vec.FairseqWav2Vec2
+#    pretrained_path: !ref <wav2vec2_url>
+#    output_norm: True
+#    freeze: False
+#    save_path: !ref <save_folder>/wav2vec2_checkpoint/model.pt
+
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+   input_size: !ref <dnn_neurons>
+   n_neurons: !ref <output_neurons>
+
+log_softmax: !new:speechbrain.nnet.activations.Softmax
+   apply_log: True
+
+ctc_cost: !name:speechbrain.k2_integration.losses.ctc_k2
+   reduction: mean
+   beam_size: 10
+
+modules:
+   wav2vec2: !ref <wav2vec2>
+   enc: !ref <enc>
+   ctc_lin: !ref <ctc_lin>
+
+model: !new:torch.nn.ModuleList
+   - [!ref <enc>, !ref <ctc_lin>]
+
+model_opt_class: !name:torch.optim.Adadelta
+   lr: !ref <lr>
+   rho: 0.95
+   eps: 1.e-8
+
+wav2vec_opt_class: !name:torch.optim.Adam
+   lr: !ref <lr_wav2vec>
+
+lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
+   initial_value: !ref <lr>
+   improvement_threshold: 0.0025
+   annealing_factor: 0.8
+   patient: 0
+
+lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
+   initial_value: !ref <lr_wav2vec>
+   improvement_threshold: 0.0025
+   annealing_factor: 0.9
+   patient: 0
+
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+   checkpoints_dir: !ref <save_folder>
+   recoverables:
+      wav2vec2: !ref <wav2vec2>
+      model: !ref <model>
+      scheduler_model: !ref <lr_annealing_model>
+      scheduler_wav2vec: !ref <lr_annealing_wav2vec>
+      counter: !ref <epoch_counter>
+
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+   save_file: !ref <train_log>
+
+error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
+
+cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
+   split_tokens: True
Original file line number	Diff line number	Diff line change
`@@ -1 +1,3 @@`
	`1`	`+# k2 # It is better to install k2 with the procedure listed here: https://k2-fsa.github.io/k2/installation/from_wheels.html`
	`2`	`+kaldilm==1.15`
`1`	`3`	`kenlm`