 # Authors: Titouan Parcollet and Jianyuan Zhong
 # ############################################################################
 # Seed needs to be set at top of yaml, before objects with parameters are made
-seed: 1234
+seed: 3407
 __set_seed: !apply:torch.manual_seed [!ref <seed>]
-output_folder: !ref results/transformer_de/<seed>
-test_wer_file: !ref <output_folder>/wer_test.txt
-valid_wer_file: !ref <output_folder>/wer_valid.txt
+output_folder: !ref results/conformer_en/<seed>
+output_wer_folder: !ref <output_folder>/
 save_folder: !ref <output_folder>/save
 train_log: !ref <output_folder>/train_log.txt

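Note on how these keys are consumed: SpeechBrain hyperparameter files are parsed with HyperPyYAML, which resolves !ref references and executes !apply:/!new: tags at load time, so the seed is set before any module is instantiated. A minimal sketch of loading this file (the file name and the overrides dict are illustrative assumptions, not part of the commit):

    # Sketch: load a SpeechBrain hyperparameter file with HyperPyYAML.
    from hyperpyyaml import load_hyperpyyaml

    with open("hparams/conformer.yaml") as fin:  # hypothetical path
        # Overrides are applied before tags resolve, so changing `seed`
        # also changes every path derived from it via !ref.
        hparams = load_hyperpyyaml(fin, overrides={"seed": 3407})

    print(hparams["output_folder"])  # -> results/conformer_en/3407
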
@@ -20,12 +19,13 @@ data_folder: !PLACEHOLDER # e.g, /localscratch/cv-corpus-5.1-2020-06-22/fr
 train_tsv_file: !ref <data_folder>/train.tsv # Standard CommonVoice .tsv files
 dev_tsv_file: !ref <data_folder>/dev.tsv # Standard CommonVoice .tsv files
 test_tsv_file: !ref <data_folder>/test.tsv # Standard CommonVoice .tsv files
-accented_letters: True
-language: de # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for English
-train_csv: !ref <save_folder>/train.csv
-valid_csv: !ref <save_folder>/dev.csv
-test_csv: !ref <save_folder>/test.csv
+accented_letters: False
+language: en # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for English
+train_csv: !ref <output_folder>/train.csv
+valid_csv: !ref <output_folder>/dev.csv
+test_csv: !ref <output_folder>/test.csv
 skip_prep: False # Skip data preparation
+convert_to_wav: False # Switch this to True to convert all mp3 files to wav.

 # We remove utterances longer than 10s in the train/dev/test sets as
 # longer sentences certainly correspond to "open microphones".
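The 10 s cap described in this comment is enforced when the datasets are built. A hedged sketch of the usual SpeechBrain pattern, filtering a DynamicItemDataset on its duration key (train_data is an assumed variable, not shown in this diff):

    # Sketch: drop training utterances longer than 10 seconds.
    train_data = train_data.filtered_sorted(
        key_max_value={"duration": 10.0},  # seconds, per the comment above
    )
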
@@ -40,12 +40,14 @@ ctc_weight: 0.3
 grad_accumulation_factor: 4
 loss_reduction: 'batchmean'
 sorting: random
+num_workers: 4
 precision: fp32 # bf16, fp16 or fp32

 # stages related parameters
-stage_one_epochs: 40
-lr_adam: 1.0
-lr_sgd: 0.00003
+lr_adam: 0.0008
+weight_decay: 0.01
+warmup_steps: 25000
+augment_warmup: 8000

 # BPE parameters
 token_type: unigram # ["unigram", "bpe", "char"]
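token_type: unigram pairs with output_neurons: 5120 further down in this file: the model's output layer must have exactly one unit per tokenizer symbol. A sketch with the sentencepiece library directly (the training-text path and model prefix are illustrative assumptions):

    # Sketch: train a unigram tokenizer sized to match output_neurons.
    import sentencepiece as spm

    spm.SentencePieceTrainer.train(
        input="save/train_text.txt",  # hypothetical transcript dump
        model_prefix="5120_unigram",
        vocab_size=5120,              # must equal output_neurons below
        model_type="unigram",
    )
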
@@ -56,30 +58,53 @@ sample_rate: 16000
 n_fft: 400
 n_mels: 80

+# This setup works well for an A100 80GB GPU; adapt it to your needs,
+# or turn it off (but training speed will decrease).
+dynamic_batching: True
+max_batch_length_train: 500
+max_batch_length_val: 100 # we reduce it as the beam is much wider (VRAM)
+num_bucket: 200
+shuffle: True # if True, re-creates batches at each epoch by shuffling examples.
+batch_ordering: random
+max_batch_ex: 256
+
+dynamic_batch_sampler_train:
+    max_batch_length: !ref <max_batch_length_train>
+    num_buckets: !ref <num_bucket>
+    shuffle: !ref <shuffle>
+    batch_ordering: !ref <batch_ordering>
+    max_batch_ex: !ref <max_batch_ex>
+
+dynamic_batch_sampler_valid:
+    max_batch_length: !ref <max_batch_length_val>
+    num_buckets: !ref <num_bucket>
+    shuffle: !ref <shuffle>
+    batch_ordering: !ref <batch_ordering>
+    max_batch_ex: !ref <max_batch_ex>
+
 # Dataloader options
 train_dataloader_opts:
     batch_size: !ref <batch_size>
     shuffle: True
-    num_workers: 6
+    num_workers: !ref <num_workers>

 valid_dataloader_opts:
-    batch_size: !ref <batch_size>
-    num_workers: 6
+    batch_size: 1

 test_dataloader_opts:
-    batch_size: !ref <batch_size>
-    num_workers: 6
+    batch_size: 1
+

 # ###################### Model Parameters ###########################
 # Transformer
-d_model: 768
+d_model: 512
 nhead: 8
 num_encoder_layers: 12
 num_decoder_layers: 6
-d_ffn: 3072
-transformer_dropout: 0.0
+d_ffn: 2048
+transformer_dropout: 0.1
 activation: !name:torch.nn.GELU
-output_neurons: 500
+output_neurons: 5120

 # Outputs
 blank_index: 0
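The dynamic_batch_sampler_train/_valid mappings above are keyword arguments for SpeechBrain's DynamicBatchSampler, which buckets utterances by duration and packs each batch up to max_batch_length seconds instead of using a fixed batch size. A sketch of how a recipe would wire them up (train_data and hparams are assumed variables):

    # Sketch: duration-bucketed DataLoader from the options above.
    from torch.utils.data import DataLoader
    from speechbrain.dataio.batch import PaddedBatch
    from speechbrain.dataio.sampler import DynamicBatchSampler

    sampler = DynamicBatchSampler(
        train_data,                           # assumed DynamicItemDataset
        length_func=lambda x: x["duration"],  # bucketing key, in seconds
        **hparams["dynamic_batch_sampler_train"],
    )
    loader = DataLoader(
        train_data,
        batch_sampler=sampler,
        collate_fn=PaddedBatch,  # pads each batch to its longest example
        num_workers=hparams["num_workers"],
    )
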
@@ -91,8 +116,8 @@ eos_index: 2
 # Decoding parameters
 min_decode_ratio: 0.0
 max_decode_ratio: 1.0
-valid_search_interval: 5
-valid_beam_size: 10
+valid_search_interval: 10
+valid_beam_size: 1 # Greedy search here, so validation decoding is faster.
 test_beam_size: 80
 ctc_weight_decode: 0.3
 scorer_beam_scale: 0.3
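ctc_weight_decode: 0.3 sets the mixing weight for joint CTC/attention scoring during beam search: each hypothesis is ranked by a weighted sum of the two log-probabilities. Schematically (a sketch of the scoring rule, not the scorer's actual code path):

    # Sketch: hybrid CTC/attention hypothesis score, weight = 0.3.
    import torch

    ctc_weight_decode = 0.3
    logp_attention = torch.tensor(-2.1)  # illustrative value
    logp_ctc = torch.tensor(-3.4)        # illustrative value
    score = (1 - ctc_weight_decode) * logp_attention + ctc_weight_decode * logp_ctc
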
@@ -101,24 +126,28 @@ scorer_beam_scale: 0.3

 CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
     input_shape: (8, 10, 80)
-    num_blocks: 3
+    num_blocks: 2
     num_layers_per_block: 1
-    out_channels: (128, 200, 256)
-    kernel_sizes: (3, 3, 1)
-    strides: (2, 2, 1)
-    residuals: (False, False, False)
+    out_channels: (64, 32)
+    kernel_sizes: (3, 3)
+    strides: (2, 2)
+    residuals: (False, False)

 Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
-    input_size: 5120
+    input_size: 640
     tgt_vocab: !ref <output_neurons>
     d_model: !ref <d_model>
     nhead: !ref <nhead>
     num_encoder_layers: !ref <num_encoder_layers>
     num_decoder_layers: !ref <num_decoder_layers>
     d_ffn: !ref <d_ffn>
     dropout: !ref <transformer_dropout>
+    conformer_activation: !ref <activation>
     activation: !ref <activation>
-    normalize_before: False
+    encoder_module: conformer
+    attention_type: RelPosMHAXL
+    normalize_before: True
+    causal: False

 ctc_lin: !new:speechbrain.nnet.linear.Linear
     input_size: !ref <d_model>
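The new input_size: 640 follows from the frontend change above: two stride-2 blocks downsample the 80 mel bins to roughly 80 / 4 = 20 (up to padding effects), and the last block outputs 32 channels, so the flattened frame fed to the Conformer has 20 x 32 = 640 features. The old three-block frontend gave 20 x 256 = 5120, matching the removed value. A quick check:

    # Sketch: the Transformer input_size implied by the CNN frontend.
    n_mels = 80
    freq_strides = (2, 2)     # one stride-2 downsample per block
    last_channels = 32

    freq_bins = n_mels
    for stride in freq_strides:
        freq_bins //= stride  # 80 -> 40 -> 20

    assert freq_bins * last_channels == 640  # value in the yaml above
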
@@ -138,15 +167,9 @@ model: !new:torch.nn.ModuleList
     - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]

 # We define two optimizers as we have two stages (training + finetuning)
-Adam: !name:torch.optim.Adam
+Adam: !name:torch.optim.AdamW
     lr: !ref <lr_adam>
-    betas: (0.9, 0.98)
-    eps: 0.000000001
-
-SGD: !name:torch.optim.SGD
-    lr: !ref <lr_sgd>
-    momentum: 0.99
-    nesterov: True
+    weight_decay: !ref <weight_decay>

 # Scorer
 ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
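With the SGD finetuning stage dropped, only one optimizer remains, and replacing Adam with AdamW changes what weight_decay means: AdamW applies it as decoupled decay on the parameters rather than as L2 regularization inside the gradient, the standard choice for Transformer/Conformer training. The plain-PyTorch equivalent of what !name:torch.optim.AdamW builds (model is an assumed nn.Module):

    # Sketch: optimizer equivalent to the yaml's Adam entry.
    import torch

    optimizer = torch.optim.AdamW(
        model.parameters(),  # `model` assumed to exist
        lr=0.0008,           # lr_adam
        weight_decay=0.01,   # decoupled decay, per AdamW
    )
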
@@ -195,8 +218,7 @@ seq_cost: !name:speechbrain.nnet.losses.kldiv_loss

 noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler
     lr_initial: !ref <lr_adam>
-    n_warmup_steps: 25000
-    model_size: !ref <d_model>
+    n_warmup_steps: !ref <warmup_steps>

 checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
     checkpoints_dir: !ref <save_folder>
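With model_size removed, the schedule depends only on lr_initial and n_warmup_steps. As I read SpeechBrain's NoamScheduler, the classic Noam curve is normalized so the learning rate rises linearly to lr_initial at step n_warmup_steps and then decays as an inverse square root (worth double-checking against the scheduler source):

    # Sketch: Noam curve normalized to peak at lr_initial.
    lr_initial = 0.0008
    warmup = 25000

    def noam_lr(step: int) -> float:
        # Linear warmup, then ~1/sqrt(step) decay; peak at step == warmup.
        return lr_initial * warmup**0.5 * min(step**-0.5, step * warmup**-1.5)

    assert abs(noam_lr(warmup) - lr_initial) < 1e-12
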
@@ -211,23 +233,26 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter

 normalize: !new:speechbrain.processing.features.InputNormalization
     norm_type: global
-    update_until_epoch: 3
+    update_until_epoch: 4

 # ############################# Augmentations ###################################

 # Time Drop
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
     drop_length_low: 15
     drop_length_high: 25
-    drop_count_low: 5
-    drop_count_high: 5
+    drop_count_low: 3
+    drop_count_high: 3
+    replace: "zeros"
+    dim: 1

 # Frequency Drop
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
     drop_length_low: 25
     drop_length_high: 35
     drop_count_low: 2
     drop_count_high: 2
+    replace: "zeros"
     dim: 2

 # Time warp
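For reference, dim: 1 drops bands along time and dim: 2 along frequency of a [batch, time, mels] spectrogram, and replace: "zeros" fills each dropped band with zeros. A pure-torch sketch of the effect (SpeechBrain's SpectrogramDrop is vectorized and supports more replace modes):

    # Sketch: zero random bands of a [batch, time, mels] spectrogram.
    import torch

    def drop_bands(spec, low, high, count, dim):
        for _ in range(count):
            length = int(torch.randint(low, high + 1, (1,)))
            start = int(torch.randint(0, spec.size(dim) - length, (1,)))
            index = [slice(None)] * spec.ndim
            index[dim] = slice(start, start + length)
            spec[tuple(index)] = 0.0
        return spec

    spec = torch.randn(8, 1000, 80)
    spec = drop_bands(spec, 15, 25, count=3, dim=1)  # time drop above
    spec = drop_bands(spec, 25, 35, count=2, dim=2)  # frequency drop above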