Skip to content

Commit 89cd54f

Browse files
Recipe test fixes (#2874)
Co-authored-by: Adel Moumen <adelmoumen.pro@gmail.com>
1 parent 9d27982 commit 89cd54f

File tree

12 files changed

+18
-18
lines changed

12 files changed

+18
-18
lines changed

.github/workflows/pythonapp.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ jobs:
3030
run: |
3131
pip install uv
3232
uv pip install --system ctc-segmentation sacrebleu # ctc-segmentation is funky with uv due to their oldest-supported-numpy dependency
33-
uv pip install --system -r requirements.txt torch==2.6.0+cpu torchaudio==2.6.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu k2==1.24.4.dev20250307+cpu.torch2.6.0 --find-links https://k2-fsa.github.io/k2/cpu.html gensim==4.3.2 bitsandbytes==0.45.3 scikit-learn==1.6.1
33+
uv pip install --system -r requirements.txt torch==2.6.0+cpu torchaudio==2.6.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu k2==1.24.4.dev20250307+cpu.torch2.6.0 --find-links https://k2-fsa.github.io/k2/cpu.html gensim==4.3.2 bitsandbytes==0.45.3 scikit-learn==1.6.1 accelerate
3434
uv pip install --system --editable . --no-deps # already installed pinned deps from requirements.txt, we're good
3535
- name: Install sox
3636
run: |

recipes/LJSpeech/quantization/train.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def dataio_prepare(hparams):
142142
"""
143143
train_data = sb.dataio.dataset.DynamicItemDataset.from_json(
144144
json_path=hparams["train_json"],
145-
replacements={"DATA_ROOT": hparams["data_folder"]},
145+
replacements={"data_root": hparams["data_folder"]},
146146
)
147147
# Sort training data to speed up training
148148
train_data = train_data.filtered_sorted(
@@ -153,7 +153,7 @@ def dataio_prepare(hparams):
153153

154154
valid_data = sb.dataio.dataset.DynamicItemDataset.from_json(
155155
json_path=hparams["valid_json"],
156-
replacements={"DATA_ROOT": hparams["data_folder"]},
156+
replacements={"data_root": hparams["data_folder"]},
157157
)
158158
# Sort validation data to speed up validation
159159
valid_data = valid_data.filtered_sorted(
@@ -164,7 +164,7 @@ def dataio_prepare(hparams):
164164

165165
test_data = sb.dataio.dataset.DynamicItemDataset.from_json(
166166
json_path=hparams["test_json"],
167-
replacements={"DATA_ROOT": hparams["data_folder"]},
167+
replacements={"data_root": hparams["data_folder"]},
168168
)
169169
# Sort the test data to speed up testing
170170
test_data = test_data.filtered_sorted(

recipes/Voicebank/dereverb/MetricGAN-U/train.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ def write_wavs(self, batch_id, wavs, score, lens):
401401
for i, (name, pred_wav, length) in enumerate(zip(batch_id, wavs, lens)):
402402
path = os.path.join(self.hparams.MetricGAN_folder, name + ".wav")
403403
data = torch.unsqueeze(pred_wav[: int(length)].cpu(), 0)
404-
torchaudio.save(path, data, self.hparams.Sample_rate)
404+
torchaudio.save(path, data.detach(), self.hparams.Sample_rate)
405405

406406
# Make record of path and score for historical training
407407
score = float(score[i][0])

recipes/Voicebank/enhance/MetricGAN-U/train.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ def write_wavs(self, batch_id, wavs, score, lens):
394394
for i, (name, pred_wav, length) in enumerate(zip(batch_id, wavs, lens)):
395395
path = os.path.join(self.hparams.MetricGAN_folder, name + ".wav")
396396
data = torch.unsqueeze(pred_wav[: int(length)].cpu(), 0)
397-
torchaudio.save(path, data, self.hparams.Sample_rate)
397+
torchaudio.save(path, data.detach(), self.hparams.Sample_rate)
398398

399399
# Make record of path and score for historical training
400400
score = float(score[i][0])

recipes/Voicebank/enhance/MetricGAN/train.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def write_wavs(self, batch_id, wavs, clean_paths, scores, lens):
280280
):
281281
path = os.path.join(self.hparams.MetricGAN_folder, name + ".wav")
282282
data = torch.unsqueeze(pred_wav[: int(length)].cpu(), 0)
283-
torchaudio.save(path, data, self.hparams.Sample_rate)
283+
torchaudio.save(path, data.detach(), self.hparams.Sample_rate)
284284

285285
# Make record of path and score for historical training
286286
score = float(scores[i][0])

recipes/timers-and-such/decoupled/hparams/train_TAS_LM.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,9 @@ asr_model: !apply:speechbrain.inference.ASR.EncoderDecoderASR.from_hparams
7979
embedding_dim: 128
8080
dropout: 0.
8181
rnn_layers: 2
82-
rnn_neurons: 2048
82+
rnn_neurons: 1024
8383
dnn_blocks: 1
84-
dnn_neurons: 512
84+
dnn_neurons: 1024
8585
return_hidden: True
8686
pretrainer:
8787
paths:

recipes/timers-and-such/multistage/hparams/train_TAS_LM.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,9 +137,9 @@ asr_model: !apply:speechbrain.inference.ASR.EncoderDecoderASR.from_hparams
137137
embedding_dim: 128
138138
dropout: 0.
139139
rnn_layers: 2
140-
rnn_neurons: 2048
140+
rnn_neurons: 1024
141141
dnn_blocks: 1
142-
dnn_neurons: 512
142+
dnn_neurons: 1024
143143
return_hidden: True
144144
pretrainer:
145145
paths:

tests/recipes/GigaSpeech.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
Task,Dataset,Script_file,Hparam_file,Data_prep_file,Readme_file,Result_url,HF_repo,test_debug_flags,test_debug_checks,performance
22
ASR-CTC,GigaSpeech,recipes/GigaSpeech/ASR/CTC/train_with_wavlm.py,recipes/GigaSpeech/ASR/CTC/hparams/train_hf_wavlm.yaml,recipes/GigaSpeech/ASR/CTC/gigaspeech_prepare.py,recipes/GigaSpeech/ASR/CTC/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --number_of_epochs=1 --skip_prep=True --wav2vec2_folder=tests/tmp/wav2vec2_checkpoint,
3-
ASR-Transducers,GigaSpeech,recipes/GigaSpeech/ASR/transducer/train.py,recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml,recipes/GigaSpeech/ASR/transducer/gigaspeech_prepare.py,recipes/GigaSpeech/ASR/transducer/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --number_of_epochs=1 --skip_prep=True,
3+
ASR-Transducers,GigaSpeech,recipes/GigaSpeech/ASR/transducer/train.py,recipes/GigaSpeech/ASR/transducer/hparams/conformer_transducer.yaml,recipes/GigaSpeech/ASR/transducer/gigaspeech_prepare.py,recipes/GigaSpeech/ASR/transducer/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --number_of_epochs=1 --output_neurons=29 --skip_prep=True --use_torchaudio=True,

tests/recipes/LJSpeech.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ TTS,LJSpeech,recipes/LJSpeech/TTS/tacotron2/train.py,recipes/LJSpeech/TTS/tacotr
55
TTS,LJSpeech,recipes/LJSpeech/TTS/vocoder/hifigan/train.py,recipes/LJSpeech/TTS/vocoder/hifigan/hparams/train.yaml,recipes/LJSpeech/ljspeech_prepare.py,recipes/LJSpeech/TTS/README.md,https://www.dropbox.com/sh/m2xrdssiroipn8g/AAD-TqPYLrSg6eNxUkcImeg4a?dl=0,https://huggingface.co/speechbrain/tts-hifigan-ljspeech,--epochs=2 --data_folder=tests/samples/ASR --train_json=tests/samples/annotation/ASR_train.json --valid_json=tests/samples/annotation/ASR_dev.json --test_json=tests/samples/annotation/ASR_dev.json --skip_prep=True --sample_rate=16000,"file_exists=[train_log.txt,log.txt,env.log,train.py,hyperparams.yaml,samples/1/synthesized.wav,samples/1/target.wav,samples/2/synthesized.wav,samples/2/target.wav]"
66
TTS,LJSpeech,recipes/LJSpeech/TTS/vocoder/diffwave/train.py,recipes/LJSpeech/TTS/vocoder/diffwave/hparams/train.yaml,recipes/LJSpeech/ljspeech_prepare.py,recipes/LJSpeech/TTS/README.md,,,--number_of_epochs=2 --data_folder=tests/samples/ASR --train_json=tests/samples/annotation/ASR_train.json --valid_json=tests/samples/annotation/ASR_dev.json --test_json=tests/samples/annotation/ASR_dev.json --skip_prep=True --sample_rate=16000 --num_workers 0,"file_exists=[train_log.txt,log.txt,env.log,train.py,hyperparams.yaml]"
77
TTS,LJSpeech,recipes/LJSpeech/TTS/vocoder/hifigan_discrete/train.py,recipes/LJSpeech/TTS/vocoder/hifigan_discrete/hparams/train.yaml,recipes/LJSpeech/ljspeech_prepare.py,recipes/LJSpeech/TTS/README.md,,,--batch_size=2 --epochs=2 --data_folder=tests/samples/TTS --train_json=tests/samples/annotation/TTS_train.json --valid_json=tests/samples/annotation/TTS_train.json --test_json=tests/samples/annotation/TTS_train.json --skip_prep=True --sample_rate=16000 --codes_save_folder=tests/samples/TTS/codes --skip_extract=True,"file_exists=[train_log.txt,log.txt,env.log,train.py,hyperparams.yaml,samples/1/synthesized.wav,samples/1/target.wav,samples/2/synthesized.wav,samples/2/target.wav]"
8-
quantization,LJSpeech,recipes/LJSpeech/quantization/train.py,recipes/LJSpeech/quantization/hparams/train_discrete_ssl.yaml,recipes/LJSpeech/quantization/ljspeech_prepare.py,recipes/LJSpeech/quantization/README.md,,https://huggingface.co/speechbrain/SSL_Quantization,--data_folder=tests/samples/TTS --train_json=tests/samples/annotation/TTS_train.json --valid_json=tests/samples/annotation/TTS_train.json --test_json=tests/samples/annotation/TTS_train.json --skip_prep=True,"file_exists=[train_log.txt,train.py,log.txt,env.log,hyperparams.yaml]"
8+
quantization,LJSpeech,recipes/LJSpeech/quantization/train.py,recipes/LJSpeech/quantization/hparams/train_discrete_ssl.yaml,recipes/LJSpeech/quantization/ljspeech_prepare.py,recipes/LJSpeech/quantization/README.md,,https://huggingface.co/speechbrain/SSL_Quantization,--data_folder=tests/samples/TTS --train_json=tests/samples/annotation/TTS_train.json --valid_json=tests/samples/annotation/TTS_train.json --test_json=tests/samples/annotation/TTS_train.json --skip_prep=True --n_clusters 10 --kmeans_batch_size 10,"file_exists=[train_log.txt,train.py,log.txt,env.log,hyperparams.yaml]"

tests/recipes/LibriSpeech.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,4 @@ ASR-CTC,LibriSpeech,recipes/LibriSpeech/ASR/CTC/train.py,recipes/LibriSpeech/ASR
4545
ASR-CTC,LibriSpeech,recipes/LibriSpeech/ASR/CTC/train.py,recipes/LibriSpeech/ASR/CTC/hparams/branchformer_large.yaml,recipes/LibriSpeech/ASR/CTC/librispeech_prepare.py,recipes/LibriSpeech/ASR/CTC/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=[tests/samples/annotation/ASR_train.csv] --number_of_epochs=2 --skip_prep=True --output_neurons=21,"file_exists=[env.log,hyperparams.yaml,log.txt,train_log.txt,train.py,wer_ASR_train.txt,save/21_char.model,save/21_char.vocab]",
4646
ASR-CTC,LibriSpeech,recipes/LibriSpeech/ASR/CTC/train_with_wav2vec_k2.py,recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_k2.yaml,recipes/LibriSpeech/ASR/CTC/librispeech_prepare.py,recipes/LibriSpeech/ASR/CTC/README.md,,,--data_folder=tests/samples/ASR/ --skip_prep=True --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=[tests/samples/annotation/ASR_train.csv] --number_of_epochs=2,"file_exists=[metric_ASR_train/wer_HL_1best.txt,train_log.txt,log.txt,train_with_wav2vec_k2.py,env.log,hyperparams.yaml]",
4747
LM,LibriSpeech,recipes/LibriSpeech/LM/train_ngram.py,recipes/LibriSpeech/LM/hparams/train_ngram.yaml,recipes/LibriSpeech/LM/librispeech_prepare.py,recipes/LibriSpeech/LM/README.md,,,--data_folder=tests/samples/ASR/ --skip_prep=True --train_csv=tests/samples/annotation/ASR_train.csv,"file_exists=[env.log,hyperparams.yaml,log.txt,lang/words.txt,libri_lm_corpus.txt,train_ngram.py]",
48-
quantization,LibriSpeech,recipes/LibriSpeech/quantization/train.py,recipes/LibriSpeech/quantization/hparams/train_discrete_ssl.yaml,recipes/LibriSpeech/quantization/librispeech_prepare.py,recipes/LibriSpeech/quantization/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --skip_prep=True,"file_exists=[train_log.txt,train.py,log.txt,env.log,hyperparams.yaml]",
48+
quantization,LibriSpeech,recipes/LibriSpeech/quantization/train.py,recipes/LibriSpeech/quantization/hparams/train_discrete_ssl.yaml,recipes/LibriSpeech/quantization/librispeech_prepare.py,recipes/LibriSpeech/quantization/README.md,,,--data_folder=tests/samples/ASR/ --train_csv=tests/samples/annotation/ASR_train.csv --valid_csv=tests/samples/annotation/ASR_train.csv --test_csv=tests/samples/annotation/ASR_train.csv --skip_prep=True --n_clusters 10 --kmeans_batch_size 10,"file_exists=[train_log.txt,train.py,log.txt,env.log,hyperparams.yaml]",

0 commit comments

Comments (0)