Commit 2f9e2fd

Authored by fpaissan (Francesco Paissan), ycemsubakan, asumagic, and poonehmousavi
Adding recipe for Listenable Maps for Audio Classifiers (#2538)
* works on cnn14 -- but have a bad checkpoint * fixed l2i as well * fixed acc in l2i * fix not listenable * updated logging for eval * a bit less verbose * printing at sample level * fix logging - was missing avg * was messing up in the forward * now running train_piq.py * minor corrections * fix l2i training with wham! * fixed l2i computation * linters * add check for wham usage in eval * add sample saving during eval * bug fixes * added predictions info to the logging * fixed id for overlap test * cutting sample before saving * fixed l2i sampling rate * fixed random seed so eval will match * running on full set * faithfulness fix * remove pdb * fix smoothgrad and IG * fix nmf for pre-training * removed nmf reconstructions * truncated gaussian fix for smoothgrad * fix nans in sensitivity * better l2i psi network * saving to a different folder. helps not overriding experiments.. * fix l2i * fix csv logging of exps * add guided backprop * added gradcam. guided backprop and guided gradcam need debugging * l2i encoder 1D * mel only - ao * eval for mel only * changed logging to simple write * hardcoded checkpoint - to run on cc * save everything in one folder * remove joblib import * fixed eval? * fix eval again.. * maybe now? * trying on cc * add eval_outdir * runs full eval * l2i with updated psi * update gitignore * l2i logging different loss values * add us8k classifier * us8k interpretations * fixed guided backprop and guided gradcam * add shap * normalizing shap attributions * adding us8k prepare in interp.. * eval on ID * fixed backward compatibility * added multiclass classification * eval xplorer v1 * eval xplorer v2 * implemented multi label interpretation * update the loss function in multilabel interpretations * evaluation explorer - minor fixes * add roar * roar test * just removing a print... 
* add roar script * adding the user study parsing script * savefigs * fix to roar hparam * minor * extract samples for user study * fix bug roar * fixed roar * fix another copy-paste error * MRT eval * roar with random baseline * fix np seed * computes mrt metrics * saving masks for mrt viz * remove rand baseline roar * abs * gradcam eval * fix class * add mrt to l2i * train piq us8k * param in mrt_evaluator * add viz * adding the latest * fixing path problems for multilabelstuff * changed the loss function to output 10 masks * more standard maskout term * changed encoder loading to local * added accuracy computation * removed unnecessary evaluation methods * added all ones mask and average energy computation * fixed the bug for whitenoise * pushing eval later * l2i new ood * removing useless files * cleaning up classification as well * removing useless hparams in interpret * more useless files * old linters * fix paths * fix paths * update Cnn14 * restored old piq file * wham on PIQ * Adding LMAC - needs refactor (#5) * WHAM-ing the data * AO on conv2d classifier * added interpretability metrics * fix debug steps -- updated * minor to train_piq * fix saving interpretations * add wham! for L2I * fix l2i eval * add NCC * cross correlation w/ batching * checked crosscor * finish finetuning script * switch to l1 * linters * add binarized oracle w/ BCE * fix compute loss in finetuning while saving samples * comparison script * fix 0dB mixtures * add original wav to comparison * just path to new classifier * just committing new checkpoint for L2I * add NMF image logging for debug * fix bug in viz L2I * log the number of finetuning masks * lower crosscor thr * fix acc * align L2I debugging w/ PIQ script * fixed accuracy computation for L2I * L2I with variable number of components (K=200) * debugging l2i... * update hparams * fixed oracle source * fixed wrong sources and running finetuning experiments.. 
* add AST as classifier * hparams ast -- still not converging * add ast augmentation * update training script after merge * with augmentations is better * just pushing hparams * classification with CE * conv2d fix for CE * playing with AST augmentation * fixed thresholding * starting to experiment with no wham noise stuff * add wham noise option in classifier training, dot prod correlation in finetuning * single mask training * added zero grad * added the entropy loss * implemented a psi function for cnn14 * Update README.md * added stft-mel transformation learning * add latest eval setup - working on gradient-based * removed unused brain -- was causing issues in weights loading.. * training l2i on this classifier * add l2i eval -- removing mosaic; not well defined in the case of L2I * removed old png file * debugging eval weight loading.. * was always using vq * fixed eval AO * fixed eval -- now everything's fine also for L2I * better numerical stability * handling quantus assertionerror * add saliency from captum * updated smoothgrad for captum * added norm to saliency * IG from captum * starting gradient-base eval on cnn14... * commit before merge 
--------- Co-authored-by: Cem Subakan <csubakan@gmail.com> Co-authored-by: Francesco Paissan <fpaissan@cedar1.cedar.computecanada.ca> * removed useless code. needs to be modified to run with self.interpret_sample * parent class and piq mods * fix fn names * simplify viz * move data prep function * L2I with parent class * removed 1 decoderator * commenting viz_ints. 
need std * unifying viz * change fn call * removed abstract class * disable viz_ints * rm bl comp * l2i viz * remove l2i fid * add lens * removed some metrics * extra_metric fix * removed another metric * removed another metric * starting to std viz * inp fid * fix ic * removing metrics as they will be compute elsewhere * viz piq * viz piq remove mask_ll * uniform piq viz * PIQ fits parent class * starting to unify metrics eval * fixed metrics -- missing SPS and COMP * linters * lmac into template * update lmac hparams * minor * not converging * converging now * computing metrics * computing extra metrics * extra metrics for l2i * starting SPS and COMP * Adds quantus SPS and COMP metrics to the refactoring code (#6) * starting to add quantus metrics * add sps and com * quantus metrics L2I * add quantus reqs * removed unused file * still throws strange error * ood eval * fixed paddedbatch stuff * eval L2I * remove useless files * using right wham preparation * removing model wrapper as it is not needed * fix ID samples * fix linters * model finetuning test * pretrained_PIQ -> pretrained_interpreter * update README.md * added README instructions for training with WHAM! * removing the dataset tag on experiment name * Fix Checks (#8) * Skip lazy imports when the caller is inspect.py This avoids having certain inspect functions import our lazy modules when we don't want them to. `getframeinfo` in particular appears to do it, and this gets called by PyTorch at some point. IPython might also be doing it but autocomplete still seems to work. This does not appear to break anything. Added test for hyperpyyaml to ensure we're not breaking that. 
* SSL_Semantic_Token _ new PR (#2509) * remove unnecassry files and move to dasb * remove extra recepie from test * update ljspeech qunatization recepie * add discrete_ssl and remove extra files * fix precommit * update kmeans and add tokeizer for postprocessing * fix precommit * Update discrete_ssl.py * fix clone warning --------- Co-authored-by: Mirco Ravanelli <mirco.ravanelli@gmail.com> * _ensure_module Raises docstring * Expose `ensure_module` so that docs get generated for it This is already an internal class anyway, and this is safe to call. * Update actions/setup-python * Use `uv` in test CI + merge some dep installs The consequence is faster dependency installation. Merging some of the dependency installs helps avoid some packages being reinstalled from one line to the next. Additionally, CPU versions are specified when relevant, to avoid downloading CUDA stuff the CI can't use anyway. * Use `uv` in doc CI + merge some dep installs Similar rationale as for the test CI * Parallelize doc generation with Sphinx This does not affect the entire doc generation process but should allow some minor multithreading even with the 2-core CI workers. * Enable `uv` caching on the test CI * Enable `uv` caching on the docs CI * CTC-only training recipes for LibriSpeech (code from Samsung AI Cambridge) (#2290) CTC-only pre-training of conformer and branchformer. --------- Co-authored-by: Shucong Zhang/Embedded AI /SRUK/Engineer/Samsung Electronics <s1.zhang@sruk-ccn4.eu.corp.samsungelectronics.net> Co-authored-by: Adel Moumen <adelmoumen.pro@gmail.com> Co-authored-by: Adel Moumen <88119391+Adel-Moumen@users.noreply.github.com> Co-authored-by: Parcollet Titouan <titouan.parcollet@univ-avignon.fr> * Update CommonVoice transformer recipes (code from Samsung AI Center Cambridge) (#2465) * Update CV transformer recipes to match latest results with conformer. 
--------- Co-authored-by: Titouan Parcollet/Embedded AI /SRUK/Engineer/Samsung Electronics <t.parcollet@sruk-ccn4.eu.corp.samsungelectronics.net> Co-authored-by: Mirco Ravanelli <mirco.ravanelli@gmail.com> Co-authored-by: Adel Moumen <adelmoumen.pro@gmail.com> * Whisper improvements: flash attention, KV caching, lang_id, translation, training... (#2450) Whisper improvements: - flash attention - kv caching - lang identifaction - translation - finetuning amelioration ... and more ... * Update README.md * precommit * update zed download link (#2514) * `RelPosEncXL` refactor and precision fixes (#2498) * Add `RelPosEncXL.make_pe`, rework precision handling * Rework RelPosEncXL output dtype selection * Fix in-place input normalization when using `sentence`/`speaker` norm (#2504) * fix LOCAL_RANK to be RANK in if_main_process (#2506) * Fix Separation and Enhancement recipes behavior when NaN encountered (#2524) * Fix Separation and Enhancement recipes behavior when NaN encountered * Formatting using precommit hooks * Lock torch version in requirements.txt (#2528) * Fix compatibility for torchaudio versions without `.io` (#2532) This avoids having the Python interpreter attempt to resolve the type annotation directly. 
* fix docstrings * consistency tests - classification * consistency tests - classification * consistency tests - interpret * default to no wham * fix after tests pass * fix after tests pass * tests after that * fix consistency --------- Co-authored-by: asu <sdelang@sdelang.fr> Co-authored-by: Pooneh Mousavi <moosavi.pooneh@gmail.com> Co-authored-by: Mirco Ravanelli <mirco.ravanelli@gmail.com> Co-authored-by: shucongzhang <104781888+shucongzhang@users.noreply.github.com> Co-authored-by: Shucong Zhang/Embedded AI /SRUK/Engineer/Samsung Electronics <s1.zhang@sruk-ccn4.eu.corp.samsungelectronics.net> Co-authored-by: Adel Moumen <adelmoumen.pro@gmail.com> Co-authored-by: Adel Moumen <88119391+Adel-Moumen@users.noreply.github.com> Co-authored-by: Parcollet Titouan <titouan.parcollet@univ-avignon.fr> Co-authored-by: Parcollet Titouan <parcollet.titouan@gmail.com> Co-authored-by: Titouan Parcollet/Embedded AI /SRUK/Engineer/Samsung Electronics <t.parcollet@sruk-ccn4.eu.corp.samsungelectronics.net> Co-authored-by: Yingzhi WANG <41187612+BenoitWang@users.noreply.github.com> Co-authored-by: Peter Plantinga <plantinga.peter@protonmail.com> Co-authored-by: Séverin <123748182+SevKod@users.noreply.github.com> * added wham hparams to vit.yaml * added focalnet wham hyperparams * add eval info * add automatic wham download * additional instructions on README * wham prepare uses explicit parameters * wham docstrings * edited the instructions on different contamination types * removing the table * revert changes to gitignore * added comments on how to specify custom model * precommit hooks * fixed eval.py bug and more instructions in README.md * remove checkpoint to avoid loading from exp folder * load pretrained interpreter * save always during test * remove checkpointer call in eval.py * added few more explanations for l2i * fixed the nmf dictionary error * fix viz argument for l2i * added a comment for WHAM! 
noise * setting the wham to False in vit and focalnet recipes * fixed the faithfulness computation in PIQ and added AD AG AI COMPS SPS * minor documentation improvements * fixing the bug in SPS computation * formatting * Update README.md * set manifest preparation to True * fix device (not to add in yaml as it is a runnuing hparam) * added the missing docstrings for complexity sparseness faithfulness * fixed the header in eval.py * added missing l2i command in train_l2i.py * fixes to train_lmac.py * description for classifier_temp * added comments for pretrained_interpreter and ljspeech_path * updated README to have more information on how to use LJSpeech * added information for piq_vit.yaml and piq_focalnet.yaml * added more explanation for LJSpeech downloading * added missing use_melspectra_log1p attribute to piq_vit.yaml and piq_focalnet.yaml * added an assert in eval.py for the pretrained path * updates to the readme, added table, updated l2i to print quantus metrics * Update README.md * added the description of pretained_interpreter in README.md. 
* fixed the problem in vit * fixing l2i tests * fixed ESC50.csv * fixed the yaml tets * added links to files * fixed docstring tests * bug fix on psi model * removing the classes from PIQ.py * fixes in L2I psi classes * handling sps comp exceptions * added the dropbox links * Update README.md --------- Co-authored-by: Cem Subakan <csubakan@gmail.com> Co-authored-by: Francesco Paissan <fpaissan@cedar1.cedar.computecanada.ca> Co-authored-by: asu <sdelang@sdelang.fr> Co-authored-by: Pooneh Mousavi <moosavi.pooneh@gmail.com> Co-authored-by: Mirco Ravanelli <mirco.ravanelli@gmail.com> Co-authored-by: shucongzhang <104781888+shucongzhang@users.noreply.github.com> Co-authored-by: Shucong Zhang/Embedded AI /SRUK/Engineer/Samsung Electronics <s1.zhang@sruk-ccn4.eu.corp.samsungelectronics.net> Co-authored-by: Adel Moumen <adelmoumen.pro@gmail.com> Co-authored-by: Adel Moumen <88119391+Adel-Moumen@users.noreply.github.com> Co-authored-by: Parcollet Titouan <titouan.parcollet@univ-avignon.fr> Co-authored-by: Parcollet Titouan <parcollet.titouan@gmail.com> Co-authored-by: Titouan Parcollet/Embedded AI /SRUK/Engineer/Samsung Electronics <t.parcollet@sruk-ccn4.eu.corp.samsungelectronics.net> Co-authored-by: Yingzhi WANG <41187612+BenoitWang@users.noreply.github.com> Co-authored-by: Peter Plantinga <plantinga.peter@protonmail.com> Co-authored-by: Séverin <123748182+SevKod@users.noreply.github.com>
1 parent 0691acd commit 2f9e2fd

32 files changed: +2583 −1164 lines

recipes/ESC50/classification/README.md

Lines changed: 26 additions & 2 deletions
@@ -1,10 +1,22 @@
 # Sound Classification - ESC50 Dataset
 
 This recipe trains a classifier for the ESC50 multiclass sound classification dataset.
-It is mainly adapted from the Speechbrain UrbanSound8k recipe.
+
+The task involves classifying audio sounds into 50 different categories. These categories are divided into the following groups:
+
+- Animals
+- Natural soundscapes and water sounds
+- Human, non-speech sounds
+- Interior/domestic sounds
+- Exterior/urban noises
 
 The scripts offer the possibility to train both with log-spectra and log-mel audio features.
 
+## Dataset Download
+
+The ESC50 dataset will be automatically downloaded when running the recipe. If you prefer to download it manually, please visit: [https://github.com/karolpiczak/ESC-50](https://github.com/karolpiczak/ESC-50)
+
 ---------------------------------------------------------------------------------------------------------
 
 ## Installing Extra Dependencies
@@ -29,6 +41,8 @@ This script trains a [CNN14 model](https://arxiv.org/abs/1912.10211) on the ESC5
 python train.py hparams/cnn14.yaml --data_folder /yourpath/ESC50
 ```
 
+The dataset will be automatically downloaded at the specified data folder.
+
 ---------------------------------------------------------------------------------------------------------
 
 ### Conv2D
@@ -61,6 +75,16 @@ python train.py hparams/vit.yaml --data_folder /yourpath/ESC50
 
 ---------------------------------------------------------------------------------------------------------
 
+### To train with WHAM! noise
+
+In order to train the classifier with WHAM! noise, you need to download the WHAM! noise dataset from [here](http://wham.whisper.ai/).
+Then, you can train your classifier with the following command:
+
+```shell
+python train.py hparams/modelofchoice.yaml --data_folder /yourpath/ESC50 --add_wham_noise True --wham_folder /yourpath/wham_noise
+```
+
 ## Results
 
 | Hyperparams file | Accuracy (%) | Training time | HuggingFace link | Model link | GPUs |
@@ -139,4 +163,4 @@ If you use **SpeechBrain**, please cite:
 - Code: https://github.com/speechbrain/speechbrain/
 - HuggingFace: https://huggingface.co/speechbrain/
 
----------------------------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------------------------
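The new WHAM! option mixes each training batch with real noise recordings. As a rough, illustrative sketch of what mixing noise into a waveform at a target SNR involves (this is not the recipe's actual `combine_batches` implementation; the helper name and SNR handling here are assumptions):

```python
import numpy as np

def mix_at_snr(wav, noise, snr_db):
    # Scale the noise so that the mixture has the requested
    # signal-to-noise ratio, then add it to the clean waveform.
    p_sig = np.mean(wav ** 2)
    p_noise = np.mean(noise ** 2)
    gain = np.sqrt(p_sig / (p_noise * 10 ** (snr_db / 10)))
    return wav + gain * noise

rng = np.random.default_rng(0)
wav = rng.standard_normal(16000)    # 1 s of "signal" at 16 kHz
noise = rng.standard_normal(16000)  # 1 s of noise
mix = mix_at_snr(wav, noise, snr_db=5.0)
```

In the recipe itself, the mixing is handled by `combine_batches` from `wham_prepare.py`, driven by the `add_wham_noise` and `wham_folder` hyperparameters.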
Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
 matplotlib
+pandas
 scikit-learn
 torchvision
+transformers
+wget

recipes/ESC50/classification/hparams/cnn14.yaml

Lines changed: 19 additions & 9 deletions
@@ -4,7 +4,7 @@
 #
 # Authors:
 #  * Cem Subakan 2022, 2023
-#  * Francesco Paissan 2022, 2023
+#  * Francesco Paissan 2022, 2023, 2024
 #  (based on the SpeechBrain UrbanSound8k recipe)
 # #################################
 
@@ -16,11 +16,20 @@ __set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
 data_folder: !PLACEHOLDER # e.g., /localscratch/ESC-50-master
 audio_data_folder: !ref <data_folder>/audio
 
-experiment_name: cnn14-esc50
+experiment_name: !ref cnn14-esc50
 output_folder: !ref ./results/<experiment_name>/<seed>
 save_folder: !ref <output_folder>/save
 train_log: !ref <output_folder>/train_log.txt
 
+add_wham_noise: False
+test_only: False
+
+wham_folder: null # Set it if add_wham_noise is True.
+wham_audio_folder: !ref <wham_folder>/tr
+
+
+sample_rate: 16000
+signal_length_s: 5
 
@@ -47,9 +56,7 @@ lr: 0.0002
 base_lr: 0.00000001
 max_lr: !ref <lr>
 step_size: 65000
-sample_rate: 44100
 
-device: "cpu"
 
 # Feature parameters
 n_mels: 80
@@ -58,6 +65,7 @@ right_frames: 0
 deltas: False
 
 use_melspectra: True
+use_log1p_mel: True
 
 # Number of classes
 out_n_neurons: 50
@@ -84,10 +92,9 @@ embedding_model: !new:speechbrain.lobes.models.Cnn14.Cnn14
     mel_bins: !ref <n_mels>
     emb_dim: 2048
 
-classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
-    input_size: 2048
-    out_neurons: !ref <out_n_neurons>
-    lin_blocks: 1
+classifier: !new:torch.nn.Linear
+    in_features: 2048
+    out_features: !ref <out_n_neurons>
 
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>
@@ -107,6 +114,7 @@ compute_fbank: !new:speechbrain.processing.features.Filterbank
     n_mels: 80
     n_fft: !ref <n_fft>
     sample_rate: !ref <sample_rate>
+    log_mel: False
 
 modules:
     compute_stft: !ref <compute_stft>
@@ -145,7 +153,9 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
     counter: !ref <epoch_counter>
 
 use_pretrained: True
-# If you do not want to use the pretrained encoder you can simply delete pretrained_encoder field.
+# If you do not want to use the pretrained encoder
+# you can simply delete pretrained_encoder field,
+# or set use_pretrained=False
 embedding_model_path: speechbrain/cnn14-esc50/embedding_model.ckpt
 pretrained_encoder: !new:speechbrain.utils.parameter_transfer.Pretrainer
     collect_in: !ref <save_folder>
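The classifier head above changes from SpeechBrain's ECAPA `Classifier` block to a plain `torch.nn.Linear` over the 2048-dimensional CNN14 embedding. A minimal numpy sketch of what that affine head computes (shapes only; the weights here are random stand-ins, not trained parameters):

```python
import numpy as np

rng = np.random.default_rng(42)

# CNN14 pools its feature map into a (batch, 2048) embedding; the new head
# is a single affine map to the 50 ESC50 classes, like torch.nn.Linear.
emb_dim, n_classes, batch = 2048, 50, 4
W = 0.01 * rng.standard_normal((n_classes, emb_dim))  # weight (random stand-in)
b = np.zeros(n_classes)                               # bias

embeddings = rng.standard_normal((batch, emb_dim))
logits = embeddings @ W.T + b
print(logits.shape)  # (4, 50)
```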

recipes/ESC50/classification/hparams/conv2d.yaml

Lines changed: 12 additions & 1 deletion
@@ -16,11 +16,15 @@ __set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
 data_folder: !PLACEHOLDER # e.g., /localscratch/ESC-50-master
 audio_data_folder: !ref <data_folder>/audio
 
+wham_folder: null # Set it if add_wham_noise is True
+wham_audio_folder: !ref <wham_folder>/tr
+
 experiment_name: conv2dv2_classifier-16k
 output_folder: !ref ./results/<experiment_name>/<seed>
 save_folder: !ref <output_folder>/save
 train_log: !ref <output_folder>/train_log.txt
 
+test_only: False
 
 # Tensorboard logs
 use_tensorboard: False
@@ -48,8 +52,10 @@ base_lr: 0.000002
 max_lr: !ref <lr>
 step_size: 65000
 sample_rate: 16000
+signal_length_s: 5
+
+add_wham_noise: False
 
-device: "cpu"
 
 # Feature parameters
 n_mels: 80
@@ -65,6 +71,7 @@ dataloader_options:
 
 use_pretrained: True
 use_melspectra: False
+use_log1p_mel: False
 embedding_model: !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2
     dim: 256
 
@@ -73,6 +80,10 @@ classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
     out_neurons: !ref <out_n_neurons>
     lin_blocks: 1
 
+#classifier: !new:torch.nn.Linear
+#    in_features: 256
+#    out_features: !ref <out_n_neurons>
+
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

recipes/ESC50/classification/hparams/focalnet.yaml

Lines changed: 11 additions & 0 deletions
@@ -22,6 +22,15 @@ output_folder: !ref ./results/<experiment_name>/<seed>
 save_folder: !ref <output_folder>/save
 train_log: !ref <output_folder>/train_log.txt
 
+add_wham_noise: False
+test_only: False
+
+wham_folder: null # Set it if add_wham_noise is True
+wham_audio_folder: !ref <wham_folder>/tr
+
+use_melspectra: False
+use_log1p_mel: False
+
 # Tensorboard logs
 use_tensorboard: False
 tensorboard_logs_folder: !ref <output_folder>/tb_logs/
@@ -49,6 +58,8 @@ max_lr: !ref <lr>
 step_size: 65000
 sample_rate: 16000
 
+signal_length_s: 5
+
 # Number of classes
 out_n_neurons: 50
 

recipes/ESC50/classification/hparams/vit.yaml

Lines changed: 10 additions & 0 deletions
@@ -22,6 +22,14 @@ output_folder: !ref ./results/<experiment_name>/<seed>
 save_folder: !ref <output_folder>/save
 train_log: !ref <output_folder>/train_log.txt
 
+add_wham_noise: False
+use_melspectra: False
+use_log1p_mel: False
+test_only: False
+
+wham_folder: null # Set it if add_wham_noise is True
+wham_audio_folder: !ref <wham_folder>/tr
+
 # Tensorboard logs
 use_tensorboard: False
 tensorboard_logs_folder: !ref <output_folder>/tb_logs/
@@ -47,7 +55,9 @@ lr: 0.0002
 base_lr: 0.00000001
 max_lr: !ref <lr>
 step_size: 65000
+
 sample_rate: 16000
+signal_length_s: 5
 
 # Number of classes
 out_n_neurons: 50

recipes/ESC50/classification/train.py

Lines changed: 54 additions & 15 deletions
@@ -20,11 +20,14 @@
 
 import numpy as np
 import torch
+import torch.nn.functional as F
 import torchaudio
 import torchvision
 from confusion_matrix_fig import create_cm_fig
+from esc50_prepare import prepare_esc50
 from hyperpyyaml import load_hyperpyyaml
 from sklearn.metrics import confusion_matrix
+from wham_prepare import combine_batches, prepare_wham
 
 import speechbrain as sb
 from speechbrain.utils.distributed import run_on_main
@@ -42,18 +45,23 @@ def compute_forward(self, batch, stage):
         if hasattr(self.hparams, "augmentation") and stage == sb.Stage.TRAIN:
             wavs, lens = self.hparams.augmentation(wavs, lens)
 
-        # Extract features
+        # augment batch with WHAM!
+        if hasattr(self.hparams, "add_wham_noise"):
+            if self.hparams.add_wham_noise:
+                wavs = combine_batches(wavs, iter(self.hparams.wham_dataset))
+
         X_stft = self.modules.compute_stft(wavs)
-        X_stft_power = sb.processing.features.spectral_magnitude(
+        net_input = sb.processing.features.spectral_magnitude(
            X_stft, power=self.hparams.spec_mag_power
        )
        if (
            hasattr(self.hparams, "use_melspectra")
            and self.hparams.use_melspectra
        ):
-            net_input = self.modules.compute_fbank(X_stft_power)
-        else:
-            net_input = torch.log1p(X_stft_power)
+            net_input = self.modules.compute_fbank(net_input)
+
+        if (not self.hparams.use_melspectra) or self.hparams.use_log1p_mel:
+            net_input = torch.log1p(net_input)
 
         # Embeddings + sound classifier
         if hasattr(self.modules.embedding_model, "config"):
@@ -80,11 +88,18 @@ def compute_forward(self, batch, stage):
         else:
             # SpeechBrain model
             embeddings = self.modules.embedding_model(net_input)
+            if isinstance(embeddings, tuple):
+                embeddings, _ = embeddings
+
             if embeddings.ndim == 4:
                 embeddings = embeddings.mean((-1, -2))
 
+        # run through classifier
         outputs = self.modules.classifier(embeddings)
 
+        if outputs.ndim == 2:
+            outputs = outputs.unsqueeze(1)
+
         return outputs, lens
 
     def compute_objectives(self, predictions, batch, stage):
@@ -93,7 +108,17 @@ def compute_objectives(self, predictions, batch, stage):
         uttid = batch.id
         classid, _ = batch.class_string_encoded
 
-        loss = self.hparams.compute_cost(predictions, classid, lens)
+        # Target augmentation
+        N_augments = int(predictions.shape[0] / classid.shape[0])
+        classid = torch.cat(N_augments * [classid], dim=0)
+
+        # loss = self.hparams.compute_cost(predictions.squeeze(1), classid, lens)
+        target = F.one_hot(
+            classid.squeeze(), num_classes=self.hparams.out_n_neurons
+        )
+        loss = (
+            -(F.log_softmax(predictions.squeeze(1), 1) * target).sum(1).mean()
+        )
 
         if stage != sb.Stage.TEST:
             if hasattr(self.hparams.lr_annealing, "on_batch_end"):
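The rewritten loss computes cross-entropy by hand: one-hot targets multiplied by log-softmax scores, summed over classes and averaged over the (augmentation-replicated) batch. A small numpy sketch showing that this matches the usual negative log-likelihood of the true class:

```python
import numpy as np

def manual_ce(logits, labels, n_classes):
    # Numerically stable log-softmax, then one-hot * log-prob,
    # summed over classes and averaged over the batch.
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    target = np.eye(n_classes)[labels]  # one-hot targets
    return -(log_probs * target).sum(axis=1).mean()

logits = np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 3.0]])
labels = np.array([0, 2])
loss = manual_ce(logits, labels, n_classes=3)
```

Because the one-hot target zeroes out every class but the true one, the sum over classes just picks the true-class log-probability, so this is ordinary cross-entropy written explicitly.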
@@ -378,8 +403,6 @@ def label_pipeline(class_string):
             hparams["tensorboard_logs_folder"]
         )
 
-    from esc50_prepare import prepare_esc50
-
     run_on_main(
         prepare_esc50,
         kwargs={
@@ -399,6 +422,18 @@ def label_pipeline(class_string):
     datasets, label_encoder = dataio_prep(hparams)
     hparams["label_encoder"] = label_encoder
 
+    if "wham_folder" in hparams:
+        hparams["wham_dataset"] = prepare_wham(
+            hparams["wham_folder"],
+            hparams["add_wham_noise"],
+            hparams["sample_rate"],
+            hparams["signal_length_s"],
+            hparams["wham_audio_folder"],
+        )
+
+        if hparams["wham_dataset"] is not None:
+            assert hparams["signal_length_s"] == 5, "Fix wham sig length!"
+
     class_labels = list(label_encoder.ind2lab.values())
     print("Class Labels:", class_labels)
@@ -411,17 +446,21 @@ def label_pipeline(class_string):
     )
 
     # Load pretrained encoder if it exists in the yaml file
+    if not hasattr(ESC50_brain.modules, "embedding_model"):
+        ESC50_brain.hparams.embedding_model.to(ESC50_brain.device)
+
     if "pretrained_encoder" in hparams and hparams["use_pretrained"]:
         run_on_main(hparams["pretrained_encoder"].collect_files)
         hparams["pretrained_encoder"].load_collected()
 
-    ESC50_brain.fit(
-        epoch_counter=ESC50_brain.hparams.epoch_counter,
-        train_set=datasets["train"],
-        valid_set=datasets["valid"],
-        train_loader_kwargs=hparams["dataloader_options"],
-        valid_loader_kwargs=hparams["dataloader_options"],
-    )
+    if not hparams["test_only"]:
+        ESC50_brain.fit(
+            epoch_counter=ESC50_brain.hparams.epoch_counter,
+            train_set=datasets["train"],
+            valid_set=datasets["valid"],
+            train_loader_kwargs=hparams["dataloader_options"],
+            valid_loader_kwargs=hparams["dataloader_options"],
+        )
 
     # Load the best checkpoint for evaluation
     test_stats = ESC50_brain.evaluate(
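The `wham_folder` handling above only builds a noise dataset when one is configured. A hedged pure-Python sketch of that guard (the real `prepare_wham` lives in `wham_prepare.py`; that it returns `None` when WHAM! is disabled is an assumption inferred from the `is not None` check in the script):

```python
def maybe_prepare_wham(hparams):
    # Build a WHAM! dataset only when the hparams ask for it; otherwise
    # return None, mirroring the `is not None` guard in the training script.
    if "wham_folder" not in hparams or hparams["wham_folder"] is None:
        return None
    if not hparams.get("add_wham_noise", False):
        return None
    # stand-in for the real dataset object
    return {"folder": hparams["wham_folder"],
            "length_s": hparams["signal_length_s"]}
```

With `wham_folder: null` and `add_wham_noise: False` (the defaults in the updated yamls), this path is a no-op and training proceeds on clean audio.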
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+../wham_prepare.py
