Skip to content

Commit c94d5e2

Browse files
Adel-Moumen and Copilot authored
Implement per-key padding configuration in PaddedBatch (#3008)
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent fac0367 commit c94d5e2

2 files changed

Lines changed: 170 additions & 3 deletions

File tree

speechbrain/dataio/batch.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,14 @@ class PaddedBatch:
4343
padding_func : callable, optional
4444
Called with a list of tensors to be padded together. Needs to return
4545
two tensors: the padded data, and another tensor for the data lengths.
46-
padding_kwargs : dict
46+
padding_kwargs : dict, None
4747
(Optional) Extra kwargs to pass to padding_func. E.G. mode, value
48+
This is used as the default padding configuration for all keys.
49+
per_key_padding_kwargs : dict, None
50+
(Optional) Per-key padding configuration. Keys in this dict should match
51+
the keys in the examples. Each value should be a dict with padding parameters
52+
(e.g., {'value': -100, 'mode': 'constant'}). If a key is not in this dict,
53+
the global padding_kwargs will be used.
4854
apply_default_convert : bool
4955
Whether to apply PyTorch default_convert (numpy to torch recursively,
5056
etc.) on all data. Default:True, usually does the right thing.
@@ -111,6 +117,26 @@ class PaddedBatch:
111117
... )
112118
>>> batch.text
113119
[['Hello'], ['How', 'are', 'you?']]
120+
>>> # Per-key padding configuration:
121+
>>> batch = PaddedBatch(
122+
... [
123+
... {
124+
... "wav": torch.tensor([1, 2, 3]),
125+
... "labels": torch.tensor([1, 2]),
126+
... },
127+
... {"wav": torch.tensor([4, 5]), "labels": torch.tensor([3])},
128+
... ],
129+
... per_key_padding_kwargs={
130+
... "wav": {"value": 0},
131+
... "labels": {"value": -100},
132+
... },
133+
... )
134+
>>> batch.wav.data
135+
tensor([[1, 2, 3],
136+
[4, 5, 0]])
137+
>>> batch.labels.data
138+
tensor([[ 1, 2],
139+
[ 3, -100]])
114140
115141
"""
116142

@@ -120,10 +146,15 @@ def __init__(
120146
padded_keys=None,
121147
device_prep_keys=None,
122148
padding_func=batch_pad_right,
123-
padding_kwargs={},
149+
padding_kwargs=None,
150+
per_key_padding_kwargs=None,
124151
apply_default_convert=True,
125152
nonpadded_stack=True,
126153
):
154+
padding_kwargs = padding_kwargs if padding_kwargs is not None else {}
155+
per_key_padding_kwargs = (
156+
per_key_padding_kwargs if per_key_padding_kwargs is not None else {}
157+
)
127158
self.__length = len(examples)
128159
self.__keys = list(examples[0].keys())
129160
self.__padded_keys = []
@@ -138,7 +169,13 @@ def __init__(
138169
):
139170
# Padding and PaddedData
140171
self.__padded_keys.append(key)
141-
padded = PaddedData(*padding_func(values, **padding_kwargs))
172+
173+
# Use per-key padding config if available, otherwise fall back to global padding_kwargs
174+
if key in per_key_padding_kwargs:
175+
key_padding_kwargs = per_key_padding_kwargs[key]
176+
else:
177+
key_padding_kwargs = padding_kwargs
178+
padded = PaddedData(*padding_func(values, **key_padding_kwargs))
142179
setattr(self, key, padded)
143180
else:
144181
# Default PyTorch collate usually does the right thing

tests/unittests/test_batching.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,133 @@ def test_pin_memory():
8080
)
8181
batch.pin_memory()
8282
assert batch.foo.data.is_pinned()
83+
84+
85+
def test_paddedbatch_per_key_padding(device):
    """Check that each key can receive its own padding configuration."""
    from speechbrain.dataio.batch import PaddedBatch

    examples = [
        {
            "wav": torch.tensor([1, 2, 3]).to(device),
            "labels": torch.tensor([1, 2]).to(device),
        },
        {
            "wav": torch.tensor([4, 5]).to(device),
            "labels": torch.tensor([3]).to(device),
        },
    ]

    # Give each key a distinct pad value: 0 for the waveform, -100 for labels.
    per_key_padding_kwargs = {
        "wav": {"value": 0},  # Pad wav with 0
        "labels": {"value": -100},  # Pad labels with -100
    }

    batch = PaddedBatch(examples, per_key_padding_kwargs=per_key_padding_kwargs)

    # "wav" must be padded with 0 while the real samples stay untouched.
    assert torch.all(batch.wav.data[1, 2:] == 0)
    assert torch.all(
        batch.wav.data[0, :3] == torch.tensor([1, 2, 3]).to(device)
    )

    # "labels" must be padded with -100 while the real samples stay untouched.
    assert torch.all(batch.labels.data[1, 1:] == -100)
    assert torch.all(
        batch.labels.data[0, :2] == torch.tensor([1, 2]).to(device)
    )
119+
120+
121+
def test_paddedbatch_mixed_padding_config(device):
    """Check that per-key settings override the global padding_kwargs
    only for the keys they mention; all other keys use the global config."""
    from speechbrain.dataio.batch import PaddedBatch

    examples = [
        {
            "wav": torch.tensor([1, 2, 3]).to(device),
            "labels": torch.tensor([1, 2]).to(device),
            "features": torch.tensor([0.1, 0.2]).to(device),
        },
        {
            "wav": torch.tensor([4, 5]).to(device),
            "labels": torch.tensor([3]).to(device),
            "features": torch.tensor([0.3]).to(device),
        },
    ]

    # Default config applied to every key not listed below.
    padding_kwargs = {"value": 0}
    # Only "labels" gets special padding; "wav"/"features" fall back to global.
    per_key_padding_kwargs = {"labels": {"value": -100}}

    batch = PaddedBatch(
        examples,
        padding_kwargs=padding_kwargs,
        per_key_padding_kwargs=per_key_padding_kwargs,
    )

    # "wav" padded via the global config (0).
    assert torch.all(batch.wav.data[1, 2:] == 0)
    # "labels" padded via the per-key override (-100).
    assert torch.all(batch.labels.data[1, 1:] == -100)
    # "features" padded via the global config (0).
    assert torch.all(batch.features.data[1, 1:] == 0)
160+
161+
162+
def test_paddedbatch_numpy_arrays():
    """Check that numpy inputs are converted to torch tensors and that
    per-key padding still applies after the conversion."""
    from speechbrain.dataio.batch import PaddedBatch

    examples = [
        {"wav": np.array([1, 2, 3]), "labels": np.array([1, 2])},
        {"wav": np.array([4, 5]), "labels": np.array([3])},
    ]
    per_key_padding_kwargs = {"wav": {"value": 0}, "labels": {"value": -100}}

    batch = PaddedBatch(examples, per_key_padding_kwargs=per_key_padding_kwargs)

    # The numpy arrays must come out as torch tensors.
    assert isinstance(batch.wav.data, torch.Tensor)
    assert isinstance(batch.labels.data, torch.Tensor)

    # Each key is padded with its own configured value.
    assert torch.all(batch.wav.data[1, 2:] == 0)
    assert torch.all(batch.labels.data[1, 1:] == -100)
182+
183+
184+
def test_paddedbatch_backward_compatibility(device):
    """Check that the per-key API reproduces the legacy global-kwargs
    behavior when both specify the same pad value."""
    from speechbrain.dataio.batch import PaddedBatch

    examples = [
        {
            "wav": torch.tensor([1, 2, 3]).to(device),
            "labels": torch.tensor([1, 2]).to(device),
        },
        {
            "wav": torch.tensor([4, 5]).to(device),
            "labels": torch.tensor([3]).to(device),
        },
    ]

    # Legacy behavior: one global padding configuration.
    batch_old = PaddedBatch(examples, padding_kwargs={"value": 0})

    # New behavior: the same value supplied per key.
    batch_new = PaddedBatch(
        examples,
        per_key_padding_kwargs={"wav": {"value": 0}, "labels": {"value": 0}},
    )

    # Data and lengths must be identical between the two code paths.
    assert torch.allclose(batch_old.wav.data, batch_new.wav.data)
    assert torch.allclose(batch_old.labels.data, batch_new.labels.data)
    assert torch.allclose(batch_old.wav.lengths, batch_new.wav.lengths)
    assert torch.allclose(batch_old.labels.lengths, batch_new.labels.lengths)

0 commit comments

Comments
 (0)