Make run_on_main and main_process_only return the result to all processes (#2943)

rogiervd · Rogier van Dalen · web-flow · commit 2eb47df951c8 · 2025-07-09T15:52:10.000-04:00
Co-authored-by: Rogier van Dalen <r.vandalen@samsung.com>
diff --git a/speechbrain/utils/distributed.py b/speechbrain/utils/distributed.py
@@ -104,6 +104,11 @@ def run_on_main(
         Keyword args to pass to post_func.
     run_post_on_main : bool
         Whether to run post_func on main process as well. (default: False)
+
+    Returns
+    -------
+    On all processes: the value that func returned, when it ran on the main
+    process.
     """
     # Handle the mutable data types' default args:
     if args is None:
@@ -115,7 +120,7 @@ def run_on_main(
     if post_kwargs is None:
         post_kwargs = {}
 
-    main_process_only(func)(*args, **kwargs)
+    result = main_process_only(func)(*args, **kwargs)
     ddp_barrier()
 
     if post_func is not None:
@@ -128,6 +133,8 @@ def run_on_main(
                 post_func(*post_args, **post_kwargs)
             ddp_barrier()
 
+    return result
+
 
 def is_distributed_initialized() -> bool:
     r"Returns whether the current system is distributed."
@@ -171,16 +178,19 @@ def main_process_only(function):
     r"""Function decorator to ensure the function runs only on the main process.
     This is useful for things like saving to the filesystem or logging
     to a web address where you only want it to happen on a single process.
+    The function will return the result computed on the main process to all
+    processes.
     """
 
     @wraps(function)
     def main_proc_wrapped_func(*args, **kwargs):
         """This decorated function runs only if this is the main process."""
         with MainProcessContext():
             if if_main_process():
-                return function(*args, **kwargs)
+                result = function(*args, **kwargs)
             else:
-                return None
+                result = None
+        return ddp_broadcast(result)
 
     return main_proc_wrapped_func
 
diff --git a/tests/unittests/test_distributed.py b/tests/unittests/test_distributed.py
@@ -0,0 +1,93 @@
+"""
+Test for distributed.py.
+"""
+
+import operator
+
+import torch
+
+from speechbrain.utils import distributed
+
+
+def mock_initialise_process_group(rank: int, world_size: int, tmpdir):
+    """
+    Pretend to run under "torchrun".
+    """
+    import os
+
+    os.environ["RANK"] = str(rank)
+    os.environ["LOCAL_RANK"] = str(rank)
+    sync_file = f"file://{tmpdir}/sync"
+    torch.distributed.init_process_group(
+        "gloo", rank=rank, world_size=world_size, init_method=sync_file
+    )
+
+
+# Test @main_process_only: a single call with a return value.
+
+
+@distributed.main_process_only
+def return_rank_345(rank, world_size):
+    # This should run on the main process only.
+    assert rank == 0
+    return 345 + rank
+
+
+def return_ranks_345_in_mock_process(rank, world_size, tmpdir):
+    mock_initialise_process_group(rank, world_size, tmpdir)
+    result = return_rank_345(rank, world_size)
+    assert result == 345
+
+
+def test_main_process_only(tmpdir):
+    world_size = 2
+    torch.multiprocessing.spawn(
+        return_ranks_345_in_mock_process,
+        (world_size, tmpdir),
+        world_size,
+        join=True,
+    )
+
+
+# Test @main_process_only: a recursive call.
+
+
+@distributed.main_process_only
+def fibonacci(n):
+    if n == 0 or n == 1:
+        return 1
+    return fibonacci(n - 2) + fibonacci(n - 1)
+
+
+def check_fibonacci(rank, world_size, tmpdir):
+    mock_initialise_process_group(rank, world_size, tmpdir)
+    assert fibonacci(0) == 1
+    assert fibonacci(1) == 1
+    assert fibonacci(2) == 2
+    assert fibonacci(3) == 3
+    assert fibonacci(4) == 5
+
+
+def test_main_process_only_nested(tmpdir):
+    world_size = 2
+    torch.multiprocessing.spawn(
+        check_fibonacci, (world_size, tmpdir), world_size, join=True
+    )
+
+
+# Test run_on_main.
+
+
+def check_add_in_mock_process(rank, world_size, tmpdir, i, j):
+    mock_initialise_process_group(rank, world_size, tmpdir)
+    assert distributed.run_on_main(operator.add, args=(i, j)) == i + j
+
+
+def test_run_on_main(tmpdir):
+    world_size = 2
+    torch.multiprocessing.spawn(
+        check_add_in_mock_process,
+        (world_size, tmpdir, 23, 54),
+        world_size,
+        join=True,
+    )