Enable cpu/xpu support for the benchmarking suite (deepspeedai#905)

louie-tsai · web-flow · commit b04fedd32665 · 2024-08-14T13:34:42.000-07:00
* enable cpu/xpu support for the benchmarking suite

* fixes according to review feedback
diff --git a/benchmarks/communication/README.md b/benchmarks/communication/README.md
@@ -1,6 +1,6 @@
 # The DeepSpeed Communication Benchmarking Suite
 
-The intent of these benchmarks is to measure communication latency/bw of deepspeed and/or pytorch distributed communication operations at the Python layer. These benchmarks are complementary to C-level comms benchmarks like [OSU Micro-Benchmarks](https://mvapich.cse.ohio-state.edu/benchmarks/) and [NCCL Tests](https://github.com/NVIDIA/nccl-tests) in that users can:
+The intent of these benchmarks is to measure communication latency/bw of deepspeed and/or pytorch distributed communication operations at the Python layer. These benchmarks are complementary to C-level comms benchmarks like [OSU Micro-Benchmarks](https://mvapich.cse.ohio-state.edu/benchmarks/), [NCCL Tests](https://github.com/NVIDIA/nccl-tests), and [oneCCL Benchmark](https://oneapi-src.github.io/oneCCL/benchmark.html) in that users can:
 - Easily debug which layer of the communication software stack hangs or performance degradations originate from.
 - Measure the expected communication performance of either DeepSpeed comms or pure PyTorch distributed
 
@@ -77,6 +77,14 @@ Finally, users can choose specific communication operations to run in `run_all.p
 deepspeed run_all.py --scan --all-reduce --all-to-all --broadcast
 </pre>
 
+## CPU Support
+These benchmarks can also run on other devices, such as Intel CPUs via oneCCL.
+Users just need to append the argument `--device cpu` to any of the Python scripts to run on an Intel CPU.
+For example, to run with a single large message size on an Intel CPU:
+<pre>
+deepspeed all_reduce.py --device cpu
+</pre>
+
 
 # Adding Communication Benchmarks
 
diff --git a/benchmarks/communication/all_gather.py b/benchmarks/communication/all_gather.py
@@ -17,6 +17,9 @@
 
 # Run all_gather and print metrics
 def timed_all_gather(input, output, start_event, end_event, args):
+    if args.device == "cpu":
+        print_rank_0(f"No Event support on CPU to measure time for now")
+        return
     if args.dist == 'torch':
         import torch.distributed as dist
 
@@ -64,8 +67,15 @@ def run_all_gather(local_rank, args):
     global_rank = dist.get_rank()
     world_size = dist.get_world_size()
 
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    if args.device == "xpu":
+        start_event = torch.xpu.Event(enable_timing=True)
+        end_event = torch.xpu.Event(enable_timing=True)
+    elif args.device == "cpu":
+        start_event = torch.cpu.Event()
+        end_event = torch.cpu.Event()
+    else:
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
 
     if args.scan:
         # Create list of message sizes
diff --git a/benchmarks/communication/all_reduce.py b/benchmarks/communication/all_reduce.py
@@ -15,6 +15,9 @@
 
 
 def timed_all_reduce(input, start_event, end_event, args):
+    if args.device == "cpu":
+        print_rank_0(f"No Event support on CPU to measure time for now")
+        return
     if args.dist == 'torch':
         import torch.distributed as dist
     elif args.dist == 'deepspeed':
@@ -60,8 +63,15 @@ def run_all_reduce(local_rank, args):
     world_size = dist.get_world_size()
     global_rank = dist.get_rank()
 
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    if args.device == "xpu":
+        start_event = torch.xpu.Event(enable_timing=True)
+        end_event = torch.xpu.Event(enable_timing=True)
+    elif args.device == "cpu":
+        start_event = torch.cpu.Event()
+        end_event = torch.cpu.Event()
+    else:
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
 
     if args.scan:
         M_LIST = []
diff --git a/benchmarks/communication/all_to_all.py b/benchmarks/communication/all_to_all.py
@@ -15,6 +15,9 @@
 
 
 def timed_all_to_all(input, output, start_event, end_event, args):
+    if args.device == "cpu":
+        print_rank_0(f"No Event support on CPU to measure time for now")
+        return
     if args.dist == 'torch':
         import torch.distributed as dist
     elif args.dist == 'deepspeed':
@@ -59,8 +62,15 @@ def run_all_to_all(local_rank, args):
     # Prepare benchmark header
     print_header(args, 'all_to_all')
 
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    if args.device == "xpu":
+        start_event = torch.xpu.Event(enable_timing=True)
+        end_event = torch.xpu.Event(enable_timing=True)
+    elif args.device == "cpu":
+        start_event = torch.cpu.Event()
+        end_event = torch.cpu.Event()
+    else:
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
 
     if args.scan:
         M_LIST = []
diff --git a/benchmarks/communication/broadcast.py b/benchmarks/communication/broadcast.py
@@ -15,6 +15,9 @@
 
 
 def timed_broadcast(input, start_event, end_event, args):
+    if args.device == "cpu":
+        print_rank_0(f"No Event support on CPU to measure time for now")
+        return
     if args.dist == 'torch':
         import torch.distributed as dist
     elif args.dist == 'deepspeed':
@@ -60,8 +63,15 @@ def run_broadcast(local_rank, args):
     world_size = dist.get_world_size()
     global_rank = dist.get_rank()
 
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    if args.device == "xpu":
+        start_event = torch.xpu.Event(enable_timing=True)
+        end_event = torch.xpu.Event(enable_timing=True)
+    elif args.device == "cpu":
+        start_event = torch.cpu.Event()
+        end_event = torch.cpu.Event()
+    else:
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
 
     if args.scan:
         M_LIST = []
diff --git a/benchmarks/communication/constants.py b/benchmarks/communication/constants.py
@@ -12,4 +12,5 @@
 DEFAULT_UNIT = 'Gbps'
 DEFAULT_DIST = 'deepspeed'
 DEFAULT_MAXSIZE = 24
+DEFAULT_DEVICE = 'cuda'
 TORCH_DISTRIBUTED_DEFAULT_PORT = 29500
diff --git a/benchmarks/communication/pt2pt.py b/benchmarks/communication/pt2pt.py
@@ -15,6 +15,9 @@
 
 
 def timed_pt2pt(input, start_event, end_event, args):
+    if args.device == "cpu":
+        print_rank_0(f"No Event support on CPU to measure time for now")
+        return
     if args.dist == 'torch':
         import torch.distributed as dist
     elif args.dist == 'deepspeed':
@@ -78,8 +81,15 @@ def run_pt2pt(local_rank, args):
     global_rank = dist.get_rank()
     world_size = dist.get_world_size()
 
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    if args.device == "xpu":
+        start_event = torch.xpu.Event(enable_timing=True)
+        end_event = torch.xpu.Event(enable_timing=True)
+    elif args.device == "cpu":
+        start_event = torch.cpu.Event()
+        end_event = torch.cpu.Event()
+    else:
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
 
     if args.scan:
         # Create list of message sizes
diff --git a/benchmarks/communication/utils.py b/benchmarks/communication/utils.py
@@ -108,6 +108,11 @@ def get_bw(comm_op, size, duration, args):
     n = dist.get_world_size()
     tput = 0
     busbw = 0
+
+    if duration == 0:
+        print_rank_0("Error. Duration is 0.")
+        return tput, busbw
+
     if comm_op == "all_to_all":
         tput = (size / duration)
         busbw = (size / duration) * ((n - 1) / n)
@@ -235,4 +240,5 @@ def benchmark_parser():
                         default=.3,
                         help='Proportion of max available GPU memory to use for single-size evals')
     parser.add_argument("--debug", action="store_true", help='Enables all_to_all debug prints')
+    parser.add_argument("--device", type=str, default=DEFAULT_DEVICE, help='target device')
     return parser