[BERT/PyT] stop and resume, single gpu and timing fixes. (NVIDIA#509)

sharathts · web-flow · commit 9df464f277cc · 2020-05-17T12:46:53.000-07:00
* stop and resume, single gpu and timing fixes.

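* Update utils.py: add a get_world_size() helper that returns 1 when torch.distributed is unavailable or not initialized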

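* accumulation features check: disable allreduce_post_accumulation and allreduce_post_accumulation_fp16 when gradient_accumulation_steps == 1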
diff --git a/PyTorch/LanguageModeling/BERT/run_pretraining.py b/PyTorch/LanguageModeling/BERT/run_pretraining.py
@@ -291,6 +291,10 @@ def setup_training(args):
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl', init_method='env://')
         args.n_gpu = 1
+
+    if args.gradient_accumulation_steps == 1:
+        args.allreduce_post_accumulation = False
+        args.allreduce_post_accumulation_fp16 = False
         
     if is_main_process():
         dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
diff --git a/PyTorch/LanguageModeling/BERT/utils.py b/PyTorch/LanguageModeling/BERT/utils.py
@@ -21,6 +21,13 @@ def get_rank():
         return 0
     return dist.get_rank()
 
+def get_world_size():
+    if not dist.is_available():
+        return 1
+    if not dist.is_initialized():
+        return 1
+    return dist.get_world_size()
+
 def is_main_process():
     return get_rank() == 0
 
@@ -34,4 +41,4 @@ def format_step(step):
         s += "Training Iteration: {} ".format(step[1])
     if len(step) > 2:
         s += "Validation Iteration: {} ".format(step[2])
-    return s
+    return s