Skip to content

Commit e1af187

Browse files
hXl3snv-kkudrynski
authored and committed
[Convnets/TF] Added barrier at the end of multiprocess run to allow step synchronization
1 parent 18e7756 commit e1af187

File tree

5 files changed

+23
-14
lines changed

5 files changed

+23
-14
lines changed

TensorFlow/Classification/ConvNets/main.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,17 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18+
from utils.cmdline_helper import parse_cmdline
19+
from model.resnet import model_architectures
20+
from runtime import Runner
21+
import dllogger
22+
from utils import hvd_wrapper as hvd
23+
import tensorflow as tf
1824
import os
1925

2026
import warnings
2127
warnings.simplefilter("ignore")
2228

23-
import tensorflow as tf
24-
25-
from utils import hvd_wrapper as hvd
26-
import dllogger
27-
28-
from runtime import Runner
29-
from model.resnet import model_architectures
30-
31-
from utils.cmdline_helper import parse_cmdline
3229

3330
if __name__ == "__main__":
3431

@@ -42,7 +39,8 @@
4239
os.makedirs(FLAGS.results_dir, exist_ok=True)
4340

4441
dllogger.init(backends=[
45-
dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=log_path),
42+
dllogger.JSONStreamBackend(
43+
verbosity=dllogger.Verbosity.VERBOSE, filename=log_path),
4644
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
4745
])
4846
else:
@@ -100,7 +98,8 @@
10098
if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:
10199

102100
if FLAGS.mode == 'inference_benchmark' and hvd.size() > 1:
103-
raise NotImplementedError("Only single GPU inference is implemented.")
101+
raise NotImplementedError(
102+
"Only single GPU inference is implemented.")
104103

105104
elif hvd.rank() == 0:
106105
runner.evaluate(iter_unit=FLAGS.iter_unit if FLAGS.mode != "train_and_evaluate" else "epoch",
@@ -114,6 +113,10 @@
114113
symmetric=FLAGS.symmetric,
115114
use_final_conv=FLAGS.use_final_conv,
116115
use_qdq=FLAGS.use_qdq)
116+
if hvd.size() > 1:
117+
# Wait for all processes to finish
118+
from mpi4py import MPI
119+
MPI.COMM_WORLD.Barrier()
117120

118121
if FLAGS.mode == 'predict':
119122
if FLAGS.to_predict is None:
@@ -123,7 +126,8 @@
123126
raise ValueError("Only prediction on single images is supported!")
124127

125128
if hvd.size() > 1:
126-
raise NotImplementedError("Only single GPU inference is implemented.")
129+
raise NotImplementedError(
130+
"Only single GPU inference is implemented.")
127131

128132
else:
129133
runner.predict(FLAGS.to_predict,
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
git+https://github.com/NVIDIA/dllogger#egg=dllogger
2+
mpi4py

TensorFlow/Classification/ConvNets/resnet50v1.5/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,7 @@ on NVIDIA T4 with (1x T4 16G) GPU.
856856
8. August 2020
857857
* Updated command line argument names
858858
* Added support for synthetic dataset with different image size
859-
859+
9. January, 2022
860+
* Added barrier at the end of multiprocess run
860861
### Known issues
861862
Performance without XLA enabled is low due to BN + ReLU fusion bug.

TensorFlow/Classification/ConvNets/resnext101-32x4d/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,7 @@ June 2020
796796
August 2020
797797
- Updated command line argument names
798798
- Added support for synthetic dataset with different image size
799-
799+
January 2022
800+
- Added barrier at the end of multiprocess run
800801
### Known issues
801802
Performance without XLA enabled is low due to BN + ReLU fusion bug.

TensorFlow/Classification/ConvNets/se-resnext101-32x4d/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,8 @@ April 2020
789789
August 2020
790790
- Updated command line argument names
791791
- Added support for synthetic dataset with different image size
792+
January 2022
793+
- Added barrier at the end of multiprocess run
792794

793795
### Known issues
794796
Performance without XLA enabled is low due to BN + ReLU fusion bug.

0 commit comments

Comments
 (0)