Skip to content

Commit 76a056c

Browse files
committed
[VNet/TF] Updating for 20.06 container
1 parent 36f3b1b commit 76a056c

12 files changed

Lines changed: 183 additions & 112 deletions

File tree

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1-
FROM nvcr.io/nvidia/tensorflow:19.11-tf1-py3
1+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
2+
FROM ${FROM_IMAGE_NAME}
23

34
ADD . /workspace/vnet
45
WORKDIR /workspace/vnet
56

67
RUN pip install --upgrade pip
78
RUN pip install --disable-pip-version-check -r requirements.txt
89

10+
ENV TF_XLA_FLAGS="--tf_xla_enable_lazy_compilation=false"
11+
912

TensorFlow/Segmentation/VNet/README.md

Lines changed: 121 additions & 81 deletions
Large diffs are not rendered by default.
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
jobs:
2+
# no AMP
3+
- export TF_ENABLE_AUTO_MIXED_PRECISION=0 && python main.py --data_dir /data/Task04_Hippocampus --model_dir /tmp --exec_mode train_and_evaluate --augment --train_epochs 80 --split_seed 42 --base_lr 0.0001 --train_split 0.9 --log_dir /result/log.json --batch_size 2 --use_xla
4+
# with AMP
5+
- export TF_ENABLE_AUTO_MIXED_PRECISION=0 && python main.py --data_dir /data/Task04_Hippocampus --model_dir /tmp --exec_mode train_and_evaluate --augment --train_epochs 80 --split_seed 42 --base_lr 0.0001 --train_split 0.9 --log_dir /result/log.json --batch_size 2 --use_xla --use_amp
6+
7+
backend:
8+
container: nvcr.io/nvidian/swdl/vnet_tf1:20.06
9+
download_dir: /tmp
10+
hostname: ngc
11+
instance: dgx1v.16g.1.norm
12+
result_dir: /result
13+
datasets:
14+
/data: 36816
15+
16+
reports:
17+
filename: vnet_ngc_conv_1gpu_20.06
18+
types:
19+
- xls

TensorFlow/Segmentation/VNet/examples/vnet_benchmark.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,7 @@
4141
required=True,
4242
type=int)
4343

44-
PARSER.add_argument('--precision',
45-
choices=['fp32', 'fp16'],
46-
required=True,
47-
type=str)
44+
PARSER.add_argument('--amp', dest='use_amp', action='store_true', default=False)
4845

4946

5047
def build_horovod_prefix(gpus):
@@ -65,7 +62,7 @@ def build_command(FLAGS, path_to_main, use_amp):
6562
def main():
6663
FLAGS = PARSER.parse_args()
6764

68-
use_amp = '--use_amp' if FLAGS.precision == 'fp16' else ''
65+
use_amp = '--amp' if FLAGS.use_amp else ''
6966
path_to_main = os.path.join(dirname(dirname(os.path.realpath(__file__))), 'main.py')
7067

7168
cmd = build_command(FLAGS, path_to_main, use_amp)

TensorFlow/Segmentation/VNet/examples/vnet_predict.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,7 @@
3232
required=True,
3333
type=int)
3434

35-
PARSER.add_argument('--precision',
36-
choices=['fp32', 'fp16'],
37-
required=True,
38-
type=str)
35+
PARSER.add_argument('--amp', dest='use_amp', action='store_true', default=False)
3936

4037

4138
def build_command(FLAGS, path_to_main, use_amp):
@@ -50,7 +47,7 @@ def build_command(FLAGS, path_to_main, use_amp):
5047
def main():
5148
FLAGS = PARSER.parse_args()
5249

53-
use_amp = '' if FLAGS.precision == 'fp32' else '--use_amp'
50+
use_amp = '--amp' if FLAGS.use_amp else ''
5451
path_to_main = os.path.join(dirname(dirname(os.path.realpath(__file__))), 'main.py')
5552

5653
cmd = build_command(FLAGS, path_to_main, use_amp)

TensorFlow/Segmentation/VNet/examples/vnet_train.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,8 @@
4646
type=int,
4747
help='Number of epochs for training')
4848

49-
PARSER.add_argument('--precision',
50-
choices=['fp32', 'fp16'],
51-
required=True,
52-
type=str,
53-
help='Arithmetic precision to be used during training')
49+
50+
PARSER.add_argument('--amp', dest='use_amp', action='store_true', default=False)
5451

5552
PARSER.add_argument('--base_lr',
5653
default=0.0001,
@@ -77,7 +74,7 @@ def build_command(FLAGS, path_to_main, use_amp):
7774
def main():
7875
FLAGS = PARSER.parse_args()
7976

80-
use_amp = '' if FLAGS.precision == 'fp32' else '--use_amp'
77+
use_amp = '--amp' if FLAGS.use_amp else ''
8178
path_to_main = os.path.join(dirname(dirname(os.path.realpath(__file__))), 'main.py')
8279

8380
cmd = build_command(FLAGS, path_to_main, use_amp)

TensorFlow/Segmentation/VNet/examples/vnet_train_and_evaluate.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,7 @@
4545
type=int,
4646
help='Number of epochs for training')
4747

48-
PARSER.add_argument('--precision',
49-
choices=['fp32', 'fp16'],
50-
required=True,
51-
type=str,
52-
help='Arithmetic precision to be used during training')
48+
PARSER.add_argument('--amp', dest='use_amp', action='store_true', default=False)
5349

5450
PARSER.add_argument('--base_lr',
5551
default=0.0001,
@@ -76,7 +72,7 @@ def build_command(FLAGS, path_to_main, use_amp):
7672
def main():
7773
FLAGS = PARSER.parse_args()
7874

79-
use_amp = '' if FLAGS.precision == 'fp32' else '--use_amp'
75+
use_amp = '--amp' if FLAGS.use_amp else ''
8076
path_to_main = os.path.join(dirname(dirname(os.path.realpath(__file__))), 'main.py')
8177

8278
cmd = build_command(FLAGS, path_to_main, use_amp)

TensorFlow/Segmentation/VNet/hooks/profiling_hook.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,24 @@ def before_run(self, run_context):
3535
self._timestamps.append(time.time())
3636

3737
def end(self, session):
38-
deltas = [self._timestamps[i + 1] - self._timestamps[i] for i in range(len(self._timestamps) - 1)]
39-
self._logger.log(step=(), data={
40-
'average_throughput_train' if self._training else 'average_throughput_test': self._global_batch_size / np.mean(deltas)})
38+
deltas = np.array([self._timestamps[i + 1] - self._timestamps[i] for i in range(len(self._timestamps) - 1)])
39+
stats = process_performance_stats(np.array(deltas),
40+
self._global_batch_size)
41+
42+
self._logger.log(step=(), data={metric: value for (metric, value) in stats})
4143
self._logger.flush()
44+
45+
46+
def process_performance_stats(timestamps, batch_size):
    """Summarize per-step durations into throughput and latency statistics.

    Args:
        timestamps: Per-step durations in seconds (the caller passes the
            differences between consecutive wall-clock timestamps). May be
            a NumPy array or any sequence of numbers.
        batch_size: Number of samples processed per step.

    Returns:
        A list of ``(metric_name, value)`` tuples where every value is a
        string: the average throughput (samples per second), the average
        latency in milliseconds, and, for the 90%/95%/99% levels, the value
        ``mean + z * std / sqrt(n)`` computed on the latencies.
    """
    # Coerce to an ndarray first: with a plain list, ``1000 * timestamps``
    # would repeat the list instead of scaling its elements.
    durations_s = np.asarray(timestamps)

    timestamps_ms = 1000 * durations_s
    latency_ms = timestamps_ms.mean()
    std = timestamps_ms.std()
    n = np.sqrt(len(timestamps_ms))
    # Mean of the per-step rates (not batch_size divided by the mean latency).
    throughput_imgps = (1000.0 * batch_size / timestamps_ms).mean()

    stats = [("Throughput Avg", str(throughput_imgps)),
             ('Latency Avg:', str(latency_ms))]
    # z-multipliers paired with the confidence level they are reported under.
    for ci, lvl in zip(["90%:", "95%:", "99%:"],
                       [1.645, 1.960, 2.576]):
        stats.append(("Latency_" + ci, str(latency_ms + lvl * std / n)))
    return stats
44.5 KB
Loading

TensorFlow/Segmentation/VNet/main.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,10 @@ def main(_):
8686

8787
gpu_options = tf.GPUOptions()
8888
config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
89-
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
89+
90+
if FLAGS.use_xla:
91+
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
92+
9093
config.gpu_options.allow_growth = True
9194
config.gpu_options.visible_device_list = str(hvd.local_rank())
9295

@@ -135,9 +138,9 @@ def main(_):
135138
steps=dataset.eval_steps,
136139
hooks=[])
137140

138-
DLLogger.log(step=tuple(), data={'background_dice': str(result['background dice'])})
139-
DLLogger.log(step=tuple(), data={'anterior_dice': str(result['Anterior dice'])})
140-
DLLogger.log(step=tuple(), data={'posterior_dice': str(result['Posterior dice'])})
141+
DLLogger.log(step=tuple(), data={'background_dice': str(result['background dice']),
142+
'anterior_dice': str(result['Anterior dice']),
143+
'posterior_dice': str(result['Posterior dice'])})
141144

142145
if 'predict' in FLAGS.exec_mode:
143146
count = 1

0 commit comments

Comments
 (0)