Skip to content

Commit 1e35179

Browse files
committed
[SSD/TF] Updating for Ampere
1 parent 31ca062 commit 1e35179

File tree

16 files changed

+462
-61
lines changed

16 files changed

+462
-61
lines changed

TensorFlow/Detection/SSD/Dockerfile

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1-
FROM nvcr.io/nvidia/tensorflow:19.05-py3 as base
1+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
2+
FROM ${FROM_IMAGE_NAME}
23

3-
FROM base as sha
44

5-
RUN mkdir /sha
6-
RUN cat `cat HEAD | cut -d' ' -f2` > /sha/repo_sha
7-
8-
FROM base as final
95

106
WORKDIR /workdir
117

8+
RUN export DEBIAN_FRONTEND=noninteractive \
9+
&& apt-get update \
10+
&& apt-get install -y --no-install-recommends \
11+
libpmi2-0-dev \
12+
&& rm -rf /var/lib/apt/lists/*
13+
1214
RUN PROTOC_VERSION=3.0.0 && \
1315
PROTOC_ZIP=protoc-${PROTOC_VERSION}-linux-x86_64.zip && \
1416
curl -OL https://github.com/google/protobuf/releases/download/v$PROTOC_VERSION/$PROTOC_ZIP && \
@@ -18,6 +20,7 @@ RUN PROTOC_VERSION=3.0.0 && \
1820
COPY requirements.txt .
1921
RUN pip install Cython
2022
RUN pip install -r requirements.txt
23+
RUN pip --no-cache-dir --no-cache install 'git+https://github.com/NVIDIA/dllogger'
2124

2225
WORKDIR models/research/
2326
COPY models/research/ .
@@ -26,6 +29,5 @@ ENV PYTHONPATH="/workdir/models/research/:/workdir/models/research/slim/:$PYTHON
2629

2730
COPY examples/ examples
2831
COPY configs/ configs/
32+
COPY qa/ qa/
2933
COPY download_all.sh download_all.sh
30-
31-
COPY --from=sha /sha .

TensorFlow/Detection/SSD/README.md

Lines changed: 316 additions & 34 deletions
Large diffs are not rendered by default.

TensorFlow/Detection/SSD/examples/SSD320_FP16_1GPU.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
CKPT_DIR=${1:-"/results/SSD320_FP16_1GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_full_1gpus.config"
1717

18-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
19-
2018
TENSOR_OPS=0
2119
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2220
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -26,4 +24,5 @@ time python -u ./object_detection/model_main.py \
2624
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
2725
--model_dir=${CKPT_DIR} \
2826
--alsologtostder \
27+
--amp \
2928
"${@:3}"

TensorFlow/Detection/SSD/examples/SSD320_FP16_1GPU_BENCHMARK.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ CKPT_DIR=${1:-"/results/SSD320_FP16_1GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_bench.config"
1717
GPUS=1
1818

19-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
20-
2119
TENSOR_OPS=0
2220
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2321
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -27,6 +25,7 @@ TRAIN_LOG=$(python -u ./object_detection/model_main.py \
2725
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
2826
--model_dir=${CKPT_DIR} \
2927
--alsologtostder \
28+
--amp \
3029
"${@:3}" 2>&1)
3130
PERF=$(echo "$TRAIN_LOG" | sed -n 's|.*global_step/sec: \(\S\+\).*|\1|p' | python -c "import sys; x = sys.stdin.readlines(); x = [float(a) for a in x[int(len(x)*3/4):]]; print(32*$GPUS*sum(x)/len(x), 'img/s')")
3231

TensorFlow/Detection/SSD/examples/SSD320_FP16_4GPU.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ CKPT_DIR=${1:-"/results/SSD320_FP16_4GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_full_4gpus.config"
1717
GPUS=4
1818

19-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
20-
2119
TENSOR_OPS=0
2220
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2321
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -37,4 +35,5 @@ time mpirun --allow-run-as-root \
3735
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
3836
--model_dir=${CKPT_DIR} \
3937
--alsologtostder \
38+
--amp \
4039
"${@:3}"

TensorFlow/Detection/SSD/examples/SSD320_FP16_4GPU_BENCHMARK.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ CKPT_DIR=${1:-"/results/SSD320_FP16_4GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_bench.config"
1717
GPUS=4
1818

19-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
20-
2119
TENSOR_OPS=0
2220
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2321
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -37,6 +35,7 @@ TRAIN_LOG=$(mpirun --allow-run-as-root \
3735
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
3836
--model_dir=${CKPT_DIR} \
3937
--alsologtostder \
38+
--amp \
4039
"${@:3}" 2>&1)
4140
PERF=$(echo "$TRAIN_LOG" | sed -n 's|.*global_step/sec: \(\S\+\).*|\1|p' | python -c "import sys; x = sys.stdin.readlines(); x = [float(a) for a in x[int(len(x)*3/4):]]; print(32*$GPUS*sum(x)/len(x), 'img/s')")
4241

TensorFlow/Detection/SSD/examples/SSD320_FP16_8GPU.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ CKPT_DIR=${1:-"/results/SSD320_FP16_8GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_full_8gpus.config"
1717
GPUS=8
1818

19-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
20-
2119
TENSOR_OPS=0
2220
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2321
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -39,4 +37,5 @@ time mpirun --allow-run-as-root \
3937
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
4038
--model_dir=${CKPT_DIR} \
4139
--alsologtostder \
40+
--amp \
4241
"${@:3}" 2>&1 | tee $CKPT_DIR/train_log

TensorFlow/Detection/SSD/examples/SSD320_FP16_8GPU_BENCHMARK.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ CKPT_DIR=${1:-"/results/SSD320_FP16_8GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_bench.config"
1717
GPUS=8
1818

19-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
20-
2119
TENSOR_OPS=0
2220
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2321
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -37,6 +35,7 @@ TRAIN_LOG=$(mpirun --allow-run-as-root \
3735
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
3836
--model_dir=${CKPT_DIR} \
3937
--alsologtostder \
38+
--amp \
4039
"${@:3}" 2>&1)
4140
PERF=$(echo "$TRAIN_LOG" | sed -n 's|.*global_step/sec: \(\S\+\).*|\1|p' | python -c "import sys; x = sys.stdin.readlines(); x = [float(a) for a in x[int(len(x)*3/4):]]; print(32*$GPUS*sum(x)/len(x), 'img/s')")
4241

TensorFlow/Detection/SSD/examples/SSD320_inference.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,26 @@
1919
from time import time
2020

2121
import tensorflow as tf
22+
import dllogger
2223

2324
from object_detection import model_hparams
2425
from object_detection import model_lib
26+
from object_detection.utils.exp_utils import setup_dllogger
2527

28+
import numpy as np
2629

2730
flags.DEFINE_string('checkpoint_dir', None, 'Path to directory holding a checkpoint. If '
2831
'`checkpoint_dir` is not provided, benchmark is running on random model')
2932
flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config file.')
33+
flags.DEFINE_string("raport_file", default="summary.json",
34+
help="Path to dlloger json")
3035
flags.DEFINE_integer('warmup_iters', 100, 'Number of iterations skipped during benchmark')
3136
flags.DEFINE_integer('benchmark_iters', 300, 'Number of iterations measured by benchmark')
3237
flags.DEFINE_integer('batch_size', 1, 'Number of inputs processed paralelly')
38+
flags.DEFINE_list("percentiles", default=['90', '95', '99'],
39+
help="percentiles for latency confidence intervals")
40+
41+
3342
FLAGS = flags.FLAGS
3443

3544
flags.mark_flag_as_required('pipeline_config_path')
@@ -58,6 +67,7 @@ def benchmark_input_fn(params={}):
5867
class TimingHook(tf.train.SessionRunHook):
5968
def __init__(self):
6069
super(TimingHook, self).__init__()
70+
setup_dllogger(enabled=True, filename=FLAGS.raport_file)
6171
self.times = []
6272

6373
def before_run(self, *args, **kwargs):
@@ -73,13 +83,23 @@ def after_run(self, *args, **kwargs):
7383
self.times.append(time() - self.start_time)
7484
self.log_progress()
7585

76-
def collect_result(self):
77-
return FLAGS.batch_size * FLAGS.benchmark_iters / sum(self.times[FLAGS.benchmark_iters:])
78-
7986
def end(self, *args, **kwargs):
8087
super(TimingHook, self).end(*args, **kwargs)
88+
throughput = sum([1/x for x in self.times[FLAGS.warmup_iters:]]) * FLAGS.batch_size / FLAGS.benchmark_iters
89+
latency_avg = 1000 * sum(self.times[FLAGS.warmup_iters:]) / FLAGS.benchmark_iters
90+
latency_data = 1000 * np.array(self.times[FLAGS.warmup_iters:])
91+
summary = {
92+
'infer_throughput': throughput,
93+
'eval_avg_latency': latency_avg
94+
}
8195
print()
82-
print('Benchmark result:', self.collect_result(), 'img/s')
96+
print('Benchmark result:', throughput, 'img/s')
97+
for p in FLAGS.percentiles:
98+
p = int(p)
99+
tf.logging.info("Latency {}%: {:>4.2f} ms".format(
100+
p, np.percentile(latency_data, p)))
101+
summary[f'eval_{p}%_latency'] = np.percentile(latency_data, p)
102+
dllogger.log(step=tuple(), data=summary)
83103

84104

85105
def main(unused_argv):
527 Bytes
Loading

0 commit comments

Comments (0)