Skip to content

Commit 1e35179

Browse files
committed
[SSD/TF] Updating for Ampere
1 parent 31ca062 commit 1e35179

File tree

16 files changed

+462
-61
lines changed

16 files changed

+462
-61
lines changed

TensorFlow/Detection/SSD/Dockerfile

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1-
FROM nvcr.io/nvidia/tensorflow:19.05-py3 as base
1+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
2+
FROM ${FROM_IMAGE_NAME}
23

3-
FROM base as sha
44

5-
RUN mkdir /sha
6-
RUN cat `cat HEAD | cut -d' ' -f2` > /sha/repo_sha
7-
8-
FROM base as final
95

106
WORKDIR /workdir
117

8+
RUN export DEBIAN_FRONTEND=noninteractive \
9+
&& apt-get update \
10+
&& apt-get install -y --no-install-recommends \
11+
libpmi2-0-dev \
12+
&& rm -rf /var/lib/apt/lists/*
13+
1214
RUN PROTOC_VERSION=3.0.0 && \
1315
PROTOC_ZIP=protoc-${PROTOC_VERSION}-linux-x86_64.zip && \
1416
curl -OL https://github.com/google/protobuf/releases/download/v$PROTOC_VERSION/$PROTOC_ZIP && \
@@ -18,6 +20,7 @@ RUN PROTOC_VERSION=3.0.0 && \
1820
COPY requirements.txt .
1921
RUN pip install Cython
2022
RUN pip install -r requirements.txt
23+
RUN pip --no-cache-dir --no-cache install 'git+https://github.com/NVIDIA/dllogger'
2124

2225
WORKDIR models/research/
2326
COPY models/research/ .
@@ -26,6 +29,5 @@ ENV PYTHONPATH="/workdir/models/research/:/workdir/models/research/slim/:$PYTHON
2629

2730
COPY examples/ examples
2831
COPY configs/ configs/
32+
COPY qa/ qa/
2933
COPY download_all.sh download_all.sh
30-
31-
COPY --from=sha /sha .

TensorFlow/Detection/SSD/README.md

Lines changed: 316 additions & 34 deletions
Large diffs are not rendered by default.

TensorFlow/Detection/SSD/examples/SSD320_FP16_1GPU.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
CKPT_DIR=${1:-"/results/SSD320_FP16_1GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_full_1gpus.config"
1717

18-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
19-
2018
TENSOR_OPS=0
2119
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2220
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -26,4 +24,5 @@ time python -u ./object_detection/model_main.py \
2624
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
2725
--model_dir=${CKPT_DIR} \
2826
--alsologtostder \
27+
--amp \
2928
"${@:3}"

TensorFlow/Detection/SSD/examples/SSD320_FP16_1GPU_BENCHMARK.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ CKPT_DIR=${1:-"/results/SSD320_FP16_1GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_bench.config"
1717
GPUS=1
1818

19-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
20-
2119
TENSOR_OPS=0
2220
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2321
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -27,6 +25,7 @@ TRAIN_LOG=$(python -u ./object_detection/model_main.py \
2725
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
2826
--model_dir=${CKPT_DIR} \
2927
--alsologtostder \
28+
--amp \
3029
"${@:3}" 2>&1)
3130
PERF=$(echo "$TRAIN_LOG" | sed -n 's|.*global_step/sec: \(\S\+\).*|\1|p' | python -c "import sys; x = sys.stdin.readlines(); x = [float(a) for a in x[int(len(x)*3/4):]]; print(32*$GPUS*sum(x)/len(x), 'img/s')")
3231

TensorFlow/Detection/SSD/examples/SSD320_FP16_4GPU.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ CKPT_DIR=${1:-"/results/SSD320_FP16_4GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_full_4gpus.config"
1717
GPUS=4
1818

19-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
20-
2119
TENSOR_OPS=0
2220
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2321
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -37,4 +35,5 @@ time mpirun --allow-run-as-root \
3735
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
3836
--model_dir=${CKPT_DIR} \
3937
--alsologtostder \
38+
--amp \
4039
"${@:3}"

TensorFlow/Detection/SSD/examples/SSD320_FP16_4GPU_BENCHMARK.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ CKPT_DIR=${1:-"/results/SSD320_FP16_4GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_bench.config"
1717
GPUS=4
1818

19-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
20-
2119
TENSOR_OPS=0
2220
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2321
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -37,6 +35,7 @@ TRAIN_LOG=$(mpirun --allow-run-as-root \
3735
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
3836
--model_dir=${CKPT_DIR} \
3937
--alsologtostder \
38+
--amp \
4039
"${@:3}" 2>&1)
4140
PERF=$(echo "$TRAIN_LOG" | sed -n 's|.*global_step/sec: \(\S\+\).*|\1|p' | python -c "import sys; x = sys.stdin.readlines(); x = [float(a) for a in x[int(len(x)*3/4):]]; print(32*$GPUS*sum(x)/len(x), 'img/s')")
4241

TensorFlow/Detection/SSD/examples/SSD320_FP16_8GPU.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ CKPT_DIR=${1:-"/results/SSD320_FP16_8GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_full_8gpus.config"
1717
GPUS=8
1818

19-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
20-
2119
TENSOR_OPS=0
2220
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2321
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -39,4 +37,5 @@ time mpirun --allow-run-as-root \
3937
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
4038
--model_dir=${CKPT_DIR} \
4139
--alsologtostder \
40+
--amp \
4241
"${@:3}" 2>&1 | tee $CKPT_DIR/train_log

TensorFlow/Detection/SSD/examples/SSD320_FP16_8GPU_BENCHMARK.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ CKPT_DIR=${1:-"/results/SSD320_FP16_8GPU"}
1616
PIPELINE_CONFIG_PATH=${2:-"/workdir/models/research/configs"}"/ssd320_bench.config"
1717
GPUS=8
1818

19-
export TF_ENABLE_AUTO_MIXED_PRECISION=1
20-
2119
TENSOR_OPS=0
2220
export TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
2321
export TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32=${TENSOR_OPS}
@@ -37,6 +35,7 @@ TRAIN_LOG=$(mpirun --allow-run-as-root \
3735
--pipeline_config_path=${PIPELINE_CONFIG_PATH} \
3836
--model_dir=${CKPT_DIR} \
3937
--alsologtostder \
38+
--amp \
4039
"${@:3}" 2>&1)
4140
PERF=$(echo "$TRAIN_LOG" | sed -n 's|.*global_step/sec: \(\S\+\).*|\1|p' | python -c "import sys; x = sys.stdin.readlines(); x = [float(a) for a in x[int(len(x)*3/4):]]; print(32*$GPUS*sum(x)/len(x), 'img/s')")
4241

TensorFlow/Detection/SSD/examples/SSD320_inference.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,26 @@
1919
from time import time
2020

2121
import tensorflow as tf
22+
import dllogger
2223

2324
from object_detection import model_hparams
2425
from object_detection import model_lib
26+
from object_detection.utils.exp_utils import setup_dllogger
2527

28+
import numpy as np
2629

2730
flags.DEFINE_string('checkpoint_dir', None, 'Path to directory holding a checkpoint. If '
2831
'`checkpoint_dir` is not provided, benchmark is running on random model')
2932
flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config file.')
33+
flags.DEFINE_string("raport_file", default="summary.json",
34+
help="Path to dlloger json")
3035
flags.DEFINE_integer('warmup_iters', 100, 'Number of iterations skipped during benchmark')
3136
flags.DEFINE_integer('benchmark_iters', 300, 'Number of iterations measured by benchmark')
3237
flags.DEFINE_integer('batch_size', 1, 'Number of inputs processed paralelly')
38+
flags.DEFINE_list("percentiles", default=['90', '95', '99'],
39+
help="percentiles for latency confidence intervals")
40+
41+
3342
FLAGS = flags.FLAGS
3443

3544
flags.mark_flag_as_required('pipeline_config_path')
@@ -58,6 +67,7 @@ def benchmark_input_fn(params={}):
5867
class TimingHook(tf.train.SessionRunHook):
5968
def __init__(self):
6069
super(TimingHook, self).__init__()
70+
setup_dllogger(enabled=True, filename=FLAGS.raport_file)
6171
self.times = []
6272

6373
def before_run(self, *args, **kwargs):
@@ -73,13 +83,23 @@ def after_run(self, *args, **kwargs):
7383
self.times.append(time() - self.start_time)
7484
self.log_progress()
7585

76-
def collect_result(self):
77-
return FLAGS.batch_size * FLAGS.benchmark_iters / sum(self.times[FLAGS.benchmark_iters:])
78-
7986
def end(self, *args, **kwargs):
8087
super(TimingHook, self).end(*args, **kwargs)
88+
throughput = sum([1/x for x in self.times[FLAGS.warmup_iters:]]) * FLAGS.batch_size / FLAGS.benchmark_iters
89+
latency_avg = 1000 * sum(self.times[FLAGS.warmup_iters:]) / FLAGS.benchmark_iters
90+
latency_data = 1000 * np.array(self.times[FLAGS.warmup_iters:])
91+
summary = {
92+
'infer_throughput': throughput,
93+
'eval_avg_latency': latency_avg
94+
}
8195
print()
82-
print('Benchmark result:', self.collect_result(), 'img/s')
96+
print('Benchmark result:', throughput, 'img/s')
97+
for p in FLAGS.percentiles:
98+
p = int(p)
99+
tf.logging.info("Latency {}%: {:>4.2f} ms".format(
100+
p, np.percentile(latency_data, p)))
101+
summary[f'eval_{p}%_latency'] = np.percentile(latency_data, p)
102+
dllogger.log(step=tuple(), data=summary)
83103

84104

85105
def main(unused_argv):
527 Bytes
Loading

0 commit comments

Comments (0)