NVIDIA
diff --git a/‎TensorFlow/Classification/ConvNets/export_frozen_graph.py‎
Lines changed: 1 addition & 3 deletions b/‎TensorFlow/Classification/ConvNets/export_frozen_graph.py‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎TensorFlow/Classification/ConvNets/inference.py‎
Lines changed: 134 additions & 0 deletions b/‎TensorFlow/Classification/ConvNets/inference.py‎
Lines changed: 134 additions & 0 deletions
diff --git a/‎TensorFlow/Classification/ConvNets/main.py‎
Lines changed: 6 additions & 8 deletions b/‎TensorFlow/Classification/ConvNets/main.py‎
Lines changed: 6 additions & 8 deletions
diff --git a/‎TensorFlow/Classification/ConvNets/model/blocks/conv2d_block.py‎
Lines changed: 4 additions & 4 deletions b/‎TensorFlow/Classification/ConvNets/model/blocks/conv2d_block.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎TensorFlow/Classification/ConvNets/model/resnet.py‎
Lines changed: 3 additions & 5 deletions b/‎TensorFlow/Classification/ConvNets/model/resnet.py‎
Lines changed: 3 additions & 5 deletions
diff --git a/‎TensorFlow/Classification/ConvNets/resnet50v1.5/README.md‎
Lines changed: 5 additions & 2 deletions b/‎TensorFlow/Classification/ConvNets/resnet50v1.5/README.md‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎TensorFlow/Classification/ConvNets/resnext101-32x4d/README.md‎
Lines changed: 0 additions & 1 deletion b/‎TensorFlow/Classification/ConvNets/resnext101-32x4d/README.md‎
Lines changed: 0 additions & 1 deletion
@@ -22,7 +22,7 @@
 
 import tensorflow as tf
 
-import horovod.tensorflow as hvd
+from utils import hvd_wrapper as hvd
 from model import resnet
 
 tf.app.flags.DEFINE_string(
@@ -75,8 +75,6 @@
 
 
 def main(_):
-  
-  # Initialize Horovod (TODO: Remove dependency of horovod for freezing graphs)
   hvd.init()
 
   if not FLAGS.output_file:
 
@@ -0,0 +1,134 @@
+import argparse
+import os
+import pathlib
+import time
+import tempfile
+
+import tensorflow as tf
+import numpy as np
+
+from tensorflow.python.compiler.tensorrt import trt_convert as trt
+
+import dllogger
+
+from runtime import runner_utils
+from runtime import runner
+from model.resnet import model_architectures
+from utils import data_utils
+from utils import hvd_wrapper as hvd
+
+# Scratch directory that receives the TF-TRT converted SavedModel (see main()).
+# mkdtemp() runs at import time, so the directory is created even when the
+# --tf-trt flag is not given.
+OUTPUT_SAVED_MODEL_PATH = tempfile.mkdtemp(prefix="tftrt-converted")
+# Number of inference steps between intermediate throughput log entries.
+LOG_FREQUENCY = 100
+
+def argument_parser() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+
+    exclusive_args = parser.add_mutually_exclusive_group()
+    exclusive_args.add_argument("--model", type=str, default=None, help="Saved model location to use for inference")
+    exclusive_args.add_argument("--architecture", type=str, choices=model_architectures.keys())
+
+    parser.add_argument("--log-path", type=str, default="./log.json", help="Path to log file")
+    parser.add_argument("--tf-trt", action="store_true", default=False, help="Use TF-TRT for inference")
+    parser.add_argument("--amp", action="store_true", default=False, help="Use AMP for inference")
+    parser.add_argument("--data-dir", type=str, required=False, 
+                        default=None, help="Localization of validation data")
+    parser.add_argument("--batch-size", type=int, default=1, help="Batch size for inference")
+
+    return parser.parse_args()
+
+def main(args: argparse.Namespace):
+    hvd.init()
+
+    dllogger.init(backends=[
+        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.log_path),
+        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
+    ])
+    dllogger.log(data=vars(args), step='PARAMETER')
+
+    if args.model is None:
+        saved_model_to_load = tempfile.mkdtemp(prefix="tftrt-savedmodel")
+        r = runner.Runner(n_classes=1001, architecture=args.architecture, use_tf_amp=args.amp, 
+            model_dir=saved_model_to_load)
+        r.train("batch", 1, 1, args.batch_size, is_benchmark=True)
+        r.evaluate("batch", 1, args.batch_size, export_dir=saved_model_to_load, 
+            is_benchmark=True)
+
+        saved_model_to_load = r.exported_path.decode("utf-8")
+    else:
+        saved_model_to_load = args.model
+
+    output_tensor_name = "y_preds_ref:0" if not args.tf_trt else "ArgMax:0"
+    batch_size = args.batch_size
+
+    if args.tf_trt:
+        converter = trt.TrtGraphConverter(input_saved_model_dir=str(saved_model_to_load),
+                                          precision_mode="FP16" if args.amp else "FP32")
+        converter.convert()
+        converter.save(OUTPUT_SAVED_MODEL_PATH)
+        saved_model_to_load = OUTPUT_SAVED_MODEL_PATH
+    elif args.amp:
+        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
+
+    if args.data_dir is not None:
+        filenames, _, num_steps, _, _ = runner_utils.parse_tfrecords_dataset(
+                    data_dir=str(args.data_dir),
+                    mode="validation",
+                    iter_unit="epoch",
+                    num_iter=1,
+                    global_batch_size=batch_size,
+                )
+
+
+        dataset = data_utils.get_tfrecords_input_fn(filenames=filenames,
+                                                    batch_size=batch_size,
+                                                    height=224,
+                                                    width=224,
+                                                    training=False,
+                                                    distort_color=False,
+                                                    num_threads=1,
+                                                    deterministic=True)
+        iterator = dataset.make_initializable_iterator()
+        next_item = iterator.get_next()
+    else:
+        num_steps=60000 / batch_size
+    
+
+    with tf.Session() as sess:
+        if args.data_dir is not None:
+            sess.run(iterator.initializer)
+        tf.saved_model.loader.load(sess, 
+            [tf.saved_model.tag_constants.SERVING],
+            str(saved_model_to_load))
+
+        try:
+            start_time = time.time()
+            last_time = start_time
+            image_processed = 0
+            image_correct = 0
+
+            for samples_processed in range(int(num_steps)):
+                if args.data_dir is not None:
+                    next_batch_image, next_batch_target = sess.run(next_item)
+                else:
+                    if samples_processed == 0:
+                        next_batch_image = np.random.normal(size=(batch_size, 224, 224, 3))
+                        next_batch_target = np.random.randint(0, 1000, size=(batch_size,))
+                output = sess.run([output_tensor_name], feed_dict={"input_tensor:0": next_batch_image})
+                image_processed += args.batch_size
+                image_correct += np.sum(output == next_batch_target)
+
+                if samples_processed % LOG_FREQUENCY == 0 and samples_processed != 0:
+                    current_time = time.time()
+                    current_throughput = LOG_FREQUENCY * batch_size / (current_time - last_time)
+                    dllogger.log(step=(0, samples_processed), data={"throughput": current_throughput})
+                    last_time = current_time
+
+        except tf.errors.OutOfRangeError:
+            pass
+        finally:
+            dllogger.log(step=tuple(), data={"throughput": image_processed / (last_time - start_time), 
+                                             "accuracy": image_correct / image_processed})
+
+
+# Script entry point: parse the command line and run the benchmark.
+if __name__ == "__main__":
+    main(argument_parser())
@@ -22,10 +22,9 @@
 
 import tensorflow as tf
 
-import horovod.tensorflow as hvd
+from utils import hvd_wrapper as hvd
 import dllogger
 
-from utils import hvd_utils
 from runtime import Runner
 from model.resnet import model_architectures
 
@@ -36,7 +35,7 @@
     tf.logging.set_verbosity(tf.logging.ERROR)
 
     FLAGS = parse_cmdline(model_architectures.keys())
-    hvd.init()
+    hvd.init(True)
 
     if hvd.rank() == 0:
         log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
@@ -100,11 +99,10 @@
 
     if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:
 
-        if FLAGS.mode == 'inference_benchmark' and hvd_utils.is_using_hvd():
+        if FLAGS.mode == 'inference_benchmark' and hvd.size() > 1:
             raise NotImplementedError("Only single GPU inference is implemented.")
 
-        elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
-
+        elif hvd.rank() == 0:
             runner.evaluate(iter_unit=FLAGS.iter_unit if FLAGS.mode != "train_and_evaluate" else "epoch",
                             num_iter=FLAGS.num_iter if FLAGS.mode != "train_and_evaluate" else 1,
                             warmup_steps=FLAGS.warmup_steps,
@@ -124,10 +122,10 @@
         if not os.path.isfile(FLAGS.to_predict):
             raise ValueError("Only prediction on single images is supported!")
 
-        if hvd_utils.is_using_hvd():
+        if hvd.size() > 1:
             raise NotImplementedError("Only single GPU inference is implemented.")
 
-        elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:
+        else:
             runner.predict(FLAGS.to_predict,
                            quantize=FLAGS.quantize,
                            symmetric=FLAGS.symmetric,
 
@@ -64,10 +64,10 @@ def conv2d_block(
                 trainable=is_training,
                 dtype=tf.float32)
             net = tf.nn.conv2d(inputs,
-                                      group_filter,
-                                      strides=strides,
-                                      padding='SAME',
-                                      data_format=data_format)
+                               group_filter,
+                               strides=strides,
+                               padding='SAME',
+                               data_format=data_format)
         if use_batch_norm:
             net = layers.batch_norm(
                 net,
 
@@ -19,15 +19,13 @@
 
 import tensorflow as tf
 
-import horovod.tensorflow as hvd
+from utils import hvd_wrapper as hvd
 import dllogger
 
 from model import layers
 from model import blocks
 
 from utils import var_storage
-from utils import hvd_utils
-
 from utils.data_utils import normalized_inputs
 
 from utils.learning_rate import learning_rate_scheduler
@@ -337,8 +335,8 @@ def loss_filter_fn(name):
                     if params["apply_loss_scaling"]:
                         optimizer = FixedLossScalerOptimizer(optimizer, scale=params["loss_scale"])
 
-                    if hvd_utils.is_using_hvd():
-                        optimizer = hvd.DistributedOptimizer(optimizer)
+                    if hvd.size() > 1:
+                        optimizer = hvd.hvd_global_object.DistributedOptimizer(optimizer)
 
                     update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                     if mode != tf.estimator.ModeKeys.TRAIN:
 
@@ -276,7 +276,6 @@ The `utils/` directory contains the following utility modules:
  - `cmdline_helper.py`: helper module for command line processing
  - `data_utils.py`: module defining input data pipelines
  - `dali_utils.py`: helper module for DALI 
- - `hvd_utils.py`: helper module for Horovod
  - `image_processing.py`: image processing and data augmentation functions
  - `learning_rate.py`: definition of used learning rate schedule
  - `optimizers.py`: definition of used custom optimizers
@@ -447,7 +446,11 @@ To run inference on a single example with a checkpoint and a model script, use:
 
 `python main.py --mode predict --model_dir <path to model> --to_predict <path to image> --results_dir <path to results>`
 
-The optional `--xla` and `--amp` flags control XLA and AMP during inference.
+The optional `--xla` and `--amp` flags control XLA and AMP during inference. To run inference using TF-TRT, please use the following command:
+
+`python inference.py --model <path to model> --tf-trt --batch-size <inference_batch_size> --data-dir <path to data>`
+
+The optional `--amp` flag controls AMP during inference.
 
 ## Performance
 
 
@@ -283,7 +283,6 @@ The `utils/` directory contains the following utility modules:
  - `cmdline_helper.py`: helper module for command line processing
  - `data_utils.py`: module defining input data pipelines
  - `dali_utils.py`: helper module for DALI 
- - `hvd_utils.py`: helper module for Horovod
  - `image_processing.py`: image processing and data augmentation functions
  - `learning_rate.py`: definition of used learning rate schedule
  - `optimizers.py`: definition of used custom optimizers