[ELECTRA/TF2] Update inference latency (NVIDIA#657)

sharathts · web-flow · commit 446c87887817 · 2020-08-19T20:43:44.000-07:00
* Update inference latency

* Fix inference perf numbers

* Fix latency computation
diff --git a/TensorFlow2/LanguageModeling/ELECTRA/README.md b/TensorFlow2/LanguageModeling/ELECTRA/README.md
@@ -531,16 +531,16 @@ FP16
 | Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
 |------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
 |          1 |             384 |                            178 |            5.630 |            5.500 |            5.555 |            5.608 |
-|        256 |             384 |                            857 |            1.112 |            1.111 |            1.111 |            1.112 |
-|        512 |             384 |                            864 |            1.054 |            1.051 |            1.053 |            1.053 |
+|        256 |             384 |                            857 |           284.67 |          284.416 |          284.416 |           284.67 |
+|        512 |             384 |                            864 |          539.648 |          538.112 |          539.136 |          539.136 |
  
 TF32
  
 | Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
 |------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
 |          1 |             384 |                            123 |            8.186 |            7.995 |            8.078 |            8.152 |
-|        256 |             384 |                            344 |            2.832 |            2.822 |            2.826 |            2.830 |
-|        512 |             384 |                            351 |            2.787 |            2.781 |            2.784 |            2.784 |
+|        256 |             384 |                            344 |          724.992 |          722.432 |          723.456 |           724.48 |
+|        512 |             384 |                            351 |         1426.944 |         1423.872 |         1425.408 |         1425.408 |
  
  
  
@@ -556,17 +556,17 @@ FP16
 | Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
 |------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
 |          1 |             384 |                            141 |            7.100 |            7.071 |            7.081 |            7.091 |
-|        128 |             384 |                            517 |            1.933 |            1.930 |            1.930 |            1.932 |
-|        256 |             384 |                            524 |            1.910 |            1.907 |            1.908 |            1.909 |
+|        128 |             384 |                            517 |          247.424 |           247.04 |           247.04 |          247.296 |
+|        256 |             384 |                            524 |           488.96 |          488.192 |          488.448 |          488.704 |
  
  
 FP32
  
 | Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
 |------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
 |          1 |             384 |                             84 |           11.869 |           11.814 |           11.832 |           11.850 |
-|        128 |             384 |                            117 |            8.548 |            8.527 |            8.529 |            8.537 |
-|        256 |             384 |                            141 |            7.100 |            7.071 |            7.081 |            7.091 |
+|        128 |             384 |                            117 |         1094.144 |         1091.456 |         1091.712 |         1092.736 |
+|        256 |             384 |                            141 |           1817.6 |         1810.176 |         1812.736 |         1815.552 |
  
  
 ##### Inference performance: NVIDIA DGX-2 (1x V100 32GB)
@@ -581,16 +581,16 @@ FP16
 | Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
 |------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
 |          1 |             384 |                            144 |            6.953 |            6.888 |            6.910 |            6.932 |
-|        128 |             384 |                            547 |            1.828 |            1.827 |            1.827 |            1.828 |
-|        256 |             384 |                            557 |            1.795 |            1.792 |            1.793 |            1.794 |
+|        128 |             384 |                            547 |          233.984 |          233.856 |          233.856 |          233.984 |
+|        256 |             384 |                            557 |           459.52 |          458.752 |          459.008 |          459.264 |
  
 FP32
  
 | Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
 |------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
 |          1 |             384 |                             86 |           11.580 |           11.515 |           11.535 |           11.558 |
-|        128 |             384 |                            124 |            8.056 |             8.05 |            8.052 |            8.055 |
-|        256 |             384 |                            125 |            8.006 |            8.002 |            8.004 |            8.005 |
+|        128 |             384 |                            124 |         1031.168 |           1030.4 |         1030.656 |          1031.04 |
+|        256 |             384 |                            125 |         2049.536 |         2048.512 |         2049.024 |          2049.29 |
  
  
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
diff --git a/TensorFlow2/LanguageModeling/ELECTRA/run_tf_squad.py b/TensorFlow2/LanguageModeling/ELECTRA/run_tf_squad.py
@@ -560,7 +560,7 @@ def main():
 
                     infer_time = (time.time() - iter_start)
                     infer_perf_avg.update_state(1. * EVAL_BATCH_SIZE / infer_time)
-                    latency.append(1. * infer_time / EVAL_BATCH_SIZE)
+                    latency.append(infer_time)
 
                     for iter_ in range(input_ids.shape[0]):