Commit e470c21: Updating RN50/MxNet
1 parent: f2fe090


47 files changed: 3205 additions, 1482 deletions
MxNet/Classification/RN50v1.5/Dockerfile

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+FROM nvcr.io/nvidia/mxnet:19.07-py3
+COPY . /workspace/rn50
+WORKDIR /workspace/rn50

MxNet/Classification/RN50v1.5/LICENSE

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+
                                  Apache License
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/

MxNet/Classification/RN50v1.5/README.md

Lines changed: 614 additions & 127 deletions
Large diffs are not rendered by default.

MxNet/Classification/RN50v1.5/__init__.py

Whitespace-only changes.

MxNet/Classification/RN50v1.5/benchmark.py

File mode changed: 100644 → 100755
Lines changed: 66 additions & 31 deletions
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,72 +20,105 @@
 import tempfile
 import json
 import os
+import traceback
+import numpy as np
 from collections import OrderedDict
 from subprocess import Popen
 
-parser = argparse.ArgumentParser(description='Benchmark')
+def int_list(x):
+    return list(map(int, x.split(',')))
+
+parser = argparse.ArgumentParser(description='Benchmark',
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser.add_argument('--executable', default='./runner', help='path to runner')
-parser.add_argument('-n', '--ngpus', metavar='N1,[N2,...]',
+parser.add_argument('-o', '--output', metavar='OUT', required=True, help="path to benchmark report")
+parser.add_argument('-n', '--ngpus', metavar='N1,[N2,...]', type=int_list,
                     required=True, help='numbers of gpus separated by comma')
-parser.add_argument('-b', '--batch-sizes', metavar='B1,[B2,...]',
+parser.add_argument('-b', '--batch-sizes', metavar='B1,[B2,...]', type=int_list,
                     required=True, help='batch sizes separated by comma')
 parser.add_argument('-i', '--benchmark-iters', metavar='I',
                     type=int, default=100, help='iterations')
 parser.add_argument('-e', '--epochs', metavar='E',
                     type=int, default=1, help='number of epochs')
 parser.add_argument('-w', '--warmup', metavar='N',
                     type=int, default=0, help='warmup epochs')
-parser.add_argument('-o', '--output', metavar='OUT', required=True, help="path to benchmark report")
-parser.add_argument('--only-inference', action='store_true', help="benchmark inference only")
+parser.add_argument('--timeout', metavar='T',
+                    type=str, default='inf', help='timeout for each run')
+parser.add_argument('--mode', metavar='MODE', choices=('train_val', 'train', 'val'), default='train_val',
+                    help="benchmark mode")
 args, other_args = parser.parse_known_args()
 
-ngpus = list(map(int, args.ngpus.split(',')))
-batch_sizes = list(map(int, args.batch_sizes.split(',')))
-
+latency_percentiles = ['avg', 50, 90, 95, 99, 100]
+harmonic_mean_metrics = ['train.total_ips', 'val.total_ips']
 
 res = OrderedDict()
 res['model'] = ''
-res['ngpus'] = ngpus
-res['bs'] = batch_sizes
-if args.only_inference:
-    res['metric_keys'] = ['val.total_ips']
-else:
-    res['metric_keys'] = ['train.total_ips', 'val.total_ips']
+res['ngpus'] = args.ngpus
+res['bs'] = args.batch_sizes
+res['metric_keys'] = []
+if args.mode == 'train' or args.mode == 'train_val':
+    res['metric_keys'].append('train.total_ips')
+    for percentile in latency_percentiles:
+        res['metric_keys'].append('train.latency_{}'.format(percentile))
+if args.mode == 'val' or args.mode == 'train_val':
+    res['metric_keys'].append('val.total_ips')
+    for percentile in latency_percentiles:
+        res['metric_keys'].append('val.latency_{}'.format(percentile))
+
 res['metrics'] = OrderedDict()
 
-for n in ngpus:
+for n in args.ngpus:
     res['metrics'][str(n)] = OrderedDict()
-    for bs in batch_sizes:
+    for bs in args.batch_sizes:
        res['metrics'][str(n)][str(bs)] = OrderedDict()
 
        report_file = args.output + '-{},{}'.format(n, bs)
-        Popen([args.executable, '-n', str(n), '-b', str(bs),
+        Popen(['timeout', args.timeout, args.executable, '-n', str(n), '-b', str(bs),
               '--benchmark-iters', str(args.benchmark_iters),
               '-e', str(args.epochs), '--report', report_file,
-               *([] if not args.only_inference else ['--only-inference']),
-               '--no-metrics'] + other_args, stdout=sys.stderr).wait()
+               '--mode', args.mode, '--no-metrics'] + other_args,
+              stdout=sys.stderr).wait()
+
+        try:
+            for suffix in ['', *['-{}'.format(i) for i in range(1, n)]]:
+                try:
+                    with open(report_file + suffix, 'r') as f:
+                        report = json.load(f)
+                    break
+                except FileNotFoundError:
+                    pass
+            else:
+                with open(report_file, 'r') as f:
+                    report = json.load(f)
 
-        with open(report_file, 'r') as f:
-            report = json.load(f)
+            for metric in res['metric_keys']:
+                if len(report['metrics'][metric]) != args.epochs:
+                    raise ValueError('Wrong number epochs in report')
+                data = report['metrics'][metric][args.warmup:]
+                if metric in harmonic_mean_metrics:
+                    avg = len(data) / sum(map(lambda x: 1 / x, data))
+                else:
+                    avg = np.mean(data)
+                res['metrics'][str(n)][str(bs)][metric] = avg
+        except Exception as e:
+            traceback.print_exc()
 
-        for metric in res['metric_keys']:
-            data = report['metrics'][metric][args.warmup:]
-            avg = len(data) / sum(map(lambda x: 1 / x, data))
-            res['metrics'][str(n)][str(bs)][metric] = avg
+            for metric in res['metric_keys']:
+                res['metrics'][str(n)][str(bs)][metric] = float('nan')
 
 
-column_len = 7
+column_len = 11
 for m in res['metric_keys']:
     print(m, file=sys.stderr)
     print(' ' * column_len, end='|', file=sys.stderr)
-    for bs in batch_sizes:
+    for bs in args.batch_sizes:
        print(str(bs).center(column_len), end='|', file=sys.stderr)
    print(file=sys.stderr)
-    print('-' * (len(batch_sizes) + 1) * (column_len + 1), file=sys.stderr)
-    for n in ngpus:
+    print('-' * (len(args.batch_sizes) + 1) * (column_len + 1), file=sys.stderr)
+    for n in args.ngpus:
        print(str(n).center(column_len), end='|', file=sys.stderr)
-        for bs in batch_sizes:
-            print(str(round(res['metrics'][str(n)][str(bs)][m])).center(column_len), end='|', file=sys.stderr)
+        for bs in args.batch_sizes:
+            print('{:.5g}'.format(res['metrics'][str(n)][str(bs)][m]).center(column_len), end='|', file=sys.stderr)
        print(file=sys.stderr)
    print(file=sys.stderr)
 
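The rewritten aggregation treats throughput and latency differently: per-epoch images/sec metrics ('train.total_ips', 'val.total_ips') are combined with a harmonic mean, while the new latency keys are averaged arithmetically with np.mean. A minimal standalone sketch of that rule follows; the helper name and the sample epoch values are invented for illustration, only the harmonic-vs-arithmetic split mirrors the benchmark.py change above.

# Illustrative only: aggregate() and the sample values are not part of the commit.
import numpy as np

HARMONIC_MEAN_METRICS = ('train.total_ips', 'val.total_ips')

def aggregate(metric, per_epoch_values, warmup=0):
    data = per_epoch_values[warmup:]          # drop warmup epochs, as benchmark.py does
    if metric in HARMONIC_MEAN_METRICS:
        # Throughput: harmonic mean, so slow epochs are not averaged away.
        return len(data) / sum(1.0 / x for x in data)
    # Latency averages/percentiles: plain arithmetic mean across epochs.
    return float(np.mean(data))

print(aggregate('train.total_ips', [1400.0, 1450.0, 1500.0], warmup=1))  # ~1474.6 img/s
print(aggregate('train.latency_95', [0.085, 0.083, 0.082], warmup=1))    # 0.0825 s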

MxNet/Classification/RN50v1.5/benchmarking.py

Lines changed: 5 additions & 2 deletions
@@ -52,11 +52,14 @@ def __next__(self):
     def __getattr__(self, attr):
         return getattr(self.data_iter, attr)
 
-    def get_avg_time_and_clear(self):
+    def get_avg_time(self):
         if self.num <= 1:
             avg = float('nan')
         else:
             avg = self.overall_time / (self.num - 1)
+        return avg
+
+    def reset(self):
         self.overall_time = 0
         self.num = 0
-        return avg
+        self.data_iter.reset()
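This change splits the old get_avg_time_and_clear() into a read-only get_avg_time() and a reset() that also rewinds the wrapped data iterator. A rough sketch of how such a timing wrapper fits together is below; the class name, constructor, and timing bookkeeping are not shown in this diff and are assumptions here, only the get_avg_time()/reset() split is taken from the change above.

# Sketch only: TimingDataIter and its timing logic are hypothetical.
import time

class TimingDataIter:
    """Wraps a data iterator and records the average time between batches."""

    def __init__(self, data_iter):
        self.data_iter = data_iter
        self.overall_time = 0
        self.num = 0
        self.last = None

    def __iter__(self):
        return self

    def __next__(self):
        now = time.time()
        if self.last is not None:
            self.overall_time += now - self.last  # time since the previous batch
        self.last = now
        self.num += 1
        return next(self.data_iter)

    def __getattr__(self, attr):
        # Delegate everything else (batch size, provide_data, ...) to the wrapped iterator.
        return getattr(self.data_iter, attr)

    def get_avg_time(self):
        # The first batch has no predecessor, hence the num - 1 divisor.
        if self.num <= 1:
            return float('nan')
        return self.overall_time / (self.num - 1)

    def reset(self):
        # reset() now also rewinds the wrapped iterator, so one wrapper can be
        # reused across epochs instead of pairing "get average" with "clear".
        self.overall_time = 0
        self.num = 0
        self.last = None
        self.data_iter.reset()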
