|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
1 | 3 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. |
2 | 4 | # |
3 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); |
|
18 | 20 | import tempfile |
19 | 21 | import json |
20 | 22 | import os |
| 23 | +import traceback |
| 24 | +import numpy as np |
21 | 25 | from collections import OrderedDict |
22 | 26 | from subprocess import Popen |
23 | 27 |
|
24 | | -parser = argparse.ArgumentParser(description='Benchmark') |
def int_list(x):
    """Parse a comma-separated string of integers into a list of ints.

    Used as an argparse ``type=`` converter, so failures are reported by
    raising ``ValueError`` (which argparse turns into a usage error).

    Args:
        x: Text such as ``"1,2,4"``. Whitespace around items is ignored and
            empty items (e.g. from a trailing comma) are skipped.

    Returns:
        The parsed integers, in the order they were given.

    Raises:
        ValueError: If an item is not an integer or no integer is present.
    """
    items = (item.strip() for item in x.split(','))
    values = [int(item) for item in items if item]
    if not values:
        # An empty list would silently turn the benchmark loops into no-ops.
        raise ValueError('expected at least one integer, got {!r}'.format(x))
    return values
| 30 | + |
# Command-line interface of the benchmark driver.  Each entry is the tuple of
# option strings followed by the keyword arguments handed to add_argument();
# the order here is the order in which the options show up in --help.
_cli_options = (
    (('--executable',),
     dict(default='./runner', help='path to runner')),
    (('-o', '--output'),
     dict(metavar='OUT', required=True, help="path to benchmark report")),
    (('-n', '--ngpus'),
     dict(metavar='N1,[N2,...]', type=int_list, required=True,
          help='numbers of gpus separated by comma')),
    (('-b', '--batch-sizes'),
     dict(metavar='B1,[B2,...]', type=int_list, required=True,
          help='batch sizes separated by comma')),
    (('-i', '--benchmark-iters'),
     dict(metavar='I', type=int, default=100, help='iterations')),
    (('-e', '--epochs'),
     dict(metavar='E', type=int, default=1, help='number of epochs')),
    (('-w', '--warmup'),
     dict(metavar='N', type=int, default=0, help='warmup epochs')),
    (('--timeout',),
     dict(metavar='T', type=str, default='inf', help='timeout for each run')),
    (('--mode',),
     dict(metavar='MODE', choices=('train_val', 'train', 'val'),
          default='train_val', help="benchmark mode")),
)

# ArgumentDefaultsHelpFormatter appends each option's default to its help text.
parser = argparse.ArgumentParser(
    description='Benchmark',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
for _flags, _kwargs in _cli_options:
    parser.add_argument(*_flags, **_kwargs)

# Options this parser does not recognise are kept in other_args instead of
# being rejected.
args, other_args = parser.parse_known_args()
39 | 50 |
|
# Suffixes of the per-phase 'latency_*' metric keys.
latency_percentiles = ['avg', 50, 90, 95, 99, 100]
# Metrics that are averaged with the harmonic rather than the arithmetic mean.
harmonic_mean_metrics = ['train.total_ips', 'val.total_ips']

# Names of the metrics to collect: for every phase enabled by --mode, the
# throughput key followed by one latency key per entry of latency_percentiles.
_metric_keys = []
for _phase in ('train', 'val'):
    if args.mode in (_phase, 'train_val'):
        _metric_keys.append('{}.total_ips'.format(_phase))
        for _percentile in latency_percentiles:
            _metric_keys.append('{}.latency_{}'.format(_phase, _percentile))

# Skeleton of the benchmark result; 'metrics' starts out empty.
res = OrderedDict([
    ('model', ''),
    ('ngpus', args.ngpus),
    ('bs', args.batch_sizes),
    ('metric_keys', _metric_keys),
    ('metrics', OrderedDict()),
])
53 | 69 |
|
# Run the benchmark executable once for every (GPU count, batch size) pair and
# store the per-metric averages in res['metrics'][str(n)][str(bs)].  A run
# whose report cannot be read or validated gets NaN for every metric, so one
# failing configuration does not abort the whole sweep.
for n in args.ngpus:
    res['metrics'][str(n)] = OrderedDict()
    for bs in args.batch_sizes:
        res['metrics'][str(n)][str(bs)] = OrderedDict()

        report_file = args.output + '-{},{}'.format(n, bs)
        # NOTE(review): assumes a `timeout` utility that accepts the --timeout
        # value (default 'inf') is available on PATH.
        command = ['timeout', args.timeout, args.executable,
                   '-n', str(n), '-b', str(bs),
                   '--benchmark-iters', str(args.benchmark_iters),
                   '-e', str(args.epochs), '--report', report_file,
                   '--mode', args.mode, '--no-metrics'] + other_args
        # The child's stdout is redirected to this script's stderr.
        returncode = Popen(command, stdout=sys.stderr).wait()
        if returncode != 0:
            # Surface the failure instead of dropping the exit status; the
            # report is still looked for below in case the run produced one.
            print('Warning: run with ngpus={}, batch size={} exited with status {}'
                  .format(n, bs, returncode), file=sys.stderr)

        try:
            # The report is looked for under the plain name first and then
            # under the '-1' ... '-(n-1)' suffixed names; the first file that
            # exists is used.
            candidates = [report_file] + [
                '{}-{}'.format(report_file, i) for i in range(1, n)]
            for candidate in candidates:
                try:
                    with open(candidate, 'r') as f:
                        report = json.load(f)
                    break
                except FileNotFoundError:
                    continue
            else:
                raise FileNotFoundError(
                    'No benchmark report found, tried: ' + ', '.join(candidates))

            for metric in res['metric_keys']:
                per_epoch = report['metrics'][metric]
                if len(per_epoch) != args.epochs:
                    raise ValueError(
                        'Wrong number of epochs in report for {}: expected {}, got {}'
                        .format(metric, args.epochs, len(per_epoch)))
                # Leave the warmup epochs out of the average.
                data = per_epoch[args.warmup:]
                if metric in harmonic_mean_metrics:
                    # Harmonic mean.  Empty data or a zero value raises
                    # ZeroDivisionError, which is handled below like any
                    # other failed run.
                    avg = len(data) / sum(1 / x for x in data)
                else:
                    avg = np.mean(data)
                res['metrics'][str(n)][str(bs)][metric] = avg
        except Exception:
            # Deliberately broad: report the problem and carry on with the
            # remaining configurations.
            traceback.print_exc()

            for metric in res['metric_keys']:
                res['metrics'][str(n)][str(bs)][metric] = float('nan')
73 | 108 |
|
74 | 109 |
|
# Write one text table per metric to stderr: batch sizes across the top,
# GPU counts down the side, every cell centred in column_len characters and
# terminated by '|'.  A blank line follows each table.
column_len = 11
_header = ' ' * column_len + '|' + ''.join(
    str(bs).center(column_len) + '|' for bs in args.batch_sizes)
_rule = '-' * ((len(args.batch_sizes) + 1) * (column_len + 1))
for m in res['metric_keys']:
    _lines = [m, _header, _rule]
    for n in args.ngpus:
        _cells = [str(n)]
        for bs in args.batch_sizes:
            # Values are shown with five significant digits.
            _cells.append('{:.5g}'.format(res['metrics'][str(n)][str(bs)][m]))
        _lines.append(''.join(cell.center(column_len) + '|' for cell in _cells))
    # The trailing empty entry yields the blank line after the table.
    _lines.append('')
    print('\n'.join(_lines), file=sys.stderr)
89 | 124 |
|
|
0 commit comments