-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathnvidia_tools.py
More file actions
82 lines (66 loc) · 2.15 KB
/
nvidia_tools.py
File metadata and controls
82 lines (66 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import subprocess
# NGC telemetry is optional: when the `telemetry` module cannot be imported or
# its client cannot be constructed, `ngc_telemetry` is set to False and a
# notice is printed instead of failing the import of this module.
try:
    import telemetry
    ngc_telemetry = telemetry.ApplicationTelemetry()
except Exception:
    # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
    ngc_telemetry = False
    print("Could not load NGC telemetry!")
def push_ngc_telemetry(name, value):
    """Push one metric to NGC telemetry on a best-effort basis.

    Parameters
    ----------
    name:
        Metric name, sent under the ``'metric'`` key.
    value:
        Metric value, sent under the ``'value'`` key.

    Returns
    -------
    None
        Any ordinary error raised while pushing is ignored, including the
        attribute error raised when ``ngc_telemetry`` is the ``False``
        sentinel (telemetry unavailable).
    """
    try:
        ngc_telemetry.metric_push_async({'metric': name, 'value': value})
    except Exception:
        # Telemetry must never break the caller; only KeyboardInterrupt and
        # SystemExit are allowed to propagate.
        pass
def log_ngc(train_metric):
    """Push every entry of ``train_metric`` to NGC telemetry.

    Each key is used as the metric name and the value stored under that key
    as the metric value; one ``push_ngc_telemetry`` call is made per key.
    """
    metric_names = train_metric.keys()
    for metric_name in metric_names:
        metric_value = train_metric[metric_name]
        push_ngc_telemetry(metric_name, metric_value)
def get_gpu_memory_map():
    """Query ``nvidia-smi`` for the memory currently used on each GPU.

    Returns
    -------
    usage: list of str
        One entry per non-blank line printed by ``nvidia-smi``, in output
        order. Units are not stripped (``nounits`` is not requested), so each
        entry is the tool's raw text, typically of the form ``"1024 MiB"``.
        An empty list is returned when the tool prints nothing.

    Raises
    ------
    FileNotFoundError
        If ``nvidia-smi`` cannot be found on the PATH.
    subprocess.CalledProcessError
        If ``nvidia-smi`` exits with a non-zero status.
    """
    result = subprocess.check_output(
        [
            'nvidia-smi', '--query-gpu=memory.used',
            '--format=csv,noheader'
        ], encoding='utf-8')
    # splitlines() + filtering yields [] for empty output instead of [''].
    stripped_lines = (line.strip() for line in result.splitlines())
    return [line for line in stripped_lines if line]
def get_gpu_utilization():
    """Query ``nvidia-smi`` for the current utilization of each GPU.

    Returns
    -------
    utilization: list of int
        One integer per non-blank line printed by ``nvidia-smi`` for the
        ``utilization.gpu`` query (units suppressed via ``nounits``), in
        output order. An empty list is returned when the tool prints nothing.

    Raises
    ------
    FileNotFoundError
        If ``nvidia-smi`` cannot be found on the PATH.
    subprocess.CalledProcessError
        If ``nvidia-smi`` exits with a non-zero status.
    ValueError
        If a non-blank output line is not an integer.
    """
    result = subprocess.check_output(
        [
            'nvidia-smi', '--query-gpu=utilization.gpu',
            '--format=csv,nounits,noheader'
        ], encoding='utf-8')
    # Skip blank lines so empty output gives [] rather than failing on int('').
    return [int(line) for line in result.splitlines() if line.strip()]
def log_gpu_statistics():
    """Print per-GPU memory/utilization and push both values to NGC telemetry.

    Reads the per-GPU values from ``get_gpu_memory_map`` and
    ``get_gpu_utilization``, pairs them positionally, prints a single summary
    line (preceded by a newline) and pushes two metrics per GPU named
    ``"GPU<n> memory"`` and ``"GPU<n> usage"``. GPU labels are 1-based.

    Returns
    -------
    None
    """
    memory = get_gpu_memory_map()
    utilization = get_gpu_utilization()
    segments = []
    # enumerate() advances the label for every GPU; a hand-maintained counter
    # that is never incremented would label every device "GPU1".
    for ngpu, (mem, use) in enumerate(zip(memory, utilization), start=1):
        segments.append("GPU{}: memory {} ({}%); ".format(ngpu, mem, use))
        # push_ngc_telemetry takes (name, value); log_ngc expects a single
        # dict, so it is not the right call for individual metrics.
        push_ngc_telemetry("GPU{} memory".format(ngpu), mem)
        push_ngc_telemetry("GPU{} usage".format(ngpu), use)
    print("\n" + "".join(segments))
def log_ngc_dict(metric, prefix):
    """Push every entry of ``metric`` to NGC telemetry under a prefixed name.

    For each key, the metric name sent is ``"<prefix>-<key>"`` and the value
    is the one stored under that key in ``metric``.
    """
    keys = metric.keys()
    for key in keys:
        qualified_name = "{prefix}-{key}".format(prefix=prefix, key=key)
        push_ngc_telemetry(qualified_name, metric[key])