Skip to content

Commit 2545c15

Browse files
committed
sandbox/dan2: various modifications and bug fixes to support CUDA training in Dan's recipe. Modify the CUDA device selection code (cleanup, and make the error status controllable).
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/dan2@3105 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
1 parent 38ea570 commit 2545c15

15 files changed

+336
-201
lines changed

src/cudamatrix/cu-device.cc

Lines changed: 127 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include <vector>
3131
#include <algorithm>
3232
#include <dlfcn.h>
33+
#include <unistd.h> // for sleep
3334

3435
#include "cudamatrix/cu-common.h"
3536
#include "cudamatrix/cu-device.h"
@@ -62,7 +63,7 @@ namespace kaldi {
6263
* or not at all (when we intentionally want to run on the CPU).
6364
*
6465
*/
65-
void CuDevice::SelectGpuId(int32 gpu_id) {
66+
void CuDevice::SelectGpuId(int32 gpu_id, bool abort_on_error) {
6667
// Make sure this function is not called twice!
6768
if (Enabled()) {
6869
KALDI_ERR << "There is already an active GPU " << active_gpu_id_
@@ -80,69 +81,78 @@ void CuDevice::SelectGpuId(int32 gpu_id) {
8081
cudaGetDeviceCount(&n_gpu);
8182
if(n_gpu == 0 && gpu_id == -2) {
8283
// If we do automatic selection and no GPU is found, we run on a CPU
83-
KALDI_WARN << "CUDA will NOT be used!!! No CUDA capable GPU detected...";
84-
active_gpu_id_ = -2;
85-
return;
84+
if (abort_on_error) {
85+
KALDI_ERR << "No CUDA capable GPU was detected";
86+
} else {
87+
KALDI_WARN << "CUDA will NOT be used!!! No CUDA capable GPU detected...";
88+
active_gpu_id_ = -2;
89+
return;
90+
}
8691
}
87-
// In other cases it is an error, no GPU is an error
8892
if(n_gpu == 0) {
89-
KALDI_ERR << "No CUDA capable GPU detected, while explicitly asked for gpu-id '"
90-
<< gpu_id << "'.";
93+
if (abort_on_error) {
94+
KALDI_ERR << "No CUDA capable GPU was detected.";
95+
} else {
96+
KALDI_WARN << "No CUDA capable GPU detected, while explicitly asked for gpu-id '"
97+
<< gpu_id << "'.CUDA will NOT be used!!!";
98+
active_gpu_id_ = -2;
99+
return;
100+
}
91101
}
92-
93-
94-
//Now we know that there is a GPU in the system,
95-
//and we don't want to have it disabled.
102+
103+
// Now we know that there is a GPU in the system,
104+
// and we don't want to have it disabled.
96105
//
97-
//For the GPU selection there are 3 possibilities,
98-
//with priorities according to the order:
106+
// For the GPU selection there are 3 possibilities,
107+
// with priorities according to the order:
99108
//
100-
//1.) We have compute exclusive mode on (GPU is selected by OS)
101-
//2.) User did not specify the GPU-id (default value -2),
102-
// we will do automatic selection.
103-
//3.) User specified the GPU to run on, so we select it.
104-
if(IsComputeExclusive()) {
105-
//we have the GPU context now...
106-
;
107-
} else if(gpu_id == -2) {
108-
SelectGpuIdAuto();
109-
} else {
110-
//try to select the desired GPU
111-
int32 ret = cudaSetDevice(gpu_id);
112-
//handle the possible errors (no recovery!!!)
113-
switch(ret) {
114-
case cudaSuccess : {
115-
//create the GPU context
116-
cudaError_t e;
117-
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
118-
if(e != cudaSuccess) {
119-
KALDI_ERR << "Failed to create CUDA context on a GPU.";
120-
}
121-
//this was okay, so we are done!
122-
KALDI_LOG << "Selected device: " << gpu_id << " (manually)";
123-
break;
124-
}
125-
case cudaErrorInvalidDevice : {
126-
int32 n_gpu = 0;
127-
cudaGetDeviceCount(&n_gpu);
128-
KALDI_ERR << "cudaSetDevice(" << gpu_id << "):"
129-
<< " '" << gpu_id << "' is not a VALID CUDA device! "
130-
<< " (system has " << n_gpu << " GPUs,"
131-
<< " valid IDs 0.." << n_gpu-1 << ")";
132-
break;
109+
// 1.) We have compute exclusive mode on (GPU is selected by OS)
110+
// 2.) User did not specify the GPU-id (default value -2),
111+
// we will do automatic selection.
112+
// 3.) User specified the GPU to run on, so we select it.
113+
bool error;
114+
if (IsComputeExclusive(&error)) {
115+
FinalizeActiveGpu();
116+
return;
117+
}
118+
if (error) { // There was some error detecting compute-exclusive status
119+
// (perhaps no GPU available). Sleep a bit and retry.
120+
int32 sec_sleep = 2;
121+
KALDI_WARN << "Will try again to get a GPU after " << sec_sleep
122+
<< " seconds.";
123+
sleep(sec_sleep);
124+
125+
if (IsComputeExclusive(&error)) {
126+
FinalizeActiveGpu();
127+
return;
128+
} else {
129+
if (abort_on_error) {
130+
KALDI_ERR << "Error acquiring GPU in exclusive mode.";
131+
} else {
132+
KALDI_WARN << "Error selecting GPU. CUDA will NOT be used!!!.";
133+
active_gpu_id_ = -2;
134+
return;
133135
}
134-
default :
135-
KALDI_ERR << "cudaSetDevice(" << gpu_id << "): "
136-
<< "returned " << ret << ", "
137-
<< cudaGetErrorString((cudaError_t)ret);
138136
}
139137
}
138+
139+
bool ans = (gpu_id == -2 ? SelectGpuIdAuto() : SelectGpuIdManual(gpu_id));
140+
if (ans) {
141+
FinalizeActiveGpu();
142+
} else {
143+
if (abort_on_error) {
144+
KALDI_ERR << "Error acquiring GPU.";
145+
} else {
146+
KALDI_WARN << "Error selecting GPU. CUDA will NOT be used!!!.";
147+
active_gpu_id_ = -2;
148+
}
149+
}
150+
}
140151

152+
void CuDevice::FinalizeActiveGpu() {
153+
// The device at this point should have active GPU, so we can query its name
154+
// and memory stats and notify user which GPU is finally used.
141155

142-
// Now the we should have active GPU,
143-
// so we can query its name and memory stats
144-
// and notify user which GPU is finally used.
145-
//
146156
// Get the device-id of active device:
147157
{
148158
int32 act_gpu_id;
@@ -161,17 +171,51 @@ void CuDevice::SelectGpuId(int32 gpu_id) {
161171
DeviceGetName(name,128,act_gpu_id);
162172

163173
CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, act_gpu_id));
164-
174+
165175
KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: " << name << "\t"
166176
<< GetFreeMemory(&free_memory_at_startup_, NULL) << " version "
167177
<< properties_.major << "." << properties_.minor;
168178

169179
if (verbose_) PrintMemoryUsage();
170180
}
171-
172181
return;
173182
}
174183

184+
bool CuDevice::SelectGpuIdManual(int32 gpu_id) {
  // Try to bind this process to the GPU the user asked for explicitly
  // (e.g. via --use-gpu-id=X) and to create a CUDA context on it.
  //
  // Returns true if the device was selected and the context was created.
  // On any failure a warning is logged and false is returned; this function
  // never aborts, the caller decides how to react to the failure.
  const int32 status = cudaSetDevice(gpu_id);

  // The requested id does not name a device: report the valid id range.
  if (status == cudaErrorInvalidDevice) {
    int32 num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    KALDI_WARN << "cudaSetDevice(" << gpu_id << "):"
               << " '" << gpu_id << "' is not a VALID CUDA device! "
               << " (system has " << num_devices << " GPUs,"
               << " valid IDs 0.." << num_devices-1 << ")";
    return false;
  }

  // Any other failure of cudaSetDevice: report the raw code and its message.
  if (status != cudaSuccess) {
    KALDI_WARN << "cudaSetDevice(" << gpu_id << "): "
               << "returned " << status << ", "
               << cudaGetErrorString((cudaError_t)status);
    return false;
  }

  // The device was accepted; synchronize to create the GPU context now.
  // cudaThreadSynchronize is deprecated, but is kept (rather than
  // cudaDeviceSynchronize) for legacy reasons.
  const cudaError_t sync_status = cudaThreadSynchronize();
  if (sync_status != cudaSuccess) {
    KALDI_WARN << "Failed to create CUDA context on a GPU.";
    return false;
  }

  // Everything went fine, so we are done.
  KALDI_LOG << "Selected device: " << gpu_id << " (manually)";
  return true;
}
218+
175219

176220
bool CuDevice::DoublePrecisionSupported() {
177221
if (!Enabled()) return true;
@@ -180,8 +224,9 @@ bool CuDevice::DoublePrecisionSupported() {
180224
}
181225

182226

183-
bool CuDevice::IsComputeExclusive() {
227+
bool CuDevice::IsComputeExclusive(bool *error) {
184228
// check that we have a gpu
229+
*error = false;
185230
int32 n_gpu = 0;
186231
cudaGetDeviceCount(&n_gpu);
187232
if(n_gpu == 0) {
@@ -197,55 +242,63 @@ bool CuDevice::IsComputeExclusive() {
197242
// and the context is already created.
198243
cudaError_t e;
199244
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
200-
if(e != cudaSuccess) {
201-
KALDI_ERR << "Failed to create CUDA context on a GPU. No more unused GPUs in compute exclusive mode?";
245+
if (e != cudaSuccess) {
246+
KALDI_WARN << "Failed to create CUDA context on a GPU. No more unused GPUs "
247+
<< "in compute exclusive mode?";
248+
*error = true;
249+
return false;
202250
}
203251

204252
// get the device-id and its device-properties
205253
int32 gpu_id = -1;
206254
e = cudaGetDevice(&gpu_id);
207255
if(e != cudaSuccess) {
208-
KALDI_ERR << "Failed to get current device";
256+
KALDI_WARN << "Failed to get current device";
257+
*error = true;
258+
return false;
209259
}
210260
struct cudaDeviceProp gpu_prop;
211261
e = cudaGetDeviceProperties(&gpu_prop, gpu_id);
212262
if(e != cudaSuccess) {
213-
KALDI_ERR << "Failed to get device properties";
263+
KALDI_WARN << "Failed to get device properties";
264+
*error = true;
265+
return false;
214266
}
215267
// find out whether compute exclusive mode is used
216268
switch (gpu_prop.computeMode) {
217269
case cudaComputeModeExclusive :
218270
KALDI_LOG << "CUDA setup operating under Compute Exclusive Mode.";
219271
return true;
220272
break;
221-
#if (CUDA_VERSION >= 4000)
273+
#if (CUDA_VERSION >= 4000)
222274
case cudaComputeModeExclusiveProcess :
223275
KALDI_LOG << "CUDA setup operating under Compute Exclusive Process Mode.";
224276
return true;
225277
break;
226-
#endif
278+
#endif
227279
default :
228280
// The computation mode is not compute-exclusive,
229281
// in this case we release the GPU context...
230282
e = cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset
231283
if(e != cudaSuccess) {
232-
KALDI_ERR << "Failed to release CUDA context on a GPU";
284+
KALDI_WARN << "Failed to release CUDA context on a GPU";
285+
*error = true;
286+
return false;
233287
}
234288
return false;
235289
}
236290
}
237291

238292

239-
240-
void CuDevice::SelectGpuIdAuto() {
293+
bool CuDevice::SelectGpuIdAuto() {
241294
// check that we have at least one gpu
242295
int32 n_gpu = 0;
243296
cudaGetDeviceCount(&n_gpu);
244297
if(n_gpu == 0) {
245-
KALDI_ERR << "No CUDA devices found";
246-
return;
298+
KALDI_WARN << "No CUDA devices found";
299+
return false;
247300
}
248-
301+
249302
// The GPU is selected according to maximal free memory ratio
250303
std::vector<float> free_mem_ratio(n_gpu+1, 0.0);
251304
//get ratios of memory use, if possible
@@ -295,7 +348,8 @@ void CuDevice::SelectGpuIdAuto() {
295348
}
296349
//the free_mem_ratio should be bigger than zero
297350
if(!free_mem_ratio[max_id] > 0.0) {
298-
KALDI_ERR << "No device could be selected (this should never happen)";
351+
KALDI_WARN << "No device could be selected (this should never happen)";
352+
return false;
299353
}
300354

301355
//finally select the GPU
@@ -305,8 +359,10 @@ void CuDevice::SelectGpuIdAuto() {
305359
cudaError_t e;
306360
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
307361
if(e != cudaSuccess) {
308-
KALDI_ERR << "Failed to create CUDA context on a GPU.";
362+
KALDI_WARN << "Failed to create CUDA context on a GPU.";
363+
return false;
309364
}
365+
return true;
310366
}
311367

312368

src/cudamatrix/cu-device.h

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,9 @@ class CuDevice {
6464
}
6565

6666
/// Manually select GPU by id (more comments in cu-device.cc)
67-
void SelectGpuId(int32 gpu_id);
67+
void SelectGpuId(int32 gpu_id,
68+
bool abort_on_failure = true);
69+
6870
/// Get the active GPU id
6971
int32 ActiveGpuId() {
7072
return active_gpu_id_;
@@ -98,10 +100,20 @@ class CuDevice {
98100

99101
static CuDevice global_device_;
100102

101-
/// Check if the GPU run in compute exclusive mode
102-
bool IsComputeExclusive();
103-
/// Automatically select GPU
104-
void SelectGpuIdAuto();
103+
/// Check if the GPU runs in compute exclusive mode.  Returns true if it is
104+
/// running in compute exclusive mode and we have a GPU. Returns false
105+
/// otherwise. Sets error to true if there was some error, such as that we
106+
/// were running in compute exclusive mode but no GPUs were available; otherwise
107+
/// sets it to false.
108+
bool IsComputeExclusive(bool *error);
109+
110+
/// Automatically select GPU and get CUDA context. Returns true on success.
111+
bool SelectGpuIdAuto();
112+
113+
/// Try to get CUDA context on manually selected GPU. Return true on success.
114+
bool SelectGpuIdManual(int32 gpu_id);
115+
116+
void FinalizeActiveGpu();
105117

106118
/// Should only be called if Enabled() == true.
107119
int32 MajorDeviceVersion();

0 commit comments

Comments
 (0)