3030#include < vector>
3131#include < algorithm>
3232#include < dlfcn.h>
33+ #include < unistd.h> // for sleep
3334
3435#include " cudamatrix/cu-common.h"
3536#include " cudamatrix/cu-device.h"
@@ -62,7 +63,7 @@ namespace kaldi {
6263 * or not at all (when we intentionally want to run on the CPU).
6364 *
6465 */
65- void CuDevice::SelectGpuId (int32 gpu_id) {
66+ void CuDevice::SelectGpuId (int32 gpu_id, bool abort_on_error ) {
6667 // Make sure this function is not called twice!
6768 if (Enabled ()) {
6869 KALDI_ERR << " There is already an active GPU " << active_gpu_id_
@@ -80,69 +81,78 @@ void CuDevice::SelectGpuId(int32 gpu_id) {
8081 cudaGetDeviceCount (&n_gpu);
8182 if (n_gpu == 0 && gpu_id == -2 ) {
8283 // If we do automatic selection and no GPU is found, we run on a CPU
83- KALDI_WARN << " CUDA will NOT be used!!! No CUDA capable GPU detected..." ;
84- active_gpu_id_ = -2 ;
85- return ;
84+ if (abort_on_error) {
85+ KALDI_ERR << " No CUDA capable GPU was detected" ;
86+ } else {
87+ KALDI_WARN << " CUDA will NOT be used!!! No CUDA capable GPU detected..." ;
88+ active_gpu_id_ = -2 ;
89+ return ;
90+ }
8691 }
87- // In other cases it is an error, no GPU is an error
8892 if (n_gpu == 0 ) {
89- KALDI_ERR << " No CUDA capable GPU detected, while explicitly asked for gpu-id '"
90- << gpu_id << " '." ;
93+ if (abort_on_error) {
94+ KALDI_ERR << " No CUDA capable GPU was detected." ;
95+ } else {
96+ KALDI_WARN << " No CUDA capable GPU detected, while explicitly asked for gpu-id '"
97+ << gpu_id << " '.CUDA will NOT be used!!!" ;
98+ active_gpu_id_ = -2 ;
99+ return ;
100+ }
91101 }
92-
93-
94- // Now we know that there is a GPU in the system,
95- // and we don't want to have it disabled.
102+
103+ // Now we know that there is a GPU in the system,
104+ // and we don't want to have it disabled.
96105 //
97- // For the GPU selection there are 3 possibilities,
98- // with priorities according to the order:
106+ // For the GPU selection there are 3 possibilities,
107+ // with priorities according to the order:
99108 //
100- // 1.) We have compute exclusive mode on (GPU is selected by OS)
101- // 2.) User did not specify the GPU-id (default value -2),
102- // we will do automatic selection.
103- // 3.) User specified the GPU to run on, so we select it.
104- if (IsComputeExclusive ()) {
105- // we have the GPU context now...
106- ;
107- } else if (gpu_id == -2 ) {
108- SelectGpuIdAuto ();
109- } else {
110- // try to select the desired GPU
111- int32 ret = cudaSetDevice (gpu_id);
112- // handle the possible errors (no recovery!!!)
113- switch (ret) {
114- case cudaSuccess : {
115- // create the GPU context
116- cudaError_t e;
117- e = cudaThreadSynchronize (); // deprecated, but for legacy not cudaDeviceSynchronize
118- if (e != cudaSuccess) {
119- KALDI_ERR << " Failed to create CUDA context on a GPU." ;
120- }
121- // this was okay, so we are done!
122- KALDI_LOG << " Selected device: " << gpu_id << " (manually)" ;
123- break ;
124- }
125- case cudaErrorInvalidDevice : {
126- int32 n_gpu = 0 ;
127- cudaGetDeviceCount (&n_gpu);
128- KALDI_ERR << " cudaSetDevice(" << gpu_id << " ):"
129- << " '" << gpu_id << " ' is not a VALID CUDA device! "
130- << " (system has " << n_gpu << " GPUs,"
131- << " valid IDs 0.." << n_gpu-1 << " )" ;
132- break ;
109+ // 1.) We have compute exclusive mode on (GPU is selected by OS)
110+ // 2.) User did not specify the GPU-id (default value -2),
111+ // we will do automatic selection.
112+ // 3.) User specified the GPU to run on, so we select it.
113+ bool error;
114+ if (IsComputeExclusive (&error)) {
115+ FinalizeActiveGpu ();
116+ return ;
117+ }
118+ if (error) { // There was some error detecting compute-exclusive status
119+ // (perhaps no GPU available). Sleep a bit and retry.
120+ int32 sec_sleep = 2 ;
121+ KALDI_WARN << " Will try again to get a GPU after " << sec_sleep
122+ << " seconds." ;
123+ sleep (sec_sleep);
124+
125+ if (IsComputeExclusive (&error)) {
126+ FinalizeActiveGpu ();
127+ return ;
128+ } else {
129+ if (abort_on_error) {
130+ KALDI_ERR << " Error acquiring GPU in exclusive mode." ;
131+ } else {
132+ KALDI_WARN << " Error selecting GPU. CUDA will NOT be used!!!." ;
133+ active_gpu_id_ = -2 ;
134+ return ;
133135 }
134- default :
135- KALDI_ERR << " cudaSetDevice(" << gpu_id << " ): "
136- << " returned " << ret << " , "
137- << cudaGetErrorString ((cudaError_t)ret);
138136 }
139137 }
138+
139+ bool ans = (gpu_id == -2 ? SelectGpuIdAuto () : SelectGpuIdManual (gpu_id));
140+ if (ans) {
141+ FinalizeActiveGpu ();
142+ } else {
143+ if (abort_on_error) {
144+ KALDI_ERR << " Error acquiring GPU." ;
145+ } else {
146+ KALDI_WARN << " Error selecting GPU. CUDA will NOT be used!!!." ;
147+ active_gpu_id_ = -2 ;
148+ }
149+ }
150+ }
140151
152+ void CuDevice::FinalizeActiveGpu () {
153+ // The device at this point should have active GPU, so we can query its name
154+ // and memory stats and notify user which GPU is finally used.
141155
142- // Now the we should have active GPU,
143- // so we can query its name and memory stats
144- // and notify user which GPU is finally used.
145- //
146156 // Get the device-id of active device:
147157 {
148158 int32 act_gpu_id;
@@ -161,17 +171,51 @@ void CuDevice::SelectGpuId(int32 gpu_id) {
161171 DeviceGetName (name,128 ,act_gpu_id);
162172
163173 CU_SAFE_CALL (cudaGetDeviceProperties (&properties_, act_gpu_id));
164-
174+
165175 KALDI_LOG << " The active GPU is [" << act_gpu_id << " ]: " << name << " \t "
166176 << GetFreeMemory (&free_memory_at_startup_, NULL ) << " version "
167177 << properties_.major << " ." << properties_.minor ;
168178
169179 if (verbose_) PrintMemoryUsage ();
170180 }
171-
172181 return ;
173182}
174183
184+ bool CuDevice::SelectGpuIdManual (int32 gpu_id) {
185+ // The user selected a particular GPU using --use-gpu-id=X; try to select
186+ // that one.
187+ int32 ret = cudaSetDevice (gpu_id);
188+ // handle the possible errors (no recovery!!!)
189+ switch (ret) {
190+ case cudaSuccess : {
191+ // create the GPU context
192+ cudaError_t e;
193+ e = cudaThreadSynchronize (); // deprecated, but for legacy not cudaDeviceSynchronize
194+ if (e != cudaSuccess) {
195+ KALDI_WARN << " Failed to create CUDA context on a GPU." ;
196+ return false ;
197+ }
198+ // this was okay, so we are done!
199+ KALDI_LOG << " Selected device: " << gpu_id << " (manually)" ;
200+ return true ;
201+ }
202+ case cudaErrorInvalidDevice : {
203+ int32 n_gpu = 0 ;
204+ cudaGetDeviceCount (&n_gpu);
205+ KALDI_WARN << " cudaSetDevice(" << gpu_id << " ):"
206+ << " '" << gpu_id << " ' is not a VALID CUDA device! "
207+ << " (system has " << n_gpu << " GPUs,"
208+ << " valid IDs 0.." << n_gpu-1 << " )" ;
209+ return false ;
210+ }
211+ default :
212+ KALDI_WARN << " cudaSetDevice(" << gpu_id << " ): "
213+ << " returned " << ret << " , "
214+ << cudaGetErrorString ((cudaError_t)ret);
215+ return false ;
216+ }
217+ }
218+
175219
176220bool CuDevice::DoublePrecisionSupported () {
177221 if (!Enabled ()) return true ;
@@ -180,8 +224,9 @@ bool CuDevice::DoublePrecisionSupported() {
180224}
181225
182226
183- bool CuDevice::IsComputeExclusive () {
227+ bool CuDevice::IsComputeExclusive (bool *error ) {
184228 // check that we have a gpu
229+ *error = false ;
185230 int32 n_gpu = 0 ;
186231 cudaGetDeviceCount (&n_gpu);
187232 if (n_gpu == 0 ) {
@@ -197,55 +242,63 @@ bool CuDevice::IsComputeExclusive() {
197242 // and the context is already created.
198243 cudaError_t e;
199244 e = cudaThreadSynchronize (); // deprecated, but for legacy not cudaDeviceSynchronize
200- if (e != cudaSuccess) {
201- KALDI_ERR << " Failed to create CUDA context on a GPU. No more unused GPUs in compute exclusive mode?" ;
245+ if (e != cudaSuccess) {
246+ KALDI_WARN << " Failed to create CUDA context on a GPU. No more unused GPUs "
247+ << " in compute exclusive mode?" ;
248+ *error = true ;
249+ return false ;
202250 }
203251
204252 // get the device-id and its device-properties
205253 int32 gpu_id = -1 ;
206254 e = cudaGetDevice (&gpu_id);
207255 if (e != cudaSuccess) {
208- KALDI_ERR << " Failed to get current device" ;
256+ KALDI_WARN << " Failed to get current device" ;
257+ *error = true ;
258+ return false ;
209259 }
210260 struct cudaDeviceProp gpu_prop;
211261 e = cudaGetDeviceProperties (&gpu_prop, gpu_id);
212262 if (e != cudaSuccess) {
213- KALDI_ERR << " Failed to get device properties" ;
263+ KALDI_WARN << " Failed to get device properties" ;
264+ *error = true ;
265+ return false ;
214266 }
215267 // find out whether compute exclusive mode is used
216268 switch (gpu_prop.computeMode ) {
217269 case cudaComputeModeExclusive :
218270 KALDI_LOG << " CUDA setup operating under Compute Exclusive Mode." ;
219271 return true ;
220272 break ;
221- #if (CUDA_VERSION >= 4000)
273+ #if (CUDA_VERSION >= 4000)
222274 case cudaComputeModeExclusiveProcess :
223275 KALDI_LOG << " CUDA setup operating under Compute Exclusive Process Mode." ;
224276 return true ;
225277 break ;
226- #endif
278+ #endif
227279 default :
228280 // The computation mode is not compute-exclusive,
229281 // in this case we release the GPU context...
230282 e = cudaThreadExit (); // deprecated, but for legacy reason not cudaDeviceReset
231283 if (e != cudaSuccess) {
232- KALDI_ERR << " Failed to release CUDA context on a GPU" ;
284+ KALDI_WARN << " Failed to release CUDA context on a GPU" ;
285+ *error = true ;
286+ return false ;
233287 }
234288 return false ;
235289 }
236290}
237291
238292
239-
240- void CuDevice::SelectGpuIdAuto () {
293+ bool CuDevice::SelectGpuIdAuto () {
241294 // check that we have at least one gpu
242295 int32 n_gpu = 0 ;
243296 cudaGetDeviceCount (&n_gpu);
244297 if (n_gpu == 0 ) {
245- KALDI_ERR << " No CUDA devices found" ;
246- return ;
298+ KALDI_WARN << " No CUDA devices found" ;
299+ return false ;
247300 }
248-
301+
249302 // The GPU is selected according to maximal free memory ratio
250303 std::vector<float > free_mem_ratio (n_gpu+1 , 0.0 );
251304 // get ratios of memory use, if possible
@@ -295,7 +348,8 @@ void CuDevice::SelectGpuIdAuto() {
295348 }
296349 // the free_mem_ratio should be bigger than zero
297350 if (!free_mem_ratio[max_id] > 0.0 ) {
298- KALDI_ERR << " No device could be selected (this should never happen)" ;
351+ KALDI_WARN << " No device could be selected (this should never happen)" ;
352+ return false ;
299353 }
300354
301355 // finally select the GPU
@@ -305,8 +359,10 @@ void CuDevice::SelectGpuIdAuto() {
305359 cudaError_t e;
306360 e = cudaThreadSynchronize (); // deprecated, but for legacy not cudaDeviceSynchronize
307361 if (e != cudaSuccess) {
308- KALDI_ERR << " Failed to create CUDA context on a GPU." ;
362+ KALDI_WARN << " Failed to create CUDA context on a GPU." ;
363+ return false ;
309364 }
365+ return true ;
310366}
311367
312368
0 commit comments