Skip to content

Commit fbe662f

Browse files
author
pradeep
committed
Use static cufft,cublas,cusolver and cusolver on Unix
thrust::stable_sort_by_key has known issue with device linking. The code crashes with cudaInvalidValueError. It works as expected without any changes with or without separable compilation otherwise. https://github.com/thrust/thrust/wiki/Debugging#known-issues https://github.com/thrust/thrust/blob/master/doc/changelog.md#known-issues-2 The above documents mention a known issue with device linking and thrust. Although the documents say it happens in debug mode(with -G flag), I noticed similar crashes in release configuration too in ArrayFire. Due to the above issue, I have separated out the relevant source files (fft,blas,sparse and solver) which require device linking into separate static library. Once separated into a separate static library, sort_by_key and all the other unit tests that use it are running as expected without any crashes.
1 parent c8552fa commit fbe662f

11 files changed

Lines changed: 227 additions & 105 deletions

File tree

CMakeModules/AFcuda_helpers.cmake

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Copyright (c) 2020, ArrayFire
2+
# All rights reserved.
3+
#
4+
# This file is distributed under 3-clause BSD license.
5+
# The complete license agreement can be obtained at:
6+
# http://arrayfire.com/licenses/BSD-3-Clause
7+
8+
9+
# The following macro uses a macro defined by
10+
# FindCUDA module from cmake.
11+
function(af_find_static_cuda_libs libname)
12+
set(search_name
13+
"${CMAKE_STATIC_LIBRARY_PREFIX}${libname}${CMAKE_STATIC_LIBRARY_SUFFIX}")
14+
cuda_find_library_local_first(CUDA_${libname}_LIBRARY
15+
${search_name} "${libname} static library")
16+
mark_as_advanced(CUDA_${libname}_LIBRARY)
17+
endfunction()
18+
19+
## Copied from FindCUDA.cmake
20+
## The target_link_library needs to link with the cuda libraries using
21+
## PRIVATE
22+
function(cuda_add_library cuda_target)
23+
cuda_add_cuda_include_once()
24+
25+
# Separate the sources from the options
26+
cuda_get_sources_and_options(_sources _cmake_options _options ${ARGN})
27+
cuda_build_shared_library(_cuda_shared_flag ${ARGN})
28+
# Create custom commands and targets for each file.
29+
cuda_wrap_srcs( ${cuda_target} OBJ _generated_files ${_sources}
30+
${_cmake_options} ${_cuda_shared_flag}
31+
OPTIONS ${_options} )
32+
33+
# Compute the file name of the intermedate link file used for separable
34+
# compilation.
35+
cuda_compute_separable_compilation_object_file_name(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
36+
37+
# Add the library.
38+
add_library(${cuda_target} ${_cmake_options}
39+
${_generated_files}
40+
${_sources}
41+
${link_file}
42+
)
43+
44+
# Add a link phase for the separable compilation if it has been enabled. If
45+
# it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS
46+
# variable will have been defined.
47+
cuda_link_separable_compilation_objects("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
48+
49+
target_link_libraries(${cuda_target}
50+
PRIVATE ${CUDA_LIBRARIES}
51+
)
52+
53+
# We need to set the linker language based on what the expected generated file
54+
# would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP.
55+
set_target_properties(${cuda_target}
56+
PROPERTIES
57+
LINKER_LANGUAGE ${CUDA_C_OR_CXX}
58+
POSITION_INDEPENDENT_CODE ON
59+
)
60+
endfunction()

src/backend/cuda/CMakeLists.txt

Lines changed: 136 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,18 @@
55
# The complete license agreement can be obtained at:
66
# http://arrayfire.com/licenses/BSD-3-Clause
77

8+
dependency_check(CUDA_FOUND "CUDA not found.")
9+
10+
include(AFcuda_helpers)
11+
include(FileToString)
812
include(InternalUtils)
913
include(select_compute_arch)
1014

11-
dependency_check(CUDA_FOUND "CUDA not found.")
12-
13-
find_cuda_helper_libs(nvrtc)
14-
find_cuda_helper_libs(nvrtc-builtins)
15+
# Remove cublas_device library which is no longer included with the cuda
16+
# toolkit. Fixes issues with older CMake versions
17+
if(DEFINED CUDA_cublas_device_LIBRARY AND NOT CUDA_cublas_device_LIBRARY)
18+
list(REMOVE_ITEM CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_device_LIBRARY})
19+
endif()
1520

1621
if(NOT OPENGL_FOUND)
1722
# create a dummy gl.h header to satisfy cuda_gl_interop.h requirement
@@ -24,9 +29,50 @@ if(NOT OPENGL_FOUND)
2429
file(WRITE "${dummy_gl_root}/gl.h" "// Dummy file to satisy cuda_gl_interop")
2530
endif()
2631

27-
get_filename_component(CUDA_LIBRARIES_PATH ${CUDA_cudart_static_LIBRARY} DIRECTORY CACHE)
32+
# Find if CUDA Toolkit is at least 10.0 to use static
33+
# lapack library. Otherwise, we have to use regular shared library
34+
if(UNIX AND CUDA_VERSION_MAJOR VERSION_GREATER 10 OR CUDA_VERSION_MAJOR VERSION_EQUAL 10)
35+
set(use_static_cuda_lapack ON)
36+
else()
37+
set(use_static_cuda_lapack OFF)
38+
endif()
2839

29-
include(FileToString)
40+
find_cuda_helper_libs(nvrtc)
41+
find_cuda_helper_libs(nvrtc-builtins)
42+
if(UNIX)
43+
af_find_static_cuda_libs(culibos)
44+
af_find_static_cuda_libs(cublas_static)
45+
af_find_static_cuda_libs(cublasLt_static)
46+
af_find_static_cuda_libs(cufft_static)
47+
af_find_static_cuda_libs(cusparse_static)
48+
49+
# FIXME When NVCC resolves this particular issue.
50+
# NVCC doesn't like -l<full_path_static_lib>, hence we cannot
51+
# use ${CMAKE_*_LIBRARY} variables in the following flags.
52+
set(af_cuda_static_flags "-rdc=true;-dlink")
53+
set(af_cuda_static_flags "${af_cuda_static_flags};-lculibos")
54+
set(af_cuda_static_flags "${af_cuda_static_flags};-lcublas_static")
55+
set(af_cuda_static_flags "${af_cuda_static_flags};-lcublasLt_static")
56+
set(af_cuda_static_flags "${af_cuda_static_flags};-lcufft_static")
57+
set(af_cuda_static_flags "${af_cuda_static_flags};-lcusparse_static")
58+
59+
if(${use_static_cuda_lapack})
60+
af_find_static_cuda_libs(cusolver_static)
61+
set(cusolver_static_lib "${CUDA_cusolver_static_LIBRARY}")
62+
63+
# NVIDIA LAPACK library liblapack_static.a is a subset of LAPACK and only
64+
# contains GPU accelerated stedc and bdsqr. The user has to link
65+
# libcusolver_static.a with liblapack_static.a in order to build
66+
# successfully.
67+
af_find_static_cuda_libs(lapack_static)
68+
69+
set(af_cuda_static_flags "${af_cuda_static_flags};-lcusolver_static")
70+
else()
71+
set(cusolver_lib "${CUDA_cusolver_LIBRARY}")
72+
endif()
73+
endif()
74+
75+
get_filename_component(CUDA_LIBRARIES_PATH ${CUDA_cudart_static_LIBRARY} DIRECTORY CACHE)
3076

3177
if(NOT CUDA_architecture_build_targets)
3278
cuda_detect_installed_gpus(detected_gpus)
@@ -171,54 +217,9 @@ file_to_string(
171217
NULLTERM
172218
)
173219

174-
## Copied from FindCUDA.cmake
175-
## The target_link_library needs to link with the cuda libraries using
176-
## PRIVATE
177-
function(cuda_add_library cuda_target)
178-
cuda_add_cuda_include_once()
179-
180-
# Separate the sources from the options
181-
cuda_get_sources_and_options(_sources _cmake_options _options ${ARGN})
182-
cuda_build_shared_library(_cuda_shared_flag ${ARGN})
183-
# Create custom commands and targets for each file.
184-
cuda_wrap_srcs( ${cuda_target} OBJ _generated_files ${_sources}
185-
${_cmake_options} ${_cuda_shared_flag}
186-
OPTIONS ${_options} )
187-
188-
# Compute the file name of the intermedate link file used for separable
189-
# compilation.
190-
cuda_compute_separable_compilation_object_file_name(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
191-
192-
# Add the library.
193-
add_library(${cuda_target} ${_cmake_options}
194-
${_generated_files}
195-
${_sources}
196-
${link_file}
197-
)
198-
199-
# Add a link phase for the separable compilation if it has been enabled. If
200-
# it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS
201-
# variable will have been defined.
202-
cuda_link_separable_compilation_objects("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
203-
204-
target_link_libraries(${cuda_target}
205-
PRIVATE ${CUDA_LIBRARIES}
206-
)
207-
208-
# We need to set the linker language based on what the expected generated file
209-
# would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP.
210-
set_target_properties(${cuda_target}
211-
PROPERTIES
212-
LINKER_LANGUAGE ${CUDA_C_OR_CXX}
213-
POSITION_INDEPENDENT_CODE ON
214-
)
215-
216-
endfunction()
217-
218220
arrayfire_get_cuda_cxx_flags(cuda_cxx_flags)
219221
arrayfire_get_platform_definitions(platform_flags)
220222

221-
222223
get_property(boost_includes TARGET Boost::boost PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
223224
get_property(boost_definitions TARGET Boost::boost PROPERTY INTERFACE_COMPILE_DEFINITIONS)
224225

@@ -245,9 +246,78 @@ list(APPEND cuda_cxx_flags ${cxx_definitions})
245246
include(kernel/scan_by_key/CMakeLists.txt)
246247
include(kernel/thrust_sort_by_key/CMakeLists.txt)
247248

249+
# CUDA static libraries require device linking to successfully link
250+
# against afcuda target. Device linking requires CUDA_SEPARABLE_COMPILATION
251+
# to be ON. Therefore, we turn on separable compilation for a subset of
252+
# source files while compiling af_cuda_static_cuda_library target. Once
253+
# this subset is compiled, separable compilation is reset to it's original
254+
# value.
255+
if(UNIX)
256+
# Static linking cuda libs require device linking, which in turn
257+
# requires separable compilation.
258+
set(pior_val_CUDA_SEPARABLE_COMPILATION OFF)
259+
if(DEFINED CUDA_SEPARABLE_COMPILATION)
260+
set(pior_val_CUDA_SEPARABLE_COMPILATION ${CUDA_SEPARABLE_COMPILATION})
261+
endif()
262+
set(CUDA_SEPARABLE_COMPILATION ON)
263+
endif()
264+
265+
cuda_add_library(af_cuda_static_cuda_library STATIC
266+
blas.cu
267+
blas.hpp
268+
cufft.cu
269+
cufft.hpp
270+
fft.cu
271+
sparse.cu
272+
sparse.hpp
273+
sparse_arith.cu
274+
sparse_arith.hpp
275+
sparse_blas.cu
276+
sparse_blas.hpp
277+
solve.cu
278+
solve.hpp
279+
280+
OPTIONS
281+
${platform_flags} ${cuda_cxx_flags} ${af_cuda_static_flags}
282+
-Xcudafe \"--diag_suppress=1427\" -DAFDLL
283+
)
284+
285+
set_target_properties(af_cuda_static_cuda_library
286+
PROPERTIES
287+
LINKER_LANGUAGE CXX
288+
FOLDER "Generated Targets"
289+
)
290+
291+
if(UNIX)
292+
target_link_libraries(af_cuda_static_cuda_library
293+
PRIVATE
294+
Boost::boost
295+
${CMAKE_DL_LIBS}
296+
${cusolver_lib}
297+
-Wl,--start-group
298+
${CUDA_culibos_LIBRARY} #also a static libary
299+
${CUDA_cublas_static_LIBRARY}
300+
${CUDA_cublasLt_static_LIBRARY}
301+
${CUDA_cufft_static_LIBRARY}
302+
${CUDA_lapack_static_LIBRARY}
303+
${CUDA_cusparse_static_LIBRARY}
304+
${cusolver_static_lib}
305+
-Wl,--end-group
306+
)
307+
set(CUDA_SEPARABLE_COMPILATION ${pior_val_CUDA_SEPARABLE_COMPILATION})
308+
else()
309+
target_link_libraries(af_cuda_static_cuda_library
310+
PRIVATE
311+
Boost::boost
312+
${CUDA_CUBLAS_LIBRARIES}
313+
${CUDA_CUFFT_LIBRARIES}
314+
${CUDA_cusolver_LIBRARY}
315+
${CUDA_cusparse_LIBRARY}
316+
)
317+
endif()
318+
248319
cuda_add_library(afcuda
249320
${thrust_sort_sources}
250-
sort.hpp
251321
252322
all.cu
253323
anisotropic_diffusion.cpp
@@ -390,7 +460,6 @@ cuda_add_library(afcuda
390460
backend.hpp
391461
bilateral.hpp
392462
binary.hpp
393-
blas.cpp
394463
blas.hpp
395464
canny.hpp
396465
cast.hpp
@@ -407,7 +476,6 @@ cuda_add_library(afcuda
407476
cudnn.hpp
408477
cudnnModule.cpp
409478
cudnnModule.hpp
410-
cufft.cpp
411479
cufft.hpp
412480
cusolverDn.cpp
413481
cusolverDn.hpp
@@ -427,7 +495,6 @@ cuda_add_library(afcuda
427495
fast.hpp
428496
fast_pyramid.cpp
429497
fast_pyramid.hpp
430-
fft.cpp
431498
fft.hpp
432499
fftconvolve.cpp
433500
fftconvolve.hpp
@@ -509,15 +576,12 @@ cuda_add_library(afcuda
509576
shift.hpp
510577
sift.hpp
511578
sobel.hpp
512-
solve.cpp
513579
solve.hpp
580+
sort.hpp
514581
sort_by_key.hpp
515582
sort_index.hpp
516-
sparse.cpp
517583
sparse.hpp
518-
sparse_arith.cpp
519584
sparse_arith.hpp
520-
sparse_blas.cpp
521585
sparse_blas.hpp
522586
surface.cpp
523587
surface.hpp
@@ -570,6 +634,8 @@ target_compile_options(afcuda PRIVATE ${cxx_definitions})
570634
add_library(ArrayFire::afcuda ALIAS afcuda)
571635
572636
add_dependencies(afcuda ${jit_kernel_targets} ${nvrtc_kernel_targets})
637+
add_dependencies(af_cuda_static_cuda_library ${nvrtc_kernel_targets})
638+
add_dependencies(afcuda af_cuda_static_cuda_library)
573639
574640
target_include_directories (afcuda
575641
PUBLIC
@@ -586,29 +652,14 @@ target_include_directories (afcuda
586652
${cuDNN_INCLUDE_DIRS}
587653
)
588654
589-
# Remove cublas_device library which is no longer included with the cuda
590-
# toolkit. Fixes issues with older CMake versions
591-
if(DEFINED CUDA_cublas_device_LIBRARY AND NOT CUDA_cublas_device_LIBRARY)
592-
list(REMOVE_ITEM CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_device_LIBRARY})
593-
endif()
594-
595-
# Remove cublas_device library which is no longer included with the cuda
596-
# toolkit. Fixes issues with older CMake versions
597-
if(DEFINED CUDA_cublas_device_LIBRARY AND NOT CUDA_cublas_device_LIBRARY)
598-
list(REMOVE_ITEM CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_device_LIBRARY})
599-
endif()
600-
601655
target_link_libraries(afcuda
602656
PRIVATE
603657
c_api_interface
604658
cpp_api_interface
605659
afcommon_interface
606-
${CUDA_nvrtc_LIBRARY}
607-
${CUDA_CUBLAS_LIBRARIES}
608-
${CUDA_CUFFT_LIBRARIES}
609-
${CUDA_cusolver_LIBRARY}
610-
${CUDA_cusparse_LIBRARY}
611660
${CMAKE_DL_LIBS}
661+
${CUDA_nvrtc_LIBRARY}
662+
af_cuda_static_cuda_library
612663
)
613664
614665
# If the driver is not found the cuda driver api need to be linked against the
@@ -703,13 +754,17 @@ function(afcu_collect_libs libname)
703754
endfunction()
704755
705756
if(AF_INSTALL_STANDALONE)
706-
afcu_collect_libs(cufft)
707757
afcu_collect_libs(cudnn)
708-
afcu_collect_libs(cublas)
709-
afcu_collect_libs(cublasLt)
710-
afcu_collect_libs(cusolver)
711-
afcu_collect_libs(cusparse)
712758
afcu_collect_libs(nvrtc FULL_VERSION)
759+
if(WIN32)
760+
afcu_collect_libs(cufft)
761+
afcu_collect_libs(cublas)
762+
afcu_collect_libs(cublasLt)
763+
afcu_collect_libs(cusolver)
764+
afcu_collect_libs(cusparse)
765+
elseif(NOT ${use_static_cuda_lapack})
766+
afcu_collect_libs(cusolver)
767+
endif()
713768
714769
if(APPLE)
715770
afcu_collect_libs(cudart)

0 commit comments

Comments
 (0)